[RFC,v7,10/12] megaraid_sas: switch fusion adapters to MQ

Message ID 1591810159-240929-11-git-send-email-john.garry@huawei.com
State New
Headers show
Series
  • blk-mq/scsi: Provide hostwide shared tags for SCSI HBAs
Related show

Commit Message

John Garry June 10, 2020, 5:29 p.m.
From: Hannes Reinecke <hare@suse.com>


Fusion adapters can steer completions to individual queues, and
we now have support for shared host-wide tags.
So we can enable multiqueue support for fusion adapters and
drop the hand-crafted interrupt affinity settings.

Signed-off-by: Hannes Reinecke <hare@suse.com>

Signed-off-by: John Garry <john.garry@huawei.com>

---
 drivers/scsi/megaraid/megaraid_sas.h        |  1 -
 drivers/scsi/megaraid/megaraid_sas_base.c   | 59 +++++++--------------
 drivers/scsi/megaraid/megaraid_sas_fusion.c | 24 +++++----
 3 files changed, 32 insertions(+), 52 deletions(-)

-- 
2.26.2

Comments

Kashyap Desai July 2, 2020, 10:23 a.m. | #1
>

> From: Hannes Reinecke <hare@suse.com>

>

> Fusion adapters can steer completions to individual queues, and we now

have
> support for shared host-wide tags.

> So we can enable multiqueue support for fusion adapters and drop the

hand-
> crafted interrupt affinity settings.


Shared host tag is primarily introduced for completeness of CPU hotplug as
discussed earlier -
https://lwn.net/Articles/819419/

How shall I test CPU hotplug on megaraid_sas driver ? My understanding is
- This RFC + patch set from above link is required for it. I could not see
above series is committed.
Am I missing anything. ?

We do not want to completely move to shared host tag. It will be shared
host tag support by default, but user should have choice to go back to
legacy path.
We will completely move to shared host tag path once it is stable and no
more field issue observed over a period of time. -

Updated <megaraid_sas> patch will looks like this -

diff --git a/megaraid_sas_base.c b/megaraid_sas_base.c
index 0066833..3b503cb 100644
--- a/megaraid_sas_base.c
+++ b/megaraid_sas_base.c
@@ -37,6 +37,7 @@
 #include <linux/poll.h>
 #include <linux/vmalloc.h>
 #include <linux/irq_poll.h>
+#include <linux/blk-mq-pci.h>

 #include <scsi/scsi.h>
 #include <scsi/scsi_cmnd.h>
@@ -113,6 +114,10 @@ unsigned int enable_sdev_max_qd;
 module_param(enable_sdev_max_qd, int, 0444);
 MODULE_PARM_DESC(enable_sdev_max_qd, "Enable sdev max qd as can_queue.
Default: 0");

+int host_tagset_disabled = 0;
+module_param(host_tagset_disabled, int, 0444);
+MODULE_PARM_DESC(host_tagset_disabled, "Shared host tagset enable/disable
Default: enable(1)");
+
 MODULE_LICENSE("GPL");
 MODULE_VERSION(MEGASAS_VERSION);
 MODULE_AUTHOR("megaraidlinux.pdl@broadcom.com");
@@ -3115,6 +3120,18 @@ megasas_bios_param(struct scsi_device *sdev, struct
block_device *bdev,
        return 0;
 }

+static int megasas_map_queues(struct Scsi_Host *shost)
+{
+       struct megasas_instance *instance;
+       instance = (struct megasas_instance *)shost->hostdata;
+
+       if (instance->host->nr_hw_queues == 1)
+               return 0;
+
+       return
blk_mq_pci_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT],
+                       instance->pdev,
instance->low_latency_index_start);
+}
+
 static void megasas_aen_polling(struct work_struct *work);

 /**
@@ -3423,8 +3440,10 @@ static struct scsi_host_template megasas_template =
{
        .eh_timed_out = megasas_reset_timer,
        .shost_attrs = megaraid_host_attrs,
        .bios_param = megasas_bios_param,
+       .map_queues = megasas_map_queues,
        .change_queue_depth = scsi_change_queue_depth,
        .max_segment_size = 0xffffffff,
+       .host_tagset = 1,
 };

 /**
@@ -6793,7 +6812,21 @@ static int megasas_io_attach(struct
megasas_instance *instance)
        host->max_id = MEGASAS_MAX_DEV_PER_CHANNEL;
        host->max_lun = MEGASAS_MAX_LUN;
        host->max_cmd_len = 16;
+       host->nr_hw_queues = 1;

+       /* Use shared host tagset only for fusion adaptors
+        * if there are more than one managed interrupts.
+        */
+       if ((instance->adapter_type != MFI_SERIES) &&
+               (instance->msix_vectors > 0) &&
+               !host_tagset_disabled &&
+               instance->smp_affinity_enable)
+               host->nr_hw_queues = instance->msix_vectors -
+                       instance->low_latency_index_start;
+
+       dev_info(&instance->pdev->dev, "Max firmware commands: %d"
+               " for nr_hw_queues = %d\n", instance->max_fw_cmds,
+               host->nr_hw_queues);
        /*
         * Notify the mid-layer about the new controller
         */
@@ -8842,6 +8875,7 @@ static int __init megasas_init(void)
                msix_vectors = 1;
                rdpq_enable = 0;
                dual_qdepth_disable = 1;
+               host_tagset_disabled = 1;
        }

        /*
diff --git a/megaraid_sas_fusion.c b/megaraid_sas_fusion.c
index 319f241..14d4f35 100755
--- a/megaraid_sas_fusion.c
+++ b/megaraid_sas_fusion.c
@@ -373,24 +373,28 @@ megasas_get_msix_index(struct megasas_instance
*instance,
 {
        int sdev_busy;

-       /* nr_hw_queue = 1 for MegaRAID */
-       struct blk_mq_hw_ctx *hctx =
-               scmd->device->request_queue->queue_hw_ctx[0];
-
-       sdev_busy = atomic_read(&hctx->nr_active);
+       /* TBD - if sml remove device_busy in future, driver
+        * should track counter in internal structure.
+        */
+       sdev_busy = atomic_read(&scmd->device->device_busy);

        if (instance->perf_mode == MR_BALANCED_PERF_MODE &&
-           sdev_busy > (data_arms * MR_DEVICE_HIGH_IOPS_DEPTH))
+           sdev_busy > (data_arms * MR_DEVICE_HIGH_IOPS_DEPTH)) {
                cmd->request_desc->SCSIIO.MSIxIndex =
                        mega_mod64((atomic64_add_return(1,
&instance->high_iops_outstanding) /
                                        MR_HIGH_IOPS_BATCH_COUNT),
instance->low_latency_index_start);
-       else if (instance->msix_load_balance)
+       } else if (instance->msix_load_balance) {
                cmd->request_desc->SCSIIO.MSIxIndex =
                        (mega_mod64(atomic64_add_return(1,
&instance->total_io_count),
                                instance->msix_vectors));
-       else
+       } else if (instance->host->nr_hw_queues > 1) {
+               u32 tag = blk_mq_unique_tag(scmd->request);
+               cmd->request_desc->SCSIIO.MSIxIndex =
blk_mq_unique_tag_to_hwq(tag) +
+                       instance->low_latency_index_start;
+       } else {
                cmd->request_desc->SCSIIO.MSIxIndex =
                        instance->reply_map[raw_smp_processor_id()];
+       }
 }

 /**
@@ -970,9 +974,6 @@ megasas_alloc_cmds_fusion(struct megasas_instance
*instance)
        if (megasas_alloc_cmdlist_fusion(instance))
                goto fail_exit;

-       dev_info(&instance->pdev->dev, "Configured max firmware commands:
%d\n",
-                instance->max_fw_cmds);
-
        /* The first 256 bytes (SMID 0) is not used. Don't add to the cmd
list */
        io_req_base = fusion->io_request_frames +
MEGA_MPI2_RAID_DEFAULT_IO_FRAME_SIZE;
        io_req_base_phys = fusion->io_request_frames_phys +
MEGA_MPI2_RAID_DEFAULT_IO_FRAME_SIZE;

Kashyap

>

> Signed-off-by: Hannes Reinecke <hare@suse.com>

> Signed-off-by: John Garry <john.garry@huawei.com>

> ---

>  drivers/scsi/megaraid/megaraid_sas.h        |  1 -

>  drivers/scsi/megaraid/megaraid_sas_base.c   | 59 +++++++--------------

>  drivers/scsi/megaraid/megaraid_sas_fusion.c | 24 +++++----

>  3 files changed, 32 insertions(+), 52 deletions(-)

>

> diff --git a/drivers/scsi/megaraid/megaraid_sas.h

> b/drivers/scsi/megaraid/megaraid_sas.h

> index af2c7a2a9565..b27a34a5f5de 100644

> --- a/drivers/scsi/megaraid/megaraid_sas.h

> +++ b/drivers/scsi/megaraid/megaraid_sas.h

> @@ -2261,7 +2261,6 @@ enum MR_PERF_MODE {

>

>  struct megasas_instance {

>

> -	unsigned int *reply_map;

>  	__le32 *producer;

>  	dma_addr_t producer_h;

>  	__le32 *consumer;

> diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c

> b/drivers/scsi/megaraid/megaraid_sas_base.c

> index 00668335c2af..e6bb2a64d51c 100644

> --- a/drivers/scsi/megaraid/megaraid_sas_base.c

> +++ b/drivers/scsi/megaraid/megaraid_sas_base.c

> @@ -37,6 +37,7 @@

>  #include <linux/poll.h>

>  #include <linux/vmalloc.h>

>  #include <linux/irq_poll.h>

> +#include <linux/blk-mq-pci.h>

>

>  #include <scsi/scsi.h>

>  #include <scsi/scsi_cmnd.h>

> @@ -3115,6 +3116,19 @@ megasas_bios_param(struct scsi_device *sdev,

> struct block_device *bdev,

>  	return 0;

>  }

>

> +static int megasas_map_queues(struct Scsi_Host *shost) {

> +	struct megasas_instance *instance;

> +

> +	instance = (struct megasas_instance *)shost->hostdata;

> +

> +	if (!instance->smp_affinity_enable)

> +		return 0;

> +

> +	return blk_mq_pci_map_queues(&shost-

> >tag_set.map[HCTX_TYPE_DEFAULT],

> +			instance->pdev,

instance->low_latency_index_start);
> +}

> +

>  static void megasas_aen_polling(struct work_struct *work);

>

>  /**

> @@ -3423,8 +3437,10 @@ static struct scsi_host_template

> megasas_template = {

>  	.eh_timed_out = megasas_reset_timer,

>  	.shost_attrs = megaraid_host_attrs,

>  	.bios_param = megasas_bios_param,

> +	.map_queues = megasas_map_queues,

>  	.change_queue_depth = scsi_change_queue_depth,

>  	.max_segment_size = 0xffffffff,

> +	.host_tagset = 1,

>  };

>

>  /**

> @@ -5708,34 +5724,6 @@ megasas_setup_jbod_map(struct

> megasas_instance *instance)

>  		instance->use_seqnum_jbod_fp = false;  }

>

> -static void megasas_setup_reply_map(struct megasas_instance *instance)

-{
> -	const struct cpumask *mask;

> -	unsigned int queue, cpu, low_latency_index_start;

> -

> -	low_latency_index_start = instance->low_latency_index_start;

> -

> -	for (queue = low_latency_index_start; queue < instance-

> >msix_vectors; queue++) {

> -		mask = pci_irq_get_affinity(instance->pdev, queue);

> -		if (!mask)

> -			goto fallback;

> -

> -		for_each_cpu(cpu, mask)

> -			instance->reply_map[cpu] = queue;

> -	}

> -	return;

> -

> -fallback:

> -	queue = low_latency_index_start;

> -	for_each_possible_cpu(cpu) {

> -		instance->reply_map[cpu] = queue;

> -		if (queue == (instance->msix_vectors - 1))

> -			queue = low_latency_index_start;

> -		else

> -			queue++;

> -	}

> -}

> -

>  /**

>   * megasas_get_device_list -	Get the PD and LD device list from FW.

>   * @instance:			Adapter soft state

> @@ -6158,8 +6146,6 @@ static int megasas_init_fw(struct megasas_instance

> *instance)

>  			goto fail_init_adapter;

>  	}

>

> -	megasas_setup_reply_map(instance);

> -

>  	dev_info(&instance->pdev->dev,

>  		"current msix/online cpus\t: (%d/%d)\n",

>  		instance->msix_vectors, (unsigned int)num_online_cpus());

> @@ -6793,6 +6779,9 @@ static int megasas_io_attach(struct

> megasas_instance *instance)

>  	host->max_id = MEGASAS_MAX_DEV_PER_CHANNEL;

>  	host->max_lun = MEGASAS_MAX_LUN;

>  	host->max_cmd_len = 16;

> +	if (instance->adapter_type != MFI_SERIES && instance->msix_vectors

> > 0)

> +		host->nr_hw_queues = instance->msix_vectors -

> +			instance->low_latency_index_start;

>

>  	/*

>  	 * Notify the mid-layer about the new controller @@ -6960,11

> +6949,6 @@ static inline int megasas_alloc_mfi_ctrl_mem(struct

> megasas_instance *instance)

>   */

>  static int megasas_alloc_ctrl_mem(struct megasas_instance *instance)  {

> -	instance->reply_map = kcalloc(nr_cpu_ids, sizeof(unsigned int),

> -				      GFP_KERNEL);

> -	if (!instance->reply_map)

> -		return -ENOMEM;

> -

>  	switch (instance->adapter_type) {

>  	case MFI_SERIES:

>  		if (megasas_alloc_mfi_ctrl_mem(instance))

> @@ -6981,8 +6965,6 @@ static int megasas_alloc_ctrl_mem(struct

> megasas_instance *instance)

>

>  	return 0;

>   fail:

> -	kfree(instance->reply_map);

> -	instance->reply_map = NULL;

>  	return -ENOMEM;

>  }

>

> @@ -6995,7 +6977,6 @@ static int megasas_alloc_ctrl_mem(struct

> megasas_instance *instance)

>   */

>  static inline void megasas_free_ctrl_mem(struct megasas_instance

> *instance)  {

> -	kfree(instance->reply_map);

>  	if (instance->adapter_type == MFI_SERIES) {

>  		if (instance->producer)

>  			dma_free_coherent(&instance->pdev->dev,

> sizeof(u32), @@ -7683,8 +7664,6 @@ megasas_resume(struct pci_dev

> *pdev)

>  			goto fail_reenable_msix;

>  	}

>

> -	megasas_setup_reply_map(instance);

> -

>  	if (instance->adapter_type != MFI_SERIES) {

>  		megasas_reset_reply_desc(instance);

>  		if (megasas_ioc_init_fusion(instance)) { diff --git

> a/drivers/scsi/megaraid/megaraid_sas_fusion.c

> b/drivers/scsi/megaraid/megaraid_sas_fusion.c

> index 319f241da4b6..8e25b700988e 100644

> --- a/drivers/scsi/megaraid/megaraid_sas_fusion.c

> +++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c

> @@ -373,24 +373,24 @@ megasas_get_msix_index(struct megasas_instance

> *instance,  {

>  	int sdev_busy;

>

> -	/* nr_hw_queue = 1 for MegaRAID */

> -	struct blk_mq_hw_ctx *hctx =

> -		scmd->device->request_queue->queue_hw_ctx[0];

> +	struct blk_mq_hw_ctx *hctx = scmd->request->mq_hctx;

>

>  	sdev_busy = atomic_read(&hctx->nr_active);

>

>  	if (instance->perf_mode == MR_BALANCED_PERF_MODE &&

> -	    sdev_busy > (data_arms * MR_DEVICE_HIGH_IOPS_DEPTH))

> +	    sdev_busy > (data_arms * MR_DEVICE_HIGH_IOPS_DEPTH)) {

>  		cmd->request_desc->SCSIIO.MSIxIndex =

>  			mega_mod64((atomic64_add_return(1, &instance-

> >high_iops_outstanding) /

>  					MR_HIGH_IOPS_BATCH_COUNT),

> instance->low_latency_index_start);

> -	else if (instance->msix_load_balance)

> +	} else if (instance->msix_load_balance) {

>  		cmd->request_desc->SCSIIO.MSIxIndex =

>  			(mega_mod64(atomic64_add_return(1, &instance-

> >total_io_count),

>  				instance->msix_vectors));

> -	else

> -		cmd->request_desc->SCSIIO.MSIxIndex =

> -			instance->reply_map[raw_smp_processor_id()];

> +	} else {

> +		u32 tag = blk_mq_unique_tag(scmd->request);

> +

> +		cmd->request_desc->SCSIIO.MSIxIndex =

> blk_mq_unique_tag_to_hwq(tag) + instance->low_latency_index_start;

> +	}

>  }

>

>  /**

> @@ -3326,7 +3326,7 @@ megasas_build_and_issue_cmd_fusion(struct

> megasas_instance *instance,  {

>  	struct megasas_cmd_fusion *cmd, *r1_cmd = NULL;

>  	union MEGASAS_REQUEST_DESCRIPTOR_UNION *req_desc;

> -	u32 index;

> +	u32 index, blk_tag, unique_tag;

>

>  	if ((megasas_cmd_type(scmd) == READ_WRITE_LDIO) &&

>  		instance->ldio_threshold &&

> @@ -3342,7 +3342,9 @@ megasas_build_and_issue_cmd_fusion(struct

> megasas_instance *instance,

>  		return SCSI_MLQUEUE_HOST_BUSY;

>  	}

>

> -	cmd = megasas_get_cmd_fusion(instance, scmd->request->tag);

> +	unique_tag = blk_mq_unique_tag(scmd->request);

> +	blk_tag = blk_mq_unique_tag_to_tag(unique_tag);

> +	cmd = megasas_get_cmd_fusion(instance, blk_tag);

>

>  	if (!cmd) {

>  		atomic_dec(&instance->fw_outstanding);

> @@ -3383,7 +3385,7 @@ megasas_build_and_issue_cmd_fusion(struct

> megasas_instance *instance,

>  	 */

>  	if (cmd->r1_alt_dev_handle != MR_DEVHANDLE_INVALID) {

>  		r1_cmd = megasas_get_cmd_fusion(instance,

> -				(scmd->request->tag + instance-

> >max_fw_cmds));

> +				(blk_tag + instance->max_fw_cmds));

>  		megasas_prepare_secondRaid1_IO(instance, cmd, r1_cmd);

>  	}

>

> --

> 2.26.2
John Garry July 6, 2020, 8:23 a.m. | #2
On 02/07/2020 11:23, Kashyap Desai wrote:
>>

>> From: Hannes Reinecke <hare@suse.com>

>>

>> Fusion adapters can steer completions to individual queues, and we now

> have

>> support for shared host-wide tags.

>> So we can enable multiqueue support for fusion adapters and drop the

> hand-

>> crafted interrupt affinity settings.

> 

> Shared host tag is primarily introduced for completeness of CPU hotplug as

> discussed earlier -

> https://lwn.net/Articles/819419/

> 

> How shall I test CPU hotplug on megaraid_sas driver ?


I have scripts like this:

----8<-----

# hotplug.sh
# enable all cpus in the system
./enable_all.sh

for((i = 0; i < 50 ; i++))
do
echo "Looping ... number $i"
# run fio on all cpus with 40 second runtime
./create_fio_task_cpu.sh 4k read 2048 1&
echo "short sleep, then disable"
sleep 5
# disable some set of cpus which means managed interrupts get shutdown
# like cpu1-50 from 0-63
./disable_all.sh
echo "long sleep $i"
sleep 50
echo "long sleep over number $i"
./enable_all.sh
sleep 3
done

----->8-----

# enable_all.sh
for((i=0; i<63; i++))
do
echo 1 > /sys/devices/system/cpu/cpu$i/online
done

--->8----

I hope to add such a test to blktests when I get a chance.

> My understanding is

> - This RFC + patch set from above link is required for it. I could not see

> above series is committed.


It is committed and part of 5.8-rc1

The latest rc should have some scheduler fixes also.

I also note that there has been much churn on blk-mq tag code lately, 
and something may be broken, so I plan to verify latest rc myself soon.

> Am I missing anything. ?


You could also add this from Hannes (and add megaraid sas support):

https://lore.kernel.org/linux-scsi/20200629072021.9864-1-hare@suse.de/T/#t

That is, if it is required. I am not sure if megaraid sas uses 
"internal" commands which needs to be guarded against cpu hotplug. Nor 
would any of these commands be used during a test. For hisi_sas testing, 
I did not bother adding support, and I guess that you don't need to either.

> 

> We do not want to completely move to shared host tag. It will be shared

> host tag support by default, but user should have choice to go back to

> legacy path.

> We will completely move to shared host tag path once it is stable and no

> more field issue observed over a period of time. -

> 

> Updated <megaraid_sas> patch will looks like this -

> 

> diff --git a/megaraid_sas_base.c b/megaraid_sas_base.c

> index 0066833..3b503cb 100644

> --- a/megaraid_sas_base.c

> +++ b/megaraid_sas_base.c

> @@ -37,6 +37,7 @@

>   #include <linux/poll.h>

>   #include <linux/vmalloc.h>

>   #include <linux/irq_poll.h>

> +#include <linux/blk-mq-pci.h>

> 

>   #include <scsi/scsi.h>

>   #include <scsi/scsi_cmnd.h>

> @@ -113,6 +114,10 @@ unsigned int enable_sdev_max_qd;

>   module_param(enable_sdev_max_qd, int, 0444);

>   MODULE_PARM_DESC(enable_sdev_max_qd, "Enable sdev max qd as can_queue.

> Default: 0");

> 

> +int host_tagset_disabled = 0;

> +module_param(host_tagset_disabled, int, 0444);

> +MODULE_PARM_DESC(host_tagset_disabled, "Shared host tagset enable/disable

> Default: enable(1)");


The logic seems inverted here: for passing 1, I would expect Shared host 
tagset enabled, while it actually means to disable, right?

> +

>   MODULE_LICENSE("GPL");

>   MODULE_VERSION(MEGASAS_VERSION);

>   MODULE_AUTHOR("megaraidlinux.pdl@broadcom.com");

> @@ -3115,6 +3120,18 @@ megasas_bios_param(struct scsi_device *sdev, struct

> block_device *bdev,

>          return 0;

>   }

> 

> +static int megasas_map_queues(struct Scsi_Host *shost)

> +{

> +       struct megasas_instance *instance;

> +       instance = (struct megasas_instance *)shost->hostdata;

> +

> +       if (instance->host->nr_hw_queues == 1)

> +               return 0;

> +

> +       return

> blk_mq_pci_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT],

> +                       instance->pdev,

> instance->low_latency_index_start);

> +}

> +

>   static void megasas_aen_polling(struct work_struct *work);

> 

>   /**

> @@ -3423,8 +3440,10 @@ static struct scsi_host_template megasas_template =

> {

>          .eh_timed_out = megasas_reset_timer,

>          .shost_attrs = megaraid_host_attrs,

>          .bios_param = megasas_bios_param,

> +       .map_queues = megasas_map_queues,

>          .change_queue_depth = scsi_change_queue_depth,

>          .max_segment_size = 0xffffffff,

> +       .host_tagset = 1,


Is your intention to always have this set for Scsi_Host, and just change 
nr_hw_queues?

>   };

> 

>   /**

> @@ -6793,7 +6812,21 @@ static int megasas_io_attach(struct

> megasas_instance *instance)

>          host->max_id = MEGASAS_MAX_DEV_PER_CHANNEL;

>          host->max_lun = MEGASAS_MAX_LUN;

>          host->max_cmd_len = 16;

> +       host->nr_hw_queues = 1;

> 

> +       /* Use shared host tagset only for fusion adaptors

> +        * if there are more than one managed interrupts.

> +        */

> +       if ((instance->adapter_type != MFI_SERIES) &&

> +               (instance->msix_vectors > 0) &&

> +               !host_tagset_disabled &&

> +               instance->smp_affinity_enable)

> +               host->nr_hw_queues = instance->msix_vectors -

> +                       instance->low_latency_index_start;

> +

> +       dev_info(&instance->pdev->dev, "Max firmware commands: %d"

> +               " for nr_hw_queues = %d\n", instance->max_fw_cmds,

> +               host->nr_hw_queues);


note: it may be good for us to add a nr_hw_queues file to scsi host 
sysfs folder

>          /*

>           * Notify the mid-layer about the new controller

>           */

> @@ -8842,6 +8875,7 @@ static int __init megasas_init(void)

>                  msix_vectors = 1;

>                  rdpq_enable = 0;

>                  dual_qdepth_disable = 1;

> +               host_tagset_disabled = 1;

>          }

> 


Thanks,
John
Hannes Reinecke July 6, 2020, 8:45 a.m. | #3
On 7/6/20 10:23 AM, John Garry wrote:
> On 02/07/2020 11:23, Kashyap Desai wrote:

[ .. ]
>> My understanding is

>> - This RFC + patch set from above link is required for it. I could not

>> see

>> above series is committed.

> 

> It is committed and part of 5.8-rc1

> 

> The latest rc should have some scheduler fixes also.

> 

> I also note that there has been much churn on blk-mq tag code lately,

> and something may be broken, so I plan to verify latest rc myself soon.

> 

>> Am I missing anything. ?

> 

> You could also add this from Hannes (and add megaraid sas support):

> 

> https://lore.kernel.org/linux-scsi/20200629072021.9864-1-hare@suse.de/T/#t

> 

> That is, if it is required. I am not sure if megaraid sas uses

> "internal" commands which needs to be guarded against cpu hotplug. Nor

> would any of these commands be used during a test. For hisi_sas testing,

> I did not bother adding support, and I guess that you don't need to either.

> 

Oh, it certainly uses internal commands, most notably to set up the
queue mapping :-)

The idea of the 'internal command' patchset is to let the block-layer
control _all_ tags, be it for internal or 'normal' I/O.
With that we can do away with all the tag mapping etc these drivers have
to do (cf the hpsa conversion for an example).
And only then we can safely use the blk layer busy iterators, knowing
that all tags are accounted for and a 1:1 mapping between tags and
internal hardware resources exist.
Originally I thought it would help for CPU hotplug, too, but typically
the internal commands are not bound to any specific CPU, so they
typically will not accounted for when looking at the CPU-related resources.
But that depends on the driver etc, so it's hard to give a guideline.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		           Kernel Storage Architect
hare@suse.de			                  +49 911 74053 688
SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), GF: Felix Imendörffer
John Garry July 6, 2020, 9:26 a.m. | #4
On 06/07/2020 09:45, Hannes Reinecke wrote:
> Originally I thought it would help for CPU hotplug, too, but typically

> the internal commands are not bound to any specific CPU, 


When we alloc the request in scsi_get_internal_cmd() - > 
blk_mq_alloc_request() -> __blk_mq_alloc_request(), the request will 
have an associated hctx.

As such, I would expect the LLDD to honor this, in that it should use 
the hwq associated with the hctx to send/receive the command.

And from that, the hwq managed interrupt should not be shut down until 
the queue is drained, including internal commands.

Is there something wrong with this idea?

Thanks,
John

> so they

> typically will not accounted for when looking at the CPU-related resources.

> But that depends on the driver etc, so it's hard to give a guideline.
Hannes Reinecke July 6, 2020, 9:40 a.m. | #5
On 7/6/20 11:26 AM, John Garry wrote:
> On 06/07/2020 09:45, Hannes Reinecke wrote:

>> Originally I thought it would help for CPU hotplug, too, but typically

>> the internal commands are not bound to any specific CPU, 

> 

> When we alloc the request in scsi_get_internal_cmd() - >

> blk_mq_alloc_request() -> __blk_mq_alloc_request(), the request will

> have an associated hctx.

> 

> As such, I would expect the LLDD to honor this, in that it should use

> the hwq associated with the hctx to send/receive the command.

> 

> And from that, the hwq managed interrupt should not be shut down until

> the queue is drained, including internal commands.

> 

> Is there something wrong with this idea?

> 

Oh, no, not at all.

What I'm referring to are driver internals; some driver allow (or set)
the MSIx vector only for 'normal' I/O; internal commands don't have an
MSIx vector set.
As such it's pretty much driver-dependent _where_ they end up, be it on
the same CPU, or on any CPU who might be listening.
Be it as it may, when waiting for hctx to drain we are pretty safe, and
hence CPU hotplug will be improved here.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		           Kernel Storage Architect
hare@suse.de			                  +49 911 74053 688
SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), GF: Felix Imendörffer
Kashyap Desai July 6, 2020, 7:19 p.m. | #6
>

> On 02/07/2020 11:23, Kashyap Desai wrote:

> >>

> >> From: Hannes Reinecke <hare@suse.com>

> >>

> >> Fusion adapters can steer completions to individual queues, and we

> >> now

> > have

> >> support for shared host-wide tags.

> >> So we can enable multiqueue support for fusion adapters and drop the

> > hand-

> >> crafted interrupt affinity settings.

> >

> > Shared host tag is primarily introduced for completeness of CPU

> > hotplug as discussed earlier - https://lwn.net/Articles/819419/

> >

> > How shall I test CPU hotplug on megaraid_sas driver ?

>

> I have scripts like this:

>

> ----8<-----

>

> # hotplug.sh

> # enable all cpus in the system

> ./enable_all.sh

>

> for((i = 0; i < 50 ; i++))

> do

> echo "Looping ... number $i"

> # run fio on all cpus with 40 second runtime ./create_fio_task_cpu.sh 4k

> read

> 2048 1& echo "short sleep, then disable"

> sleep 5

> # disable some set of cpus which means managed interrupts get shutdown #

> like cpu1-50 from 0-63 ./disable_all.sh echo "long sleep $i"

> sleep 50

> echo "long sleep over number $i"

> ./enable_all.sh

> sleep 3

> done

>

> ----->8-----

>

> # enable_all.sh

> for((i=0; i<63; i++))

> do

> echo 1 > /sys/devices/system/cpu/cpu$i/online

> done

>

> --->8----

>

> I hope to add such a test to blktests when I get a chance.

>

> > My understanding is

> > - This RFC + patch set from above link is required for it. I could not

> > see above series is committed.

>

> It is committed and part of 5.8-rc1

>

> The latest rc should have some scheduler fixes also.

>

> I also note that there has been much churn on blk-mq tag code lately, and

> something may be broken, so I plan to verify latest rc myself soon.


Thanks. I will try merging 5.8-rc1 and RFC and see how CPU hot plug works.

>

> > Am I missing anything. ?

>

> You could also add this from Hannes (and add megaraid sas support):

>

> https://lore.kernel.org/linux-scsi/20200629072021.9864-1-

> hare@suse.de/T/#t

>

> That is, if it is required. I am not sure if megaraid sas uses "internal"

> commands which needs to be guarded against cpu hotplug. Nor would any of

> these commands be used during a test. For hisi_sas testing, I did not

> bother

> adding support, and I guess that you don't need to either.


Megaraid driver use internal command but it is excluded from can_queue.
All internal command are mapped to msix index 0, which is non-managed. So we
are good w.r.t internal command.

>

> >

> > We do not want to completely move to shared host tag. It will be shared

> > host tag support by default, but user should have choice to go back to

> > legacy path.

> > We will completely move to shared host tag path once it is stable and no

> > more field issue observed over a period of time. -

> >

> > Updated <megaraid_sas> patch will looks like this -

> >

> > diff --git a/megaraid_sas_base.c b/megaraid_sas_base.c

> > index 0066833..3b503cb 100644

> > --- a/megaraid_sas_base.c

> > +++ b/megaraid_sas_base.c

> > @@ -37,6 +37,7 @@

> >   #include <linux/poll.h>

> >   #include <linux/vmalloc.h>

> >   #include <linux/irq_poll.h>

> > +#include <linux/blk-mq-pci.h>

> >

> >   #include <scsi/scsi.h>

> >   #include <scsi/scsi_cmnd.h>

> > @@ -113,6 +114,10 @@ unsigned int enable_sdev_max_qd;

> >   module_param(enable_sdev_max_qd, int, 0444);

> >   MODULE_PARM_DESC(enable_sdev_max_qd, "Enable sdev max qd as

> can_queue.

> > Default: 0");

> >

> > +int host_tagset_disabled = 0;

> > +module_param(host_tagset_disabled, int, 0444);

> > +MODULE_PARM_DESC(host_tagset_disabled, "Shared host tagset

> enable/disable

> > Default: enable(1)");

>

> The logic seems inverted here: for passing 1, I would expect Shared host

> tagset enabled, while it actually means to disable, right?


No. passing 1 means shared_hosttag support will be turned off.
>

> > +

> >   MODULE_LICENSE("GPL");

> >   MODULE_VERSION(MEGASAS_VERSION);

> >   MODULE_AUTHOR("megaraidlinux.pdl@broadcom.com");

> > @@ -3115,6 +3120,18 @@ megasas_bios_param(struct scsi_device *sdev,

> struct

> > block_device *bdev,

> >          return 0;

> >   }

> >

> > +static int megasas_map_queues(struct Scsi_Host *shost)

> > +{

> > +       struct megasas_instance *instance;

> > +       instance = (struct megasas_instance *)shost->hostdata;

> > +

> > +       if (instance->host->nr_hw_queues == 1)

> > +               return 0;

> > +

> > +       return

> > blk_mq_pci_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT],

> > +                       instance->pdev,

> > instance->low_latency_index_start);

> > +}

> > +

> >   static void megasas_aen_polling(struct work_struct *work);

> >

> >   /**

> > @@ -3423,8 +3440,10 @@ static struct scsi_host_template

> megasas_template =

> > {

> >          .eh_timed_out = megasas_reset_timer,

> >          .shost_attrs = megaraid_host_attrs,

> >          .bios_param = megasas_bios_param,

> > +       .map_queues = megasas_map_queues,

> >          .change_queue_depth = scsi_change_queue_depth,

> >          .max_segment_size = 0xffffffff,

> > +       .host_tagset = 1,

>

> Is your intention to always have this set for Scsi_Host, and just change

> nr_hw_queues?


Actually I wanted to turn off  this feature using host_tagset and not
through nr_hw_queue. I will address this.

Additional request -
In MR we have old controllers (called MFI_SERIES). We prefer not to change
behavior for those controller.
Having host_tagset in template does not allow to cherry pick different
values for different type of controller.
If host_tagset is part of Scsi_Host OR we add check in scsi_lib.c that
host_tagset = 1 only make sense if nr_hw_queues > 1, we can cherry pick in
driver.


>

> >   };

> >

> >   /**

> > @@ -6793,7 +6812,21 @@ static int megasas_io_attach(struct

> > megasas_instance *instance)

> >          host->max_id = MEGASAS_MAX_DEV_PER_CHANNEL;

> >          host->max_lun = MEGASAS_MAX_LUN;

> >          host->max_cmd_len = 16;

> > +       host->nr_hw_queues = 1;

> >

> > +       /* Use shared host tagset only for fusion adaptors

> > +        * if there are more than one managed interrupts.

> > +        */

> > +       if ((instance->adapter_type != MFI_SERIES) &&

> > +               (instance->msix_vectors > 0) &&

> > +               !host_tagset_disabled &&

> > +               instance->smp_affinity_enable)

> > +               host->nr_hw_queues = instance->msix_vectors -

> > +                       instance->low_latency_index_start;

> > +

> > +       dev_info(&instance->pdev->dev, "Max firmware commands: %d"

> > +               " for nr_hw_queues = %d\n", instance->max_fw_cmds,

> > +               host->nr_hw_queues);

>

> note: it may be good for us to add a nr_hw_queues file to scsi host

> sysfs folder


I will accommodate this.

>

> >          /*

> >           * Notify the mid-layer about the new controller

> >           */

> > @@ -8842,6 +8875,7 @@ static int __init megasas_init(void)

> >                  msix_vectors = 1;

> >                  rdpq_enable = 0;

> >                  dual_qdepth_disable = 1;

> > +               host_tagset_disabled = 1;

> >          }

> >

>

> Thanks,

> John
John Garry July 7, 2020, 7:58 a.m. | #7
>>>

>>>    #include <scsi/scsi.h>

>>>    #include <scsi/scsi_cmnd.h>

>>> @@ -113,6 +114,10 @@ unsigned int enable_sdev_max_qd;

>>>    module_param(enable_sdev_max_qd, int, 0444);

>>>    MODULE_PARM_DESC(enable_sdev_max_qd, "Enable sdev max qd as

>> can_queue.

>>> Default: 0");

>>>

>>> +int host_tagset_disabled = 0;

>>> +module_param(host_tagset_disabled, int, 0444);

>>> +MODULE_PARM_DESC(host_tagset_disabled, "Shared host tagset

>> enable/disable

>>> Default: enable(1)");

>> The logic seems inverted here: for passing 1, I would expect Shared host

>> tagset enabled, while it actually means to disable, right?

> No. passing 1 means shared_hosttag support will be turned off.


Just reading "Shared host tagset enable/disable Default: enable(1)" 
looked inconsistent to me.

>>> +

>>>    MODULE_LICENSE("GPL");

>>>    MODULE_VERSION(MEGASAS_VERSION);

>>>    MODULE_AUTHOR("megaraidlinux.pdl@broadcom.com");

>>> @@ -3115,6 +3120,18 @@ megasas_bios_param(struct scsi_device *sdev,

>> struct

>>> block_device *bdev,

>>>           return 0;

>>>    }

>>>

>>> +static int megasas_map_queues(struct Scsi_Host *shost)

>>> +{

>>> +       struct megasas_instance *instance;

>>> +       instance = (struct megasas_instance *)shost->hostdata;

>>> +

>>> +       if (instance->host->nr_hw_queues == 1)

>>> +               return 0;

>>> +

>>> +       return

>>> blk_mq_pci_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT],

>>> +                       instance->pdev,

>>> instance->low_latency_index_start);

>>> +}

>>> +

>>>    static void megasas_aen_polling(struct work_struct *work);

>>>

>>>    /**

>>> @@ -3423,8 +3440,10 @@ static struct scsi_host_template

>> megasas_template =

>>> {

>>>           .eh_timed_out = megasas_reset_timer,

>>>           .shost_attrs = megaraid_host_attrs,

>>>           .bios_param = megasas_bios_param,

>>> +       .map_queues = megasas_map_queues,

>>>           .change_queue_depth = scsi_change_queue_depth,

>>>           .max_segment_size = 0xffffffff,

>>> +       .host_tagset = 1,

>> Is your intention to always have this set for Scsi_Host, and just change

>> nr_hw_queues?

> Actually I wanted to turn off  this feature using host_tagset and not

> through nr_hw_queue. I will address this.

> 

> Additional request -

> In MR we have old controllers (called MFI_SERIES). We prefer not to change

> behavior for those controller.

> Having host_tagset in template does not allow to cherry pick different

> values for different type of controller.


Ok, so it seems sensible to add host_tagset to Scsi_Host structure also, 
to allow overwriting during probe time.

If you want to share an updated megaraid sas driver patch based on that, 
then that's fine. I can incorporate that change in the patch where we 
add host_tagset to the scsi host template.

> If host_tagset is part of Scsi_Host OR we add check in scsi_lib.c that

> host_tagset = 1 only make sense if nr_hw_queues > 1, we can cherry pick in

> driver.

> 

>
Kashyap Desai July 7, 2020, 2:45 p.m. | #8
> >>>

> >>>    #include <scsi/scsi.h>

> >>>    #include <scsi/scsi_cmnd.h>

> >>> @@ -113,6 +114,10 @@ unsigned int enable_sdev_max_qd;

> >>>    module_param(enable_sdev_max_qd, int, 0444);

> >>>    MODULE_PARM_DESC(enable_sdev_max_qd, "Enable sdev max qd as

> >> can_queue.

> >>> Default: 0");

> >>>

> >>> +int host_tagset_disabled = 0;

> >>> +module_param(host_tagset_disabled, int, 0444);

> >>> +MODULE_PARM_DESC(host_tagset_disabled, "Shared host tagset

> >> enable/disable

> >>> Default: enable(1)");

> >> The logic seems inverted here: for passing 1, I would expect Shared

> >> host tagset enabled, while it actually means to disable, right?

> > No. passing 1 means shared_hosttag support will be turned off.

>

> Just reading "Shared host tagset enable/disable Default: enable(1)"

> looked inconsistent to me.


I will change to "host_tagset_enable" that will be good for readability.
Default value will of host_tagset_enable will be 1 and user can turnoff
passing 0.

>

> >>> +

> >>>    MODULE_LICENSE("GPL");

> >>>    MODULE_VERSION(MEGASAS_VERSION);

> >>>    MODULE_AUTHOR("megaraidlinux.pdl@broadcom.com");

> >>> @@ -3115,6 +3120,18 @@ megasas_bios_param(struct scsi_device

> *sdev,

> >> struct

> >>> block_device *bdev,

> >>>           return 0;

> >>>    }

> >>>

> >>> +static int megasas_map_queues(struct Scsi_Host *shost) {

> >>> +       struct megasas_instance *instance;

> >>> +       instance = (struct megasas_instance *)shost->hostdata;

> >>> +

> >>> +       if (instance->host->nr_hw_queues == 1)

> >>> +               return 0;

> >>> +

> >>> +       return

> >>> blk_mq_pci_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT],

> >>> +                       instance->pdev,

> >>> instance->low_latency_index_start);

> >>> +}

> >>> +

> >>>    static void megasas_aen_polling(struct work_struct *work);

> >>>

> >>>    /**

> >>> @@ -3423,8 +3440,10 @@ static struct scsi_host_template

> >> megasas_template =

> >>> {

> >>>           .eh_timed_out = megasas_reset_timer,

> >>>           .shost_attrs = megaraid_host_attrs,

> >>>           .bios_param = megasas_bios_param,

> >>> +       .map_queues = megasas_map_queues,

> >>>           .change_queue_depth = scsi_change_queue_depth,

> >>>           .max_segment_size = 0xffffffff,

> >>> +       .host_tagset = 1,

> >> Is your intention to always have this set for Scsi_Host, and just

> >> change nr_hw_queues?

> > Actually I wanted to turn off  this feature using host_tagset and not

> > through nr_hw_queue. I will address this.

> >

> > Additional request -

> > In MR we have old controllers (called MFI_SERIES). We prefer not to

> > change behavior for those controller.

> > Having host_tagset in template does not allow to cherry pick different

> > values for different type of controller.

>

> Ok, so it seems sensible to add host_tagset to Scsi_Host structure also,

> to

> allow overwriting during probe time.

>

> If you want to share an updated megaraid sas driver patch based on that,

> then

> that's fine. I can incorporate that change in the patch where we add

> host_tagset to the scsi host template.


If you share git repo link of next submission, I can send you megaraid_sas
driver patch which you can include in series.

Kashyap
>

> > If host_tagset is part of Scsi_Host OR we add check in scsi_lib.c that

> > host_tagset = 1 only make sense if nr_hw_queues > 1, we can cherry

> > pick in driver.

> >

> >

>

>
John Garry July 7, 2020, 4:17 p.m. | #9
On 07/07/2020 15:45, Kashyap Desai wrote:
>>>>>            .eh_timed_out = megasas_reset_timer,

>>>>>            .shost_attrs = megaraid_host_attrs,

>>>>>            .bios_param = megasas_bios_param,

>>>>> +       .map_queues = megasas_map_queues,

>>>>>            .change_queue_depth = scsi_change_queue_depth,

>>>>>            .max_segment_size = 0xffffffff,

>>>>> +       .host_tagset = 1,

>>>> Is your intention to always have this set for Scsi_Host, and just

>>>> change nr_hw_queues?

>>> Actually I wanted to turn off  this feature using host_tagset and not

>>> through nr_hw_queue. I will address this.

>>>

>>> Additional request -

>>> In MR we have old controllers (called MFI_SERIES). We prefer not to

>>> change behavior for those controller.

>>> Having host_tagset in template does not allow to cherry pick different

>>> values for different type of controller.

>> Ok, so it seems sensible to add host_tagset to Scsi_Host structure also,

>> to

>> allow overwriting during probe time.

>>

>> If you want to share an updated megaraid sas driver patch based on that,

>> then

>> that's fine. I can incorporate that change in the patch where we add

>> host_tagset to the scsi host template.

> If you share git repo link of next submission, I can send you megaraid_sas

> driver patch which you can include in series.


So this is my work-en-progress branch:

https://github.com/hisilicon/kernel-dev/commits/private-topic-blk-mq-shared-tags-rfc-v8

I just updated to include the change to have Scsi_Host.host_tagset in 
4291f617a02b commit ("scsi: Add host and host template flag 'host_tagset'")

megaraid sas support is not on the branch yet, but I think everything 
else required is. And it is mutable, so I'd clone it now if I were you - 
or just replace the required patch onto your v7 branch.

Thanks,
John
John Garry July 8, 2020, 11:31 a.m. | #10
On 06/07/2020 20:19, Kashyap Desai wrote:
>>> My understanding is

>>> - This RFC + patch set from above link is required for it. I could not

>>> see above series is committed.

>> It is committed and part of 5.8-rc1

>>

>> The latest rc should have some scheduler fixes also.

>>

>> I also note that there has been much churn on blk-mq tag code lately, and

>> something may be broken, so I plan to verify latest rc myself soon.

> Thanks. I will try merging 5.8-rc1 and RFC and see how CPU hot plug works.

> 


JFYI, I tested 5.8-rc4 for scheduler=none and =mq-deadline (no fully 
tested), and it looks ok

john
Kashyap Desai July 9, 2020, 7:01 p.m. | #11
> >> If you want to share an updated megaraid sas driver patch based on

> >> that, then that's fine. I can incorporate that change in the patch

> >> where we add host_tagset to the scsi host template.

> > If you share git repo link of next submission, I can send you

> > megaraid_sas driver patch which you can include in series.

>

> So this is my work-en-progress branch:

>

> https://github.com/hisilicon/kernel-dev/commits/private-topic-blk-mq-

> shared-tags-rfc-v8


I tested this repo + megaraid_sas shared hosttag driver. This repo (5.8-rc)
has CPU hotplug patch.
" bf0beec0607d blk-mq: drain I/O when all CPUs in a hctx are offline"

Looking at description of above patch and changes, it looks like
megaraid_sas driver can still work without shared host tag for this feature.

I observe CPU hotplug works irrespective of shared host tag in megaraid_sas
on 5.8-rc.

Without shared host tag, megaraid driver will expose single hctx and all the
CPU will be mapped to hctx0.
Any CPU offline event will have " blk_mq_hctx_notify_offline" callback in
blk-mq module. If we do not have this callback/patch, we will see IO
timeout.
blk_mq_hctx_notify_offline callback will make sure all the outstanding on
hctx0 is cleared and only after it is cleared, CPU will go offline.

megaraid_sas driver has  internal reply_queue mapping which helps to get IO
completion on same cpu.  Driver get msix index from that table based on "
raw_smp_processor_id".
If table is mapped correctly at probe time,  It is not possible to pick
entry of offline CPU.

Am I missing anything ?

If you can help me to understand why we need shared host tag for CPU
hotplug, I can try to frame some test case for possible reproduction.

>

> I just updated to include the change to have Scsi_Host.host_tagset in

> 4291f617a02b commit ("scsi: Add host and host template flag

> 'host_tagset'")

>

> megaraid sas support is not on the branch yet, but I think everything else

> required is. And it is mutable, so I'd clone it now if I were you - or

> just replace

> the required patch onto your v7 branch.


I am working on this.

>

> Thanks,

> John
John Garry July 10, 2020, 8:10 a.m. | #12
>>

>> https://github.com/hisilicon/kernel-dev/commits/private-topic-blk-mq-

>> shared-tags-rfc-v8

> I tested this repo + megaraid_sas shared hosttag driver. This repo (5.8-rc)

> has CPU hotplug patch.

> " bf0beec0607d blk-mq: drain I/O when all CPUs in a hctx are offline"

> 

> Looking at description of above patch and changes, it looks like

> megaraid_sas driver can still work without shared host tag for this feature.

> 

> I observe CPU hotplug works irrespective of shared host tag


Can you be clear exactly what you mean by "irrespective of shared host tag"?

Do you mean that for your test Scsi_Host.nr_hw_queues is set to expose 
hw queues and scsi_host_template.map_queues = blk_mq_pci_map_queues(), 
but you just don't set the host_tagset flag?

  in megaraid_sas
> on 5.8-rc.

> 

> Without shared host tag, megaraid driver will expose single hctx and all the

> CPU will be mapped to hctx0.


right

> Any CPU offline event will have " blk_mq_hctx_notify_offline" callback in

> blk-mq module. If we do not have this callback/patch, we will see IO

> timeout.

> blk_mq_hctx_notify_offline callback will make sure all the outstanding on

> hctx0 is cleared and only after it is cleared, CPU will go offline.


But that is only for when the last CPU for the hctx is going offline. If 
nr_hw_queues == 1, then hctx0 would cover all CPUs, so that would never 
occur during normal operation. See initial check in 
blk_mq_hctx_notify_offline():

static int blk_mq_hctx_notify_offline(unsigned int cpu, struct 
hlist_node *node)
{
	if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
	    !blk_mq_last_cpu_in_hctx(cpu, hctx))
		return 0;

> 

> megaraid_sas driver has  internal reply_queue mapping which helps to get IO

> completion on same cpu.  Driver get msix index from that table based on "

> raw_smp_processor_id".

> If table is mapped correctly at probe time,  It is not possible to pick

> entry of offline CPU.

> 

> Am I missing anything ?


Not sure, I think I need to be clear exactly what you're doing.

> 

> If you can help me to understand why we need shared host tag for CPU

> hotplug, I can try to frame some test case for possible reproduction.


I think it's best explained in cover letter for "blk-mq: Facilitate a 
shared sbitmap per tagset".

See points "HBA HW queues are required to be mapped to that of the 
blk-mq hctx", "HBA LLDD would have to generate this tag internally", and 
"blk-mq assumes the host may accept (Scsi_host.can_queue * #hw queue) 
commands".
> 

>> I just updated to include the change to have Scsi_Host.host_tagset in

>> 4291f617a02b commit ("scsi: Add host and host template flag

>> 'host_tagset'")

>>

>> megaraid sas support is not on the branch yet, but I think everything else

>> required is. And it is mutable, so I'd clone it now if I were you - or

>> just replace

>> the required patch onto your v7 branch.

> I am working on this.

> 


Great, thanks
Kashyap Desai July 13, 2020, 7:55 a.m. | #13
> > Looking at description of above patch and changes, it looks like

> > megaraid_sas driver can still work without shared host tag for this

> > feature.

> >

> > I observe CPU hotplug works irrespective of shared host tag

>

> Can you be clear exactly what you mean by "irrespective of shared host

> tag"?

>

> Do you mean that for your test Scsi_Host.nr_hw_queues is set to expose hw

> queues and scsi_host_template.map_queues = blk_mq_pci_map_queues(),

> but you just don't set the host_tagset flag?


Yes. I only disabled "host_tagset". <map_queue> is still hooked.

>

>   in megaraid_sas

> > on 5.8-rc.

> >

> > Without shared host tag, megaraid driver will expose single hctx and

> > all the CPU will be mapped to hctx0.

>

> right

>

> > Any CPU offline event will have " blk_mq_hctx_notify_offline" callback

> > in blk-mq module. If we do not have this callback/patch, we will see

> > IO timeout.

> > blk_mq_hctx_notify_offline callback will make sure all the outstanding

> > on

> > hctx0 is cleared and only after it is cleared, CPU will go offline.

>

> But that is only for when the last CPU for the hctx is going offline. If

> nr_hw_queues == 1, then hctx0 would cover all CPUs, so that would never

> occur during normal operation. See initial check in

> blk_mq_hctx_notify_offline():

>

> static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node

> *node) {

> 	if (!cpumask_test_cpu(cpu, hctx->cpumask) ||

> 	    !blk_mq_last_cpu_in_hctx(cpu, hctx))

> 		return 0;

>


Thanks John for this pointer. I missed this part and now I understood what
was happening in my testing.
There were more than one CPU mapped to one msix index in my earlier testing
and because of that I could see Interrupt migration happens on available CPU
from affinity mask. So my earlier testing was incorrect.

Now I am consistently able to reproduce issue - Best setup is have 1:1
mapping of CPU to MSIX vector mapping. I used 128 logical CPU and 128 msix
vectors and I noticed IO timeout without this RFC (without host_tagset).
I did not noticed IO timeout with RFC (with host_tagset.) I will update this
data in Driver's commit message.

Just for my understanding -
What if we have below code in blk_mq_hctx_notify_offline, CPU hotplug should
work for megaraid_sas driver without this RFC (without shared host tagset).
Right ?
If answer is yes, will there be any side effect of having below code in
block layer ?

static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node
 *node) {
 	if (hctx->queue->nr_hw_queues > 1
	    && (!cpumask_test_cpu(cpu, hctx->cpumask) ||
 	    !blk_mq_last_cpu_in_hctx(cpu, hctx)))
 		return 0;

I also noticed nr_hw_queues are now exposed in sysfs -

/sys/devices/pci0000:85/0000:85:00.0/0000:86:00.0/0000:87:04.0/0000:8b:00.0/0000:8c:00.0/0000:8d:00.0/host14/scsi_host/host14/nr_hw_queues:128
John Garry July 13, 2020, 8:42 a.m. | #14
On 13/07/2020 08:55, Kashyap Desai wrote:
>> ring normal operation. See initial check in

>> blk_mq_hctx_notify_offline():

>>

>> static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node

>> *node) {

>> 	if (!cpumask_test_cpu(cpu, hctx->cpumask) ||

>> 	    !blk_mq_last_cpu_in_hctx(cpu, hctx))

>> 		return 0;

>>

> Thanks John for this pointer. I missed this part and now I understood what

> was happening in my testing.


JFYI, I always have this as a sanity check for my testing:

void irq_shutdown(struct irq_desc *desc)
{
+	pr_err("%s irq%d\n", __func__, desc->irq_data.irq);
+
	if (irqd_is_started(&desc->irq_data)) {
		desc->depth = 1;
		if (desc->irq_data.chip->irq_shutdown) {

> There were more than one CPU mapped to one msix index in my earlier testing

> and because of that I could see Interrupt migration happens on available CPU

> from affinity mask. So my earlier testing was incorrect.

> 

> Now I am consistently able to reproduce issue - Best setup is have 1:1

> mapping of CPU to MSIX vector mapping. I used 128 logical CPU and 128 msix

> vectors and I noticed IO timeout without this RFC (without host_tagset).

> I did not noticed IO timeout with RFC (with host_tagset.) I will update this

> data in Driver's commit message.


ok, great. That's what we want. I'm not sure exactly what your test 
consists of, though.

> 

> Just for my understanding -

> What if we have below code in blk_mq_hctx_notify_offline, CPU hotplug should

> work for megaraid_sas driver without this RFC (without shared host tagset).

> Right ?


> If answer is yes, will there be any side effect of having below code in

> block layer ?

> 


Sorry, I'm sure sure what you're getting at with this change, below.

It seems that you're trying to drain hctx0 (which is your only hctx, as 
nr_hw_queues = 0 without this patchset) and set it inactive whenever any 
CPU is offlined. If so, that's not right.

> static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node

>   *node) {

>   	if (hctx->queue->nr_hw_queues > 1

> 	    && (!cpumask_test_cpu(cpu, hctx->cpumask) ||

>   	    !blk_mq_last_cpu_in_hctx(cpu, hctx)))

>   		return 0;

> 

> I also noticed nr_hw_queues are now exposed in sysfs -

> 

> /sys/devices/pci0000:85/0000:85:00.0/0000:86:00.0/0000:87:04.0/0000:8b:00.0/0000:8c:00.0/0000:8d:00.0/host14/scsi_host/host14/nr_hw_queues:128

> .


That's on my v8 wip branch, so I guess you're picking it up from there.

Thanks,
John
Kashyap Desai July 19, 2020, 7:07 p.m. | #15
> > I also noticed nr_hw_queues are now exposed in sysfs -

> >

> > /sys/devices/pci0000:85/0000:85:00.0/0000:86:00.0/0000:87:04.0/0000:8b

> >

> :00.0/0000:8c:00.0/0000:8d:00.0/host14/scsi_host/host14/nr_hw_queues:1

> > 28

> > .

>

> That's on my v8 wip branch, so I guess you're picking it up from there.


John - I did more testing on v8 wip branch.  CPU hotplug is working as
expected, but I still see some performance issue on Logical Volumes.

I created 8 Drives Raid-0 VD on MR controller and below is performance
impact of this RFC. Looks like contention is on single <sdev>.

I used command - "numactl -N 1  fio
1vd.fio --iodepth=128 --bs=4k --rw=randread
--cpus_allowed_policy=split --ioscheduler=none
 --group_reporting --runtime=200 --numjobs=1"
IOPS without RFC = 300K IOPS with RFC = 230K.

Perf top (shared host tag. IOPS = 230K)

13.98%  [kernel]        [k] sbitmap_any_bit_set
     6.43%  [kernel]        [k] blk_mq_run_hw_queue
     6.00%  [kernel]        [k] __audit_syscall_exit
     3.47%  [kernel]        [k] read_tsc
     3.19%  [megaraid_sas]  [k] complete_cmd_fusion
     3.19%  [kernel]        [k] irq_entries_start
     2.75%  [kernel]        [k] blk_mq_run_hw_queues
     2.45%  fio             [.] fio_gettime
     1.76%  [kernel]        [k] entry_SYSCALL_64
     1.66%  [kernel]        [k] add_interrupt_randomness
     1.48%  [megaraid_sas]  [k] megasas_build_and_issue_cmd_fusion
     1.42%  [kernel]        [k] copy_user_generic_string
     1.36%  [kernel]        [k] scsi_queue_rq
     1.03%  [kernel]        [k] kmem_cache_alloc
     1.03%  [kernel]        [k] internal_get_user_pages_fast
     1.01%  [kernel]        [k] swapgs_restore_regs_and_return_to_usermode
     0.96%  [kernel]        [k] kmem_cache_free
     0.88%  [kernel]        [k] blkdev_direct_IO
     0.84%  fio             [.] td_io_queue
     0.83%  [kernel]        [k] __get_user_4

Perf top (shared host tag. IOPS = 300K)

    6.36%  [kernel]        [k] unroll_tree_refs
     5.77%  [kernel]        [k] __do_softirq
     4.56%  [kernel]        [k] irq_entries_start
     4.38%  [kernel]        [k] read_tsc
     3.95%  [megaraid_sas]  [k] complete_cmd_fusion
     3.21%  fio             [.] fio_gettime
     2.98%  [kernel]        [k] add_interrupt_randomness
     1.79%  [kernel]        [k] entry_SYSCALL_64
     1.61%  [kernel]        [k] copy_user_generic_string
     1.61%  [megaraid_sas]  [k] megasas_build_and_issue_cmd_fusion
     1.34%  [kernel]        [k] scsi_queue_rq
     1.11%  [kernel]        [k] kmem_cache_free
     1.05%  [kernel]        [k] blkdev_direct_IO
     1.05%  [kernel]        [k] internal_get_user_pages_fast
     1.00%  [kernel]        [k] __memset
     1.00%  fio             [.] td_io_queue
     0.98%  [kernel]        [k] kmem_cache_alloc
     0.94%  [kernel]        [k] __get_user_4
     0.93%  [kernel]        [k] lookup_ioctx
     0.88%  [kernel]        [k] sbitmap_any_bit_set

Kashyap

>

> Thanks,

> John
Kashyap Desai July 20, 2020, 7:23 a.m. | #16
> > > I also noticed nr_hw_queues are now exposed in sysfs -

> > >

> > >

> /sys/devices/pci0000:85/0000:85:00.0/0000:86:00.0/0000:87:04.0/0000:8b

> > >

> >

> :00.0/0000:8c:00.0/0000:8d:00.0/host14/scsi_host/host14/nr_hw_queues:1

> > > 28

> > > .

> >

> > That's on my v8 wip branch, so I guess you're picking it up from there.

>

> John - I did more testing on v8 wip branch.  CPU hotplug is working as

> expected, but I still see some performance issue on Logical Volumes.

>

> I created 8 Drives Raid-0 VD on MR controller and below is performance

> impact of this RFC. Looks like contention is on single <sdev>.

>

> I used command - "numactl -N 1  fio 1vd.fio --iodepth=128 --bs=4k --

> rw=randread --cpus_allowed_policy=split --ioscheduler=none --

> group_reporting --runtime=200 --numjobs=1"

> IOPS without RFC = 300K IOPS with RFC = 230K.

>

> Perf top (shared host tag. IOPS = 230K)

>

> 13.98%  [kernel]        [k] sbitmap_any_bit_set

>      6.43%  [kernel]        [k] blk_mq_run_hw_queue


blk_mq_run_hw_queue function take more CPU which is called from "
scsi_end_request"
It looks like " blk_mq_hctx_has_pending" handles only elevator (scheduler)
case. If  queue has ioscheduler=none, we can skip. I case of scheduler=none,
IO will be pushed to hardware queue and it by pass software queue.
Based on above understanding, I added below patch and I can see performance
scale back to expectation.

Ming mentioned that - we cannot remove blk_mq_run_hw_queues() from IO
completion path otherwise we may see IO hang. So I have just modified
completion path assuming it is only required for IO scheduler case.
https://www.spinics.net/lists/linux-block/msg55049.html

Please review and let me know if this is good or we have to address with
proper fix.

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1be7ac5a4040..b6a5b41b7fc2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1559,6 +1559,9 @@ void blk_mq_run_hw_queues(struct request_queue *q,
bool async)
        struct blk_mq_hw_ctx *hctx;
        int i;

+       if (!q->elevator)
+               return;
+
        queue_for_each_hw_ctx(q, hctx, i) {
                if (blk_mq_hctx_stopped(hctx))
                        continue;

Kashyap

>      6.00%  [kernel]        [k] __audit_syscall_exit

>      3.47%  [kernel]        [k] read_tsc

>      3.19%  [megaraid_sas]  [k] complete_cmd_fusion

>      3.19%  [kernel]        [k] irq_entries_start

>      2.75%  [kernel]        [k] blk_mq_run_hw_queues

>      2.45%  fio             [.] fio_gettime

>      1.76%  [kernel]        [k] entry_SYSCALL_64

>      1.66%  [kernel]        [k] add_interrupt_randomness

>      1.48%  [megaraid_sas]  [k] megasas_build_and_issue_cmd_fusion

>      1.42%  [kernel]        [k] copy_user_generic_string

>      1.36%  [kernel]        [k] scsi_queue_rq

>      1.03%  [kernel]        [k] kmem_cache_alloc

>      1.03%  [kernel]        [k] internal_get_user_pages_fast

>      1.01%  [kernel]        [k] swapgs_restore_regs_and_return_to_usermode

>      0.96%  [kernel]        [k] kmem_cache_free

>      0.88%  [kernel]        [k] blkdev_direct_IO

>      0.84%  fio             [.] td_io_queue

>      0.83%  [kernel]        [k] __get_user_4

>

> Perf top (shared host tag. IOPS = 300K)

>

>     6.36%  [kernel]        [k] unroll_tree_refs

>      5.77%  [kernel]        [k] __do_softirq

>      4.56%  [kernel]        [k] irq_entries_start

>      4.38%  [kernel]        [k] read_tsc

>      3.95%  [megaraid_sas]  [k] complete_cmd_fusion

>      3.21%  fio             [.] fio_gettime

>      2.98%  [kernel]        [k] add_interrupt_randomness

>      1.79%  [kernel]        [k] entry_SYSCALL_64

>      1.61%  [kernel]        [k] copy_user_generic_string

>      1.61%  [megaraid_sas]  [k] megasas_build_and_issue_cmd_fusion

>      1.34%  [kernel]        [k] scsi_queue_rq

>      1.11%  [kernel]        [k] kmem_cache_free

>      1.05%  [kernel]        [k] blkdev_direct_IO

>      1.05%  [kernel]        [k] internal_get_user_pages_fast

>      1.00%  [kernel]        [k] __memset

>      1.00%  fio             [.] td_io_queue

>      0.98%  [kernel]        [k] kmem_cache_alloc

>      0.94%  [kernel]        [k] __get_user_4

>      0.93%  [kernel]        [k] lookup_ioctx

>      0.88%  [kernel]        [k] sbitmap_any_bit_set

>

> Kashyap

>

> >

> > Thanks,

> > John
John Garry July 20, 2020, 9:18 a.m. | #17
On 20/07/2020 08:23, Kashyap Desai wrote:

Hi Kashyap,

>> John - I did more testing on v8 wip branch.  CPU hotplug is working as

>> expected, 


Good to hear.

but I still see some performance issue on Logical Volumes.
>>

>> I created 8 Drives Raid-0 VD on MR controller and below is performance

>> impact of this RFC. Looks like contention is on single <sdev>.

>>

>> I used command - "numactl -N 1  fio 1vd.fio --iodepth=128 --bs=4k --

>> rw=randread --cpus_allowed_policy=split --ioscheduler=none --

>> group_reporting --runtime=200 --numjobs=1"

>> IOPS without RFC = 300K IOPS with RFC = 230K.

>>

>> Perf top (shared host tag. IOPS = 230K)

>>

>> 13.98%  [kernel]        [k] sbitmap_any_bit_set


I guess that this comes from the blk_mq_hctx_has_pending() -> 
sbitmap_any_bit_set(&hctx->ctx_map) call. The 
list_empty_careful(&hctx->dispatch) and blk_mq_sched_has_work(hctx) 
[when scheduler=none] calls look pretty lightweight.

>>       6.43%  [kernel]        [k] blk_mq_run_hw_queue

> blk_mq_run_hw_queue function take more CPU which is called from "

> scsi_end_request"

> It looks like " blk_mq_hctx_has_pending" handles only elevator (scheduler)

> case. If  queue has ioscheduler=none, we can skip. I case of scheduler=none,

> IO will be pushed to hardware queue and it by pass software queue.

> Based on above understanding, I added below patch and I can see performance

> scale back to expectation.

> 

> Ming mentioned that - we cannot remove blk_mq_run_hw_queues() from IO

> completion path otherwise we may see IO hang. So I have just modified

> completion path assuming it is only required for IO scheduler case.

> https://www.spinics.net/lists/linux-block/msg55049.html

> 

> Please review and let me know if this is good or we have to address with

> proper fix.

> 


So what you're doing looks reasonable, but I would be concerned about 
missing the blk_mq_run_hw_queue() -> blk_mq_hctx_has_pending() -> 
list_empty_careful(&hctx->dispatch) check - if it's not really needed 
there, then why not remove?

Hi Ming, any opinion on this?

> diff --git a/block/blk-mq.c b/block/blk-mq.c

> index 1be7ac5a4040..b6a5b41b7fc2 100644

> --- a/block/blk-mq.c

> +++ b/block/blk-mq.c

> @@ -1559,6 +1559,9 @@ void blk_mq_run_hw_queues(struct request_queue *q,

> bool async)

>          struct blk_mq_hw_ctx *hctx;

>          int i;

> 

> +       if (!q->elevator)

> +               return;

> +

>          queue_for_each_hw_ctx(q, hctx, i) {

>                  if (blk_mq_hctx_stopped(hctx))

>                          continue;

> 


Thanks,
John
Ming Lei July 21, 2020, 1:13 a.m. | #18
On Mon, Jul 20, 2020 at 12:53:55PM +0530, Kashyap Desai wrote:
> > > > I also noticed nr_hw_queues are now exposed in sysfs -

> > > >

> > > >

> > /sys/devices/pci0000:85/0000:85:00.0/0000:86:00.0/0000:87:04.0/0000:8b

> > > >

> > >

> > :00.0/0000:8c:00.0/0000:8d:00.0/host14/scsi_host/host14/nr_hw_queues:1

> > > > 28

> > > > .

> > >

> > > That's on my v8 wip branch, so I guess you're picking it up from there.

> >

> > John - I did more testing on v8 wip branch.  CPU hotplug is working as

> > expected, but I still see some performance issue on Logical Volumes.

> >

> > I created 8 Drives Raid-0 VD on MR controller and below is performance

> > impact of this RFC. Looks like contention is on single <sdev>.

> >

> > I used command - "numactl -N 1  fio 1vd.fio --iodepth=128 --bs=4k --

> > rw=randread --cpus_allowed_policy=split --ioscheduler=none --

> > group_reporting --runtime=200 --numjobs=1"

> > IOPS without RFC = 300K IOPS with RFC = 230K.

> >

> > Perf top (shared host tag. IOPS = 230K)

> >

> > 13.98%  [kernel]        [k] sbitmap_any_bit_set

> >      6.43%  [kernel]        [k] blk_mq_run_hw_queue

> 

> blk_mq_run_hw_queue function take more CPU which is called from "

> scsi_end_request"


The problem could be that nr_hw_queues is increased a lot so that
sample on blk_mq_run_hw_queue() can be observed now.

> It looks like " blk_mq_hctx_has_pending" handles only elevator (scheduler)

> case. If  queue has ioscheduler=none, we can skip. I case of scheduler=none,

> IO will be pushed to hardware queue and it by pass software queue.

> Based on above understanding, I added below patch and I can see performance

> scale back to expectation.

> 

> Ming mentioned that - we cannot remove blk_mq_run_hw_queues() from IO

> completion path otherwise we may see IO hang. So I have just modified

> completion path assuming it is only required for IO scheduler case.

> https://www.spinics.net/lists/linux-block/msg55049.html

> 

> Please review and let me know if this is good or we have to address with

> proper fix.

> 

> diff --git a/block/blk-mq.c b/block/blk-mq.c

> index 1be7ac5a4040..b6a5b41b7fc2 100644

> --- a/block/blk-mq.c

> +++ b/block/blk-mq.c

> @@ -1559,6 +1559,9 @@ void blk_mq_run_hw_queues(struct request_queue *q,

> bool async)

>         struct blk_mq_hw_ctx *hctx;

>         int i;

> 

> +       if (!q->elevator)

> +               return;

> +


This way shouldn't be correct, blk_mq_run_hw_queues() is still needed for
none because request may not be dispatched successfully by direct issue.


Thanks,
Ming
Kashyap Desai July 21, 2020, 6:53 a.m. | #19
> > >

> > > Perf top (shared host tag. IOPS = 230K)

> > >

> > > 13.98%  [kernel]        [k] sbitmap_any_bit_set

> > >      6.43%  [kernel]        [k] blk_mq_run_hw_queue

> >

> > blk_mq_run_hw_queue function take more CPU which is called from "

> > scsi_end_request"

>

> The problem could be that nr_hw_queues is increased a lot so that sample

on
> blk_mq_run_hw_queue() can be observed now.


Yes. That is correct.

>

> > It looks like " blk_mq_hctx_has_pending" handles only elevator

> > (scheduler) case. If  queue has ioscheduler=none, we can skip. I case

> > of scheduler=none, IO will be pushed to hardware queue and it by pass

> software queue.

> > Based on above understanding, I added below patch and I can see

> > performance scale back to expectation.

> >

> > Ming mentioned that - we cannot remove blk_mq_run_hw_queues() from IO

> > completion path otherwise we may see IO hang. So I have just modified

> > completion path assuming it is only required for IO scheduler case.

> > https://www.spinics.net/lists/linux-block/msg55049.html

> >

> > Please review and let me know if this is good or we have to address

> > with proper fix.

> >

> > diff --git a/block/blk-mq.c b/block/blk-mq.c index

> > 1be7ac5a4040..b6a5b41b7fc2 100644

> > --- a/block/blk-mq.c

> > +++ b/block/blk-mq.c

> > @@ -1559,6 +1559,9 @@ void blk_mq_run_hw_queues(struct

> request_queue

> > *q, bool async)

> >         struct blk_mq_hw_ctx *hctx;

> >         int i;

> >

> > +       if (!q->elevator)

> > +               return;

> > +

>

> This way shouldn't be correct, blk_mq_run_hw_queues() is still needed

for
> none because request may not be dispatched successfully by direct issue.


When block layer attempt posting request to h/w queue directly (for
ioscheduler=none) and if it fails, it is calling
blk_mq_request_bypass_insert().
blk_mq_request_bypass_insert function will start the h/w queue from
submission context. Do we still have an issue if we skip running hw queue
from completion ?

Kashyap

>

>

> Thanks,

> Ming
Ming Lei July 22, 2020, 4:12 a.m. | #20
On Tue, Jul 21, 2020 at 12:23:39PM +0530, Kashyap Desai wrote:
> > > >

> > > > Perf top (shared host tag. IOPS = 230K)

> > > >

> > > > 13.98%  [kernel]        [k] sbitmap_any_bit_set

> > > >      6.43%  [kernel]        [k] blk_mq_run_hw_queue

> > >

> > > blk_mq_run_hw_queue function take more CPU which is called from "

> > > scsi_end_request"

> >

> > The problem could be that nr_hw_queues is increased a lot so that sample

> on

> > blk_mq_run_hw_queue() can be observed now.

> 

> Yes. That is correct.

> 

> >

> > > It looks like " blk_mq_hctx_has_pending" handles only elevator

> > > (scheduler) case. If  queue has ioscheduler=none, we can skip. I case

> > > of scheduler=none, IO will be pushed to hardware queue and it by pass

> > software queue.

> > > Based on above understanding, I added below patch and I can see

> > > performance scale back to expectation.

> > >

> > > Ming mentioned that - we cannot remove blk_mq_run_hw_queues() from IO

> > > completion path otherwise we may see IO hang. So I have just modified

> > > completion path assuming it is only required for IO scheduler case.

> > > https://www.spinics.net/lists/linux-block/msg55049.html

> > >

> > > Please review and let me know if this is good or we have to address

> > > with proper fix.

> > >

> > > diff --git a/block/blk-mq.c b/block/blk-mq.c index

> > > 1be7ac5a4040..b6a5b41b7fc2 100644

> > > --- a/block/blk-mq.c

> > > +++ b/block/blk-mq.c

> > > @@ -1559,6 +1559,9 @@ void blk_mq_run_hw_queues(struct

> > request_queue

> > > *q, bool async)

> > >         struct blk_mq_hw_ctx *hctx;

> > >         int i;

> > >

> > > +       if (!q->elevator)

> > > +               return;

> > > +

> >

> > This way shouldn't be correct, blk_mq_run_hw_queues() is still needed

> for

> > none because request may not be dispatched successfully by direct issue.

> 

> When block layer attempt posting request to h/w queue directly (for

> ioscheduler=none) and if it fails, it is calling

> blk_mq_request_bypass_insert().

> blk_mq_request_bypass_insert function will start the h/w queue from

> submission context. Do we still have an issue if we skip running hw queue

> from completion ?


The thing is that we can't guarantee that direct issue or adding request into
hctx->dispatch is always done for MQ/none, for example, request still
can be added to sw queue from blk_mq_flush_plug_list() when mq plug is
applied.

Also, I am not sure it is a good idea to add request into hctx->dispatch
via blk_mq_request_bypass_insert() in __blk_mq_try_issue_directly() in
case of running out of budget, because this way may hurt sequential IO
performance.

Thanks,
Ming
Kashyap Desai July 22, 2020, 5:30 a.m. | #21
> On Tue, Jul 21, 2020 at 12:23:39PM +0530, Kashyap Desai wrote:

> > > > >

> > > > > Perf top (shared host tag. IOPS = 230K)

> > > > >

> > > > > 13.98%  [kernel]        [k] sbitmap_any_bit_set

> > > > >      6.43%  [kernel]        [k] blk_mq_run_hw_queue

> > > >

> > > > blk_mq_run_hw_queue function take more CPU which is called from "

> > > > scsi_end_request"

> > >

> > > The problem could be that nr_hw_queues is increased a lot so that

> > > sample

> > on

> > > blk_mq_run_hw_queue() can be observed now.

> >

> > Yes. That is correct.

> >

> > >

> > > > It looks like " blk_mq_hctx_has_pending" handles only elevator

> > > > (scheduler) case. If  queue has ioscheduler=none, we can skip. I

> > > > case of scheduler=none, IO will be pushed to hardware queue and it

> > > > by pass

> > > software queue.

> > > > Based on above understanding, I added below patch and I can see

> > > > performance scale back to expectation.

> > > >

> > > > Ming mentioned that - we cannot remove blk_mq_run_hw_queues()

> from

> > > > IO completion path otherwise we may see IO hang. So I have just

> > > > modified completion path assuming it is only required for IO

scheduler
> case.

> > > > https://www.spinics.net/lists/linux-block/msg55049.html

> > > >

> > > > Please review and let me know if this is good or we have to

> > > > address with proper fix.

> > > >

> > > > diff --git a/block/blk-mq.c b/block/blk-mq.c index

> > > > 1be7ac5a4040..b6a5b41b7fc2 100644

> > > > --- a/block/blk-mq.c

> > > > +++ b/block/blk-mq.c

> > > > @@ -1559,6 +1559,9 @@ void blk_mq_run_hw_queues(struct

> > > request_queue

> > > > *q, bool async)

> > > >         struct blk_mq_hw_ctx *hctx;

> > > >         int i;

> > > >

> > > > +       if (!q->elevator)

> > > > +               return;

> > > > +

> > >

> > > This way shouldn't be correct, blk_mq_run_hw_queues() is still

> > > needed

> > for

> > > none because request may not be dispatched successfully by direct

issue.
> >

> > When block layer attempt posting request to h/w queue directly (for

> > ioscheduler=none) and if it fails, it is calling

> > blk_mq_request_bypass_insert().

> > blk_mq_request_bypass_insert function will start the h/w queue from

> > submission context. Do we still have an issue if we skip running hw

> > queue from completion ?

>

> The thing is that we can't guarantee that direct issue or adding request

into
> hctx->dispatch is always done for MQ/none, for example, request still

> can be added to sw queue from blk_mq_flush_plug_list() when mq plug is

> applied.


I see even blk_mq_sched_insert_requests() from blk_mq_flush_plug_list make
sure it run the h/w queue. If all the submission path which deals with s/w
queue make sure they run h/w queue, can't we remove blk_mq_run_hw_queues()
from scsi_end_request ?

>

> Also, I am not sure it is a good idea to add request into hctx->dispatch

via
> blk_mq_request_bypass_insert() in __blk_mq_try_issue_directly() in case

of
> running out of budget, because this way may hurt sequential IO

performance.
>

> Thanks,

> Ming
Ming Lei July 22, 2020, 8:04 a.m. | #22
On Wed, Jul 22, 2020 at 11:00:45AM +0530, Kashyap Desai wrote:
> > On Tue, Jul 21, 2020 at 12:23:39PM +0530, Kashyap Desai wrote:

> > > > > >

> > > > > > Perf top (shared host tag. IOPS = 230K)

> > > > > >

> > > > > > 13.98%  [kernel]        [k] sbitmap_any_bit_set

> > > > > >      6.43%  [kernel]        [k] blk_mq_run_hw_queue

> > > > >

> > > > > blk_mq_run_hw_queue function take more CPU which is called from "

> > > > > scsi_end_request"

> > > >

> > > > The problem could be that nr_hw_queues is increased a lot so that

> > > > sample

> > > on

> > > > blk_mq_run_hw_queue() can be observed now.

> > >

> > > Yes. That is correct.

> > >

> > > >

> > > > > It looks like " blk_mq_hctx_has_pending" handles only elevator

> > > > > (scheduler) case. If  queue has ioscheduler=none, we can skip. I

> > > > > case of scheduler=none, IO will be pushed to hardware queue and it

> > > > > by pass

> > > > software queue.

> > > > > Based on above understanding, I added below patch and I can see

> > > > > performance scale back to expectation.

> > > > >

> > > > > Ming mentioned that - we cannot remove blk_mq_run_hw_queues()

> > from

> > > > > IO completion path otherwise we may see IO hang. So I have just

> > > > > modified completion path assuming it is only required for IO

> scheduler

> > case.

> > > > > https://www.spinics.net/lists/linux-block/msg55049.html

> > > > >

> > > > > Please review and let me know if this is good or we have to

> > > > > address with proper fix.

> > > > >

> > > > > diff --git a/block/blk-mq.c b/block/blk-mq.c index

> > > > > 1be7ac5a4040..b6a5b41b7fc2 100644

> > > > > --- a/block/blk-mq.c

> > > > > +++ b/block/blk-mq.c

> > > > > @@ -1559,6 +1559,9 @@ void blk_mq_run_hw_queues(struct

> > > > request_queue

> > > > > *q, bool async)

> > > > >         struct blk_mq_hw_ctx *hctx;

> > > > >         int i;

> > > > >

> > > > > +       if (!q->elevator)

> > > > > +               return;

> > > > > +

> > > >

> > > > This way shouldn't be correct, blk_mq_run_hw_queues() is still

> > > > needed

> > > for

> > > > none because request may not be dispatched successfully by direct

> issue.

> > >

> > > When block layer attempt posting request to h/w queue directly (for

> > > ioscheduler=none) and if it fails, it is calling

> > > blk_mq_request_bypass_insert().

> > > blk_mq_request_bypass_insert function will start the h/w queue from

> > > submission context. Do we still have an issue if we skip running hw

> > > queue from completion ?

> >

> > The thing is that we can't guarantee that direct issue or adding request

> into

> > hctx->dispatch is always done for MQ/none, for example, request still

> > can be added to sw queue from blk_mq_flush_plug_list() when mq plug is

> > applied.

> 

> I see even blk_mq_sched_insert_requests() from blk_mq_flush_plug_list make

> sure it run the h/w queue. If all the submission path which deals with s/w

> queue make sure they run h/w queue, can't we remove blk_mq_run_hw_queues()

> from scsi_end_request ?


No, one purpose of blk_mq_run_hw_queues() is for rerun queue in case that
dispatch budget is running out of in submission path, and sdev->device_busy is
shared by all hw queues on this scsi device.

I posted one patch for avoiding it in scsi_end_request() before, looks it
never lands upstream:

https://lore.kernel.org/linux-block/20191118100640.3673-1-ming.lei@redhat.com/

Thanks,
Ming
John Garry July 22, 2020, 9:32 a.m. | #23
>>>>>>

>>>>>> diff --git a/block/blk-mq.c b/block/blk-mq.c index

>>>>>> 1be7ac5a4040..b6a5b41b7fc2 100644

>>>>>> --- a/block/blk-mq.c

>>>>>> +++ b/block/blk-mq.c

>>>>>> @@ -1559,6 +1559,9 @@ void blk_mq_run_hw_queues(struct

>>>>> request_queue

>>>>>> *q, bool async)

>>>>>>          struct blk_mq_hw_ctx *hctx;

>>>>>>          int i;

>>>>>>

>>>>>> +       if (!q->elevator)

>>>>>> +               return;

>>>>>> +

>>>>> This way shouldn't be correct, blk_mq_run_hw_queues() is still

>>>>> needed


Could the logic of blk_mq_run_hw_queue() -> blk_mq_hctx_has_pending() -> 
sbitmap_any_bit_set(&hctx->ctx_map) be optimised for megaraid scenario?

As I see, since megaraid will have 1:1 mapping of CPU to hw queue, will 
there only ever possibly a single bit set in ctx_map? If so, it seems a 
waste to always check every sbitmap map. But adding logic for this may 
negate any possible gains.

>>>> for

>>>>> none because request may not be dispatched successfully by direct

>> issue.

>>>> When block layer attempt posting request to h/w queue directly (for

>>>> ioscheduler=none) and if it fails, it is calling

>>>> blk_mq_request_bypass_insert().

>>>> blk_mq_request_bypass_insert function will start the h/w queue from

>>>> submission context. Do we still have an issue if we skip running hw

>>>> queue from completion ?

>>> The thing is that we can't guarantee that direct issue or adding request

>> into

>>> hctx->dispatch is always done for MQ/none, for example, request still

>>> can be added to sw queue from blk_mq_flush_plug_list() when mq plug is

>>> applied.

>> I see even blk_mq_sched_insert_requests() from blk_mq_flush_plug_list make

>> sure it run the h/w queue. If all the submission path which deals with s/w

>> queue make sure they run h/w queue, can't we remove blk_mq_run_hw_queues()

>> from scsi_end_request ?

> No, one purpose of blk_mq_run_hw_queues() is for rerun queue in case that

> dispatch budget is running out of in submission path, and sdev->device_busy is

> shared by all hw queues on this scsi device.

> 

> I posted one patch for avoiding it in scsi_end_request() before, looks it

> never lands upstream:

> 


I saw that you actually posted the v3:
https://lore.kernel.org/linux-scsi/BL0PR2101MB11230C5F70151037B23C0C35CE2D0@BL0PR2101MB1123.namprd21.prod.outlook.com/
And it no longer applies, due to the changes in scsi_mq_get_budget(), I 
think, which look non-trivial. Any chance to repost?

Thanks,
John
Ming Lei July 23, 2020, 2:07 p.m. | #24
On Wed, Jul 22, 2020 at 10:32:41AM +0100, John Garry wrote:
> > > > > > > 

> > > > > > > diff --git a/block/blk-mq.c b/block/blk-mq.c index

> > > > > > > 1be7ac5a4040..b6a5b41b7fc2 100644

> > > > > > > --- a/block/blk-mq.c

> > > > > > > +++ b/block/blk-mq.c

> > > > > > > @@ -1559,6 +1559,9 @@ void blk_mq_run_hw_queues(struct

> > > > > > request_queue

> > > > > > > *q, bool async)

> > > > > > >          struct blk_mq_hw_ctx *hctx;

> > > > > > >          int i;

> > > > > > > 

> > > > > > > +       if (!q->elevator)

> > > > > > > +               return;

> > > > > > > +

> > > > > > This way shouldn't be correct, blk_mq_run_hw_queues() is still

> > > > > > needed

> 

> Could the logic of blk_mq_run_hw_queue() -> blk_mq_hctx_has_pending() ->

> sbitmap_any_bit_set(&hctx->ctx_map) be optimised for megaraid scenario?

> 

> As I see, since megaraid will have 1:1 mapping of CPU to hw queue, will

> there only ever possibly a single bit set in ctx_map? If so, it seems a

> waste to always check every sbitmap map. But adding logic for this may

> negate any possible gains.


It really depends on min and max cpu id in the map, then sbitmap
depth can be reduced to (max - min + 1). I'd suggest to double check that
cost of sbitmap_any_bit_set() really matters.

> 

> > > > > for

> > > > > > none because request may not be dispatched successfully by direct

> > > issue.

> > > > > When block layer attempt posting request to h/w queue directly (for

> > > > > ioscheduler=none) and if it fails, it is calling

> > > > > blk_mq_request_bypass_insert().

> > > > > blk_mq_request_bypass_insert function will start the h/w queue from

> > > > > submission context. Do we still have an issue if we skip running hw

> > > > > queue from completion ?

> > > > The thing is that we can't guarantee that direct issue or adding request

> > > into

> > > > hctx->dispatch is always done for MQ/none, for example, request still

> > > > can be added to sw queue from blk_mq_flush_plug_list() when mq plug is

> > > > applied.

> > > I see even blk_mq_sched_insert_requests() from blk_mq_flush_plug_list make

> > > sure it run the h/w queue. If all the submission path which deals with s/w

> > > queue make sure they run h/w queue, can't we remove blk_mq_run_hw_queues()

> > > from scsi_end_request ?

> > No, one purpose of blk_mq_run_hw_queues() is for rerun queue in case that

> > dispatch budget is running out of in submission path, and sdev->device_busy is

> > shared by all hw queues on this scsi device.

> > 

> > I posted one patch for avoiding it in scsi_end_request() before, looks it

> > never lands upstream:

> > 

> 

> I saw that you actually posted the v3:

> https://lore.kernel.org/linux-scsi/BL0PR2101MB11230C5F70151037B23C0C35CE2D0@BL0PR2101MB1123.namprd21.prod.outlook.com/

> And it no longer applies, due to the changes in scsi_mq_get_budget(), I

> think, which look non-trivial. Any chance to repost?


OK, will post V4.


thanks,
Ming
John Garry July 23, 2020, 5:29 p.m. | #25
>> As I see, since megaraid will have 1:1 mapping of CPU to hw queue, will

>> there only ever possibly a single bit set in ctx_map? If so, it seems a

>> waste to always check every sbitmap map. But adding logic for this may

>> negate any possible gains.

> 

> It really depends on min and max cpu id in the map, then sbitmap

> depth can be reduced to (max - min + 1). I'd suggest to double check that

> cost of sbitmap_any_bit_set() really matters.


Hi Ming,

I'm not sure that reducing the search range would help much, as we still 
need to load some indexes of map[], and at best this may be reduced from 
2/3 -> 1 elements, depending on nr_cpus.

> 

>>

>>>>>> for

>>>>>>> none because request may not be dispatched successfully by direct

>>>> issue.

>>>>>> When block layer attempt posting request to h/w queue directly (for

>>>>>> ioscheduler=none) and if it fails, it is calling

>>>>>> blk_mq_request_bypass_insert().

>>>>>> blk_mq_request_bypass_insert function will start the h/w queue from

>>>>>> submission context. Do we still have an issue if we skip running hw

>>>>>> queue from completion ?

>>>>> The thing is that we can't guarantee that direct issue or adding request

>>>> into

>>>>> hctx->dispatch is always done for MQ/none, for example, request still

>>>>> can be added to sw queue from blk_mq_flush_plug_list() when mq plug is

>>>>> applied.

>>>> I see even blk_mq_sched_insert_requests() from blk_mq_flush_plug_list make

>>>> sure it run the h/w queue. If all the submission path which deals with s/w

>>>> queue make sure they run h/w queue, can't we remove blk_mq_run_hw_queues()

>>>> from scsi_end_request ?

>>> No, one purpose of blk_mq_run_hw_queues() is for rerun queue in case that

>>> dispatch budget is running out of in submission path, and sdev->device_busy is

>>> shared by all hw queues on this scsi device.

>>>

>>> I posted one patch for avoiding it in scsi_end_request() before, looks it

>>> never lands upstream:

>>>

>>

>> I saw that you actually posted the v3:

>> https://lore.kernel.org/linux-scsi/BL0PR2101MB11230C5F70151037B23C0C35CE2D0@BL0PR2101MB1123.namprd21.prod.outlook.com/

>> And it no longer applies, due to the changes in scsi_mq_get_budget(), I

>> think, which look non-trivial. Any chance to repost?

> 

> OK, will post V4.


Thanks!
Ming Lei July 24, 2020, 2:47 a.m. | #26
On Thu, Jul 23, 2020 at 06:29:01PM +0100, John Garry wrote:
> > > As I see, since megaraid will have 1:1 mapping of CPU to hw queue, will

> > > there only ever possibly a single bit set in ctx_map? If so, it seems a

> > > waste to always check every sbitmap map. But adding logic for this may

> > > negate any possible gains.

> > 

> > It really depends on min and max cpu id in the map, then sbitmap

> > depth can be reduced to (max - min + 1). I'd suggest to double check that

> > cost of sbitmap_any_bit_set() really matters.

> 

> Hi Ming,

> 

> I'm not sure that reducing the search range would help much, as we still

> need to load some indexes of map[], and at best this may be reduced from 2/3

> -> 1 elements, depending on nr_cpus.


I believe you misunderstood my idea, and you have to think it from implementation
viewpoint.

The only workable way is to store the min cpu id as 'offset' and set the sbitmap
depth as (max - min + 1), isn't it? Then the actual cpu id can be figured out via
'offset' + nr_bit. And the whole indexes are just spread on the actual depth. BTW,
max & min is the max / min cpu id in hctx->cpu_map. So we can improve not only on 1:1,
and I guess most of MQ cases can benefit from the change, since it shouldn't be usual
for one ctx_map to cover both 0 & nr_cpu_id - 1.

Meantime, we need to allocate the sbitmap dynamically.


Thanks,
Ming
John Garry July 28, 2020, 7:54 a.m. | #27
On 24/07/2020 03:47, Ming Lei wrote:
> On Thu, Jul 23, 2020 at 06:29:01PM +0100, John Garry wrote:

>>>> As I see, since megaraid will have 1:1 mapping of CPU to hw queue, will

>>>> there only ever possibly a single bit set in ctx_map? If so, it seems a

>>>> waste to always check every sbitmap map. But adding logic for this may

>>>> negate any possible gains.

>>>

>>> It really depends on min and max cpu id in the map, then sbitmap

>>> depth can be reduced to (max - min + 1). I'd suggest to double check that

>>> cost of sbitmap_any_bit_set() really matters.

>>

>> Hi Ming,

>>

>> I'm not sure that reducing the search range would help much, as we still

>> need to load some indexes of map[], and at best this may be reduced from 2/3

>> -> 1 elements, depending on nr_cpus.

> 

> I believe you misunderstood my idea, and you have to think it from implementation

> viewpoint.

> 

> The only workable way is to store the min cpu id as 'offset' and set the sbitmap

> depth as (max - min + 1), isn't it? Then the actual cpu id can be figured out via

> 'offset' + nr_bit. And the whole indexes are just spread on the actual depth. BTW,

> max & min is the max / min cpu id in hctx->cpu_map. So we can improve not only on 1:1,

> and I guess most of MQ cases can benefit from the change, since it shouldn't be usual

> for one ctx_map to cover both 0 & nr_cpu_id - 1.

> 

> Meantime, we need to allocate the sbitmap dynamically.


OK, so dynamically allocating the sbitmap could be good. I was thinking 
previously that we still allocate for nr_cpus size, and search a limited 
range - but this would have heavier runtime overhead.

So if you really think that this may have some value, then let me know, 
so we can look to take it forward.

Thanks,
John
Kashyap Desai July 28, 2020, 8:01 a.m. | #28
> On Wed, Jul 22, 2020 at 11:00:45AM +0530, Kashyap Desai wrote:

> > > On Tue, Jul 21, 2020 at 12:23:39PM +0530, Kashyap Desai wrote:

> > > > > > >

> > > > > > > Perf top (shared host tag. IOPS = 230K)

> > > > > > >

> > > > > > > 13.98%  [kernel]        [k] sbitmap_any_bit_set

> > > > > > >      6.43%  [kernel]        [k] blk_mq_run_hw_queue

> > > > > >

> > > > > > blk_mq_run_hw_queue function take more CPU which is called

from
> "

> > > > > > scsi_end_request"

> > > > >

> > > > > The problem could be that nr_hw_queues is increased a lot so

> > > > > that sample

> > > > on

> > > > > blk_mq_run_hw_queue() can be observed now.

> > > >

> > > > Yes. That is correct.

> > > >

> > > > >

> > > > > > It looks like " blk_mq_hctx_has_pending" handles only elevator

> > > > > > (scheduler) case. If  queue has ioscheduler=none, we can skip.

> > > > > > I case of scheduler=none, IO will be pushed to hardware queue

> > > > > > and it by pass

> > > > > software queue.

> > > > > > Based on above understanding, I added below patch and I can

> > > > > > see performance scale back to expectation.

> > > > > >

> > > > > > Ming mentioned that - we cannot remove blk_mq_run_hw_queues()

> > > from

> > > > > > IO completion path otherwise we may see IO hang. So I have

> > > > > > just modified completion path assuming it is only required for

> > > > > > IO

> > scheduler

> > > case.

> > > > > > https://www.spinics.net/lists/linux-block/msg55049.html

> > > > > >

> > > > > > Please review and let me know if this is good or we have to

> > > > > > address with proper fix.

> > > > > >

> > > > > > diff --git a/block/blk-mq.c b/block/blk-mq.c index

> > > > > > 1be7ac5a4040..b6a5b41b7fc2 100644

> > > > > > --- a/block/blk-mq.c

> > > > > > +++ b/block/blk-mq.c

> > > > > > @@ -1559,6 +1559,9 @@ void blk_mq_run_hw_queues(struct

> > > > > request_queue

> > > > > > *q, bool async)

> > > > > >         struct blk_mq_hw_ctx *hctx;

> > > > > >         int i;

> > > > > >

> > > > > > +       if (!q->elevator)

> > > > > > +               return;

> > > > > > +

> > > > >

> > > > > This way shouldn't be correct, blk_mq_run_hw_queues() is still

> > > > > needed

> > > > for

> > > > > none because request may not be dispatched successfully by

> > > > > direct

> > issue.

> > > >

> > > > When block layer attempt posting request to h/w queue directly

> > > > (for

> > > > ioscheduler=none) and if it fails, it is calling

> > > > blk_mq_request_bypass_insert().

> > > > blk_mq_request_bypass_insert function will start the h/w queue

> > > > from submission context. Do we still have an issue if we skip

> > > > running hw queue from completion ?

> > >

> > > The thing is that we can't guarantee that direct issue or adding

> > > request

> > into

> > > hctx->dispatch is always done for MQ/none, for example, request

> > > hctx->still

> > > can be added to sw queue from blk_mq_flush_plug_list() when mq plug

> > > is applied.

> >

> > I see even blk_mq_sched_insert_requests() from blk_mq_flush_plug_list

> > make sure it run the h/w queue. If all the submission path which deals

> > with s/w queue make sure they run h/w queue, can't we remove

> > blk_mq_run_hw_queues() from scsi_end_request ?

>

> No, one purpose of blk_mq_run_hw_queues() is for rerun queue in case

that
> dispatch budget is running out of in submission path, and

sdev->device_busy
> is shared by all hw queues on this scsi device.

>

> I posted one patch for avoiding it in scsi_end_request() before, looks

it never
> lands upstream:

>

> https://lore.kernel.org/linux-block/20191118100640.3673-1-

> ming.lei@redhat.com/


Ming - I think above patch will fix the issue of performance on VD.
I fix some hunk failure and ported to 5.8 kernel -
I am testing this patch on my setup. If you post V4, I will use that.

So far looks good.  I have reduced device queue depth so that I hit budget
busy code path frequently.

Kashyap


>

> Thanks,

> Ming
Ming Lei July 28, 2020, 8:45 a.m. | #29
On Tue, Jul 28, 2020 at 08:54:27AM +0100, John Garry wrote:
> On 24/07/2020 03:47, Ming Lei wrote:

> > On Thu, Jul 23, 2020 at 06:29:01PM +0100, John Garry wrote:

> > > > > As I see, since megaraid will have 1:1 mapping of CPU to hw queue, will

> > > > > there only ever possibly a single bit set in ctx_map? If so, it seems a

> > > > > waste to always check every sbitmap map. But adding logic for this may

> > > > > negate any possible gains.

> > > > 

> > > > It really depends on min and max cpu id in the map, then sbitmap

> > > > depth can be reduced to (max - min + 1). I'd suggest to double check that

> > > > cost of sbitmap_any_bit_set() really matters.

> > > 

> > > Hi Ming,

> > > 

> > > I'm not sure that reducing the search range would help much, as we still

> > > need to load some indexes of map[], and at best this may be reduced from 2/3

> > > -> 1 elements, depending on nr_cpus.

> > 

> > I believe you misunderstood my idea, and you have to think it from implementation

> > viewpoint.

> > 

> > The only workable way is to store the min cpu id as 'offset' and set the sbitmap

> > depth as (max - min + 1), isn't it? Then the actual cpu id can be figured out via

> > 'offset' + nr_bit. And the whole indexes are just spread on the actual depth. BTW,

> > max & min is the max / min cpu id in hctx->cpu_map. So we can improve not only on 1:1,

> > and I guess most of MQ cases can benefit from the change, since it shouldn't be usual

> > for one ctx_map to cover both 0 & nr_cpu_id - 1.

> > 

> > Meantime, we need to allocate the sbitmap dynamically.

> 

> OK, so dynamically allocating the sbitmap could be good. I was thinking

> previously that we still allocate for nr_cpus size, and search a limited

> range - but this would have heavier runtime overhead.

> 

> So if you really think that this may have some value, then let me know, so

> we can look to take it forward.


Forget to mention, the in-tree code has been this shape for long
time, please see sbitmap_resize() called from blk_mq_map_swqueue().

Another update is that V4 of 'scsi: core: only re-run queue in scsi_end_request()
if device queue is busy' is quite hard to implement since commit b4fd63f42647110c9
("Revert "scsi: core: run queue if SCSI device queue isn't ready and queue is idle").


Thanks,
Ming
Kashyap Desai July 29, 2020, 5:25 a.m. | #30
> On Tue, Jul 28, 2020 at 08:54:27AM +0100, John Garry wrote:

> > On 24/07/2020 03:47, Ming Lei wrote:

> > > On Thu, Jul 23, 2020 at 06:29:01PM +0100, John Garry wrote:

> > > > > > As I see, since megaraid will have 1:1 mapping of CPU to hw

> > > > > > queue, will there only ever possibly a single bit set in

> > > > > > ctx_map? If so, it seems a waste to always check every sbitmap

> > > > > > map. But adding logic for this may negate any possible gains.

> > > > >

> > > > > It really depends on min and max cpu id in the map, then sbitmap

> > > > > depth can be reduced to (max - min + 1). I'd suggest to double

> > > > > check that cost of sbitmap_any_bit_set() really matters.

> > > >

> > > > Hi Ming,

> > > >

> > > > I'm not sure that reducing the search range would help much, as we

> > > > still need to load some indexes of map[], and at best this may be

> > > > reduced from 2/3

> > > > -> 1 elements, depending on nr_cpus.

> > >

> > > I believe you misunderstood my idea, and you have to think it from

> > > implementation viewpoint.

> > >

> > > The only workable way is to store the min cpu id as 'offset' and set

> > > the sbitmap depth as (max - min + 1), isn't it? Then the actual cpu

> > > id can be figured out via 'offset' + nr_bit. And the whole indexes

> > > are just spread on the actual depth. BTW, max & min is the max / min

> > > cpu id in hctx->cpu_map. So we can improve not only on 1:1, and I

> > > guess most of MQ cases can benefit from the change, since it

shouldn't be
> usual for one ctx_map to cover both 0 & nr_cpu_id - 1.

> > >

> > > Meantime, we need to allocate the sbitmap dynamically.

> >

> > OK, so dynamically allocating the sbitmap could be good. I was

> > thinking previously that we still allocate for nr_cpus size, and

> > search a limited range - but this would have heavier runtime overhead.

> >

> > So if you really think that this may have some value, then let me

> > know, so we can look to take it forward.

>

> Forget to mention, the in-tree code has been this shape for long time,

please
> see sbitmap_resize() called from blk_mq_map_swqueue().

>

> Another update is that V4 of 'scsi: core: only re-run queue in

> scsi_end_request() if device queue is busy' is quite hard to implement

since
> commit b4fd63f42647110c9 ("Revert "scsi: core: run queue if SCSI device

> queue isn't ready and queue is idle").


Ming -

Update from my testing. I found only one case of IO stall. I can discuss
this specific topic if you like to send separate patch. It is too much
interleaved discussion in this thread.

I noted you mentioned that V4 of 'scsi: core: only re-run queue in
scsi_end_request() if device queue is busy' need underlying support of
"scsi: core: run queue if SCSI device queue isn't ready and queue is idle"
patch which is already reverted in mainline.
Overall idea of running h/w queues conditionally in your patch " scsi:
core: only re-run queue in scsi_end_request" is still worth. There can be
some race if we use this patch and for that you have concern. Am I
correct. ?

One of the race I found in my testing is fixed by below patch -

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 54f9015..bcfd33a 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -173,8 +173,10 @@ static int blk_mq_do_dispatch_ctx(struct
blk_mq_hw_ctx *hctx)
                if (!sbitmap_any_bit_set(&hctx->ctx_map))
                        break;

-               if (!blk_mq_get_dispatch_budget(hctx))
+               if (!blk_mq_get_dispatch_budget(hctx)) {
+                       blk_mq_delay_run_hw_queue(hctx,
BLK_MQ_BUDGET_DELAY);
                        break;
+               }

                rq = blk_mq_dequeue_from_ctx(hctx, ctx);
                if (!rq) {


In my test setup, I have your V3 'scsi: core: only re-run queue in
scsi_end_request() if device queue is busy' rebased on 5.8 which does not
have
"scsi: core: run queue if SCSI device queue isn't ready and queue is idle"
since it is already reverted in mainline.

I have 24 SAS SSD and I reduced QD = 16 so that I hit budget contention
frequently.  I am running ioscheduler=none.
If hctx0 has 16 IO inflight (All those IO will be posted to h/q queue
directly). Next IO (17th IO) will see budget contention and it will be
queued into s/w queue.
It is expected that queue will be kicked from scsi_end_request. It is
possible that one of the  IO completed and it reduce sdev->device_busy,
but it has not yet reach the code which will kicked the h/w queue.
Releasing budget and restarting h/w queue is not atomic. At the same time,
another IO (18th IO) from submission path get the budget and it will be
posted from below path. This IO will reset sdev->restart and it will not
allow h/w queue to be restarted from completion path. This will lead one
IO in s/w queue pending forever if there are no more new IOs.
blk_mq_sched_insert_requests
	->blk_mq_try_issue_list_directly().

I found that overall idea and attempt in recent block layer code is to
restart h/w queue from submission path in delayed context.
Making it less dependent on queue restart from completion path and/or even
though completion path restart the queue, there are certain race in
submission path which must be fixed by self-restarting h/w queue.

If we catch all such instances in submission path (as posted above in my
patch), we can still consider your patch
'scsi: core: only re-run queue in scsi_end_request() if device queue is
busy'.

Thanks, Kashyap

>

>

> Thanks,

> Ming
Ming Lei July 29, 2020, 3:36 p.m. | #31
On Wed, Jul 29, 2020 at 10:55:22AM +0530, Kashyap Desai wrote:
> > On Tue, Jul 28, 2020 at 08:54:27AM +0100, John Garry wrote:

> > > On 24/07/2020 03:47, Ming Lei wrote:

> > > > On Thu, Jul 23, 2020 at 06:29:01PM +0100, John Garry wrote:

> > > > > > > As I see, since megaraid will have 1:1 mapping of CPU to hw

> > > > > > > queue, will there only ever possibly a single bit set in

> > > > > > > ctx_map? If so, it seems a waste to always check every sbitmap

> > > > > > > map. But adding logic for this may negate any possible gains.

> > > > > >

> > > > > > It really depends on min and max cpu id in the map, then sbitmap

> > > > > > depth can be reduced to (max - min + 1). I'd suggest to double

> > > > > > check that cost of sbitmap_any_bit_set() really matters.

> > > > >

> > > > > Hi Ming,

> > > > >

> > > > > I'm not sure that reducing the search range would help much, as we

> > > > > still need to load some indexes of map[], and at best this may be

> > > > > reduced from 2/3

> > > > > -> 1 elements, depending on nr_cpus.

> > > >

> > > > I believe you misunderstood my idea, and you have to think it from

> > > > implementation viewpoint.

> > > >

> > > > The only workable way is to store the min cpu id as 'offset' and set

> > > > the sbitmap depth as (max - min + 1), isn't it? Then the actual cpu

> > > > id can be figured out via 'offset' + nr_bit. And the whole indexes

> > > > are just spread on the actual depth. BTW, max & min is the max / min

> > > > cpu id in hctx->cpu_map. So we can improve not only on 1:1, and I

> > > > guess most of MQ cases can benefit from the change, since it

> shouldn't be

> > usual for one ctx_map to cover both 0 & nr_cpu_id - 1.

> > > >

> > > > Meantime, we need to allocate the sbitmap dynamically.

> > >

> > > OK, so dynamically allocating the sbitmap could be good. I was

> > > thinking previously that we still allocate for nr_cpus size, and

> > > search a limited range - but this would have heavier runtime overhead.

> > >

> > > So if you really think that this may have some value, then let me

> > > know, so we can look to take it forward.

> >

> > Forget to mention, the in-tree code has been this shape for long time,

> please

> > see sbitmap_resize() called from blk_mq_map_swqueue().

> >

> > Another update is that V4 of 'scsi: core: only re-run queue in

> > scsi_end_request() if device queue is busy' is quite hard to implement

> since

> > commit b4fd63f42647110c9 ("Revert "scsi: core: run queue if SCSI device

> > queue isn't ready and queue is idle").

> 

> Ming -

> 

> Update from my testing. I found only one case of IO stall. I can discuss

> this specific topic if you like to send separate patch. It is too much

> interleaved discussion in this thread.

> 

> I noted you mentioned that V4 of 'scsi: core: only re-run queue in

> scsi_end_request() if device queue is busy' need underlying support of

> "scsi: core: run queue if SCSI device queue isn't ready and queue is idle"

> patch which is already reverted in mainline.


Right.

> Overall idea of running h/w queues conditionally in your patch " scsi:

> core: only re-run queue in scsi_end_request" is still worth. There can be


I agree.

> some race if we use this patch and for that you have concern. Am I

> correct. ?


If the patch of "scsi: core: run queue if SCSI device queue isn't ready and
queue is idle" is re-added, the approach should work. However, it looks a bit
complicated, and I was thinking if one simpler approach can be figured out.

> 

> One of the race I found in my testing is fixed by below patch -

> 

> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c

> index 54f9015..bcfd33a 100644

> --- a/block/blk-mq-sched.c

> +++ b/block/blk-mq-sched.c

> @@ -173,8 +173,10 @@ static int blk_mq_do_dispatch_ctx(struct

> blk_mq_hw_ctx *hctx)

>                 if (!sbitmap_any_bit_set(&hctx->ctx_map))

>                         break;

> 

> -               if (!blk_mq_get_dispatch_budget(hctx))

> +               if (!blk_mq_get_dispatch_budget(hctx)) {

> +                       blk_mq_delay_run_hw_queue(hctx,

> BLK_MQ_BUDGET_DELAY);

>                         break;

> +               }


Actually all hw queues need to be run, instead of this hctx, cause
the budget stuff is request queue wide.

> 

>                 rq = blk_mq_dequeue_from_ctx(hctx, ctx);

>                 if (!rq) {

> 

> 

> In my test setup, I have your V3 'scsi: core: only re-run queue in

> scsi_end_request() if device queue is busy' rebased on 5.8 which does not

> have

> "scsi: core: run queue if SCSI device queue isn't ready and queue is idle"

> since it is already reverted in mainline.


If you added the above patch, I believe you can remove the run queue in
scsi_end_request() unconditionally. However, the delay run queue may
degrade io performance.

Actually the re-run queue in scsi_end_request() is only for dequeuing
request from sw/scheduler queue. And no such issue if request stays in
hctx->dispatch list.

> 

> I have 24 SAS SSD and I reduced QD = 16 so that I hit budget contention

> frequently.  I am running ioscheduler=none.

> If hctx0 has 16 IO inflight (All those IO will be posted to h/q queue

> directly). Next IO (17th IO) will see budget contention and it will be

> queued into s/w queue.

> It is expected that queue will be kicked from scsi_end_request. It is

> possible that one of the  IO completed and it reduce sdev->device_busy,

> but it has not yet reach the code which will kicked the h/w queue.

> Releasing budget and restarting h/w queue is not atomic. At the same time,

> another IO (18th IO) from submission path get the budget and it will be

> posted from below path. This IO will reset sdev->restart and it will not

> allow h/w queue to be restarted from completion path. This will lead one


Maybe re-run queue is needed before resetting sdev->restart if sdev->restart is 1.


Thanks, 
Ming
Kashyap Desai July 29, 2020, 6:31 p.m. | #32
> > >

> > > Another update is that V4 of 'scsi: core: only re-run queue in

> > > scsi_end_request() if device queue is busy' is quite hard to

> > > implement

> > since

> > > commit b4fd63f42647110c9 ("Revert "scsi: core: run queue if SCSI

> > > device queue isn't ready and queue is idle").

> >

> > Ming -

> >

> > Update from my testing. I found only one case of IO stall. I can

> > discuss this specific topic if you like to send separate patch. It is

> > too much interleaved discussion in this thread.

> >

> > I noted you mentioned that V4 of 'scsi: core: only re-run queue in

> > scsi_end_request() if device queue is busy' need underlying support of

> > "scsi: core: run queue if SCSI device queue isn't ready and queue is

idle"
> > patch which is already reverted in mainline.

>

> Right.

>

> > Overall idea of running h/w queues conditionally in your patch " scsi:

> > core: only re-run queue in scsi_end_request" is still worth. There can

> > be

>

> I agree.

>

> > some race if we use this patch and for that you have concern. Am I

> > correct. ?

>

> If the patch of "scsi: core: run queue if SCSI device queue isn't ready

and queue
> is idle" is re-added, the approach should work.

I could not find issue in " scsi: core: only re-run queue in
scsi_end_request" even though above mentioned patch is reverted.
There may be some corner cases/race condition in submission path which can
be fixed doing self-restart of h/w queue.

> However, it looks a bit

> complicated, and I was thinking if one simpler approach can be figured

out.

I was thinking your original approach is simple, but if you think some
other simple approach I can test as part of these series.
BTW, I am still not getting why you think your original approach is not
good design.

>

> >

> > One of the race I found in my testing is fixed by below patch -

> >

> > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index

> > 54f9015..bcfd33a 100644

> > --- a/block/blk-mq-sched.c

> > +++ b/block/blk-mq-sched.c

> > @@ -173,8 +173,10 @@ static int blk_mq_do_dispatch_ctx(struct

> > blk_mq_hw_ctx *hctx)

> >                 if (!sbitmap_any_bit_set(&hctx->ctx_map))

> >                         break;

> >

> > -               if (!blk_mq_get_dispatch_budget(hctx))

> > +               if (!blk_mq_get_dispatch_budget(hctx)) {

> > +                       blk_mq_delay_run_hw_queue(hctx,

> > BLK_MQ_BUDGET_DELAY);

> >                         break;

> > +               }

>

> Actually all hw queues need to be run, instead of this hctx, cause the

budget
> stuff is request queue wide.



OK. But I thought all the hctx will see issue independently, if they are
active and they will restart its own hctx queue.
BTW, do you think above handling in block layer code make sense
irrespective of current h/w queue restart logic OR it is just relative
stuffs ?

>

> >

> >                 rq = blk_mq_dequeue_from_ctx(hctx, ctx);

> >                 if (!rq) {

> >

> >

> > In my test setup, I have your V3 'scsi: core: only re-run queue in

> > scsi_end_request() if device queue is busy' rebased on 5.8 which does

> > not have

> > "scsi: core: run queue if SCSI device queue isn't ready and queue is

idle"
> > since it is already reverted in mainline.

>

> If you added the above patch, I believe you can remove the run queue in

> scsi_end_request() unconditionally. However, the delay run queue may

> degrade io performance.


I understood.  But that performance issue is due to budget contention and
may impact some old HBA(less queue depth) or emulation HBA.
That is why I thought your patch of conditional h/w run from completion
would improve performance.

>

> Actually the re-run queue in scsi_end_request() is only for dequeuing

request
> from sw/scheduler queue. And no such issue if request stays in

> hctx->dispatch list.


I was not aware of this. Thanks for info.  I will review the code for my
own understanding.
>

> >

> > I have 24 SAS SSD and I reduced QD = 16 so that I hit budget

> > contention frequently.  I am running ioscheduler=none.

> > If hctx0 has 16 IO inflight (All those IO will be posted to h/q queue

> > directly). Next IO (17th IO) will see budget contention and it will be

> > queued into s/w queue.

> > It is expected that queue will be kicked from scsi_end_request. It is

> > possible that one of the  IO completed and it reduce

> > sdev->device_busy, but it has not yet reach the code which will kicked

the
> h/w queue.

> > Releasing budget and restarting h/w queue is not atomic. At the same

> > time, another IO (18th IO) from submission path get the budget and it

> > will be posted from below path. This IO will reset sdev->restart and

> > it will not allow h/w queue to be restarted from completion path. This

> > will lead one

>

> Maybe re-run queue is needed before resetting sdev->restart if

sdev->restart
> is 1.


Agree.

>

>

> Thanks,

> Ming
Ming Lei Aug. 4, 2020, 8:36 a.m. | #33
On Thu, Jul 30, 2020 at 12:01:22AM +0530, Kashyap Desai wrote:
> > > >

> > > > Another update is that V4 of 'scsi: core: only re-run queue in

> > > > scsi_end_request() if device queue is busy' is quite hard to

> > > > implement

> > > since

> > > > commit b4fd63f42647110c9 ("Revert "scsi: core: run queue if SCSI

> > > > device queue isn't ready and queue is idle").

> > >

> > > Ming -

> > >

> > > Update from my testing. I found only one case of IO stall. I can

> > > discuss this specific topic if you like to send separate patch. It is

> > > too much interleaved discussion in this thread.

> > >

> > > I noted you mentioned that V4 of 'scsi: core: only re-run queue in

> > > scsi_end_request() if device queue is busy' need underlying support of

> > > "scsi: core: run queue if SCSI device queue isn't ready and queue is

> idle"

> > > patch which is already reverted in mainline.

> >

> > Right.

> >

> > > Overall idea of running h/w queues conditionally in your patch " scsi:

> > > core: only re-run queue in scsi_end_request" is still worth. There can

> > > be

> >

> > I agree.

> >

> > > some race if we use this patch and for that you have concern. Am I

> > > correct. ?

> >

> > If the patch of "scsi: core: run queue if SCSI device queue isn't ready

> and queue

> > is idle" is re-added, the approach should work.

> I could not find issue in " scsi: core: only re-run queue in

> scsi_end_request" even though above mentioned patch is reverted.

> There may be some corner cases/race condition in submission path which can

> be fixed doing self-restart of h/w queue.


It is because two corner cases are handled in other ways:

    Now that we have the patches ("blk-mq: In blk_mq_dispatch_rq_list()
    "no budget" is a reason to kick") and ("blk-mq: Rerun dispatching in
    the case of budget contention") we should no longer need the fix in
    the SCSI code.  Revert it, resolving conflicts with other patches that
    have touched this code.

> 

> > However, it looks a bit

> > complicated, and I was thinking if one simpler approach can be figured

> out.

> 

> I was thinking your original approach is simple, but if you think some

> other simple approach I can test as part of these series.

> BTW, I am still not getting why you think your original approach is not

> good design.


It is still not straightforward enough or simple enough for proving its
correctness, even though the implementation isn't complicated.

> 

> >

> > >

> > > One of the race I found in my testing is fixed by below patch -

> > >

> > > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index

> > > 54f9015..bcfd33a 100644

> > > --- a/block/blk-mq-sched.c

> > > +++ b/block/blk-mq-sched.c

> > > @@ -173,8 +173,10 @@ static int blk_mq_do_dispatch_ctx(struct

> > > blk_mq_hw_ctx *hctx)

> > >                 if (!sbitmap_any_bit_set(&hctx->ctx_map))

> > >                         break;

> > >

> > > -               if (!blk_mq_get_dispatch_budget(hctx))

> > > +               if (!blk_mq_get_dispatch_budget(hctx)) {

> > > +                       blk_mq_delay_run_hw_queue(hctx,

> > > BLK_MQ_BUDGET_DELAY);

> > >                         break;

> > > +               }

> >

> > Actually all hw queues need to be run, instead of this hctx, cause the

> budget

> > stuff is request queue wide.

> 

> 

> OK. But I thought all the hctx will see issue independently, if they are

> active and they will restart its own hctx queue.

> BTW, do you think above handling in block layer code make sense

> irrespective of current h/w queue restart logic OR it is just relative

> stuffs ?


You are right, it is correct to just run this hctx.

> 

> >

> > >

> > >                 rq = blk_mq_dequeue_from_ctx(hctx, ctx);

> > >                 if (!rq) {

> > >

> > >

> > > In my test setup, I have your V3 'scsi: core: only re-run queue in

> > > scsi_end_request() if device queue is busy' rebased on 5.8 which does

> > > not have

> > > "scsi: core: run queue if SCSI device queue isn't ready and queue is

> idle"

> > > since it is already reverted in mainline.

> >

> > If you added the above patch, I believe you can remove the run queue in

> > scsi_end_request() unconditionally. However, the delay run queue may

> > degrade io performance.

> 

> I understood.  But that performance issue is due to budget contention and

> may impact some old HBA(less queue depth) or emulation HBA.


Your patch for delay running hw queue causes delay once one request
is completed, and the queue should have been run immediately after one
request is finished.

> That is why I thought your patch of conditional h/w run from completion

> would improve performance.


Yeah, we all think that way is correct thing to do, and now the problem
is how to run hw queue just in case of budget contention.


thanks, 
Ming
Kashyap Desai Aug. 4, 2020, 9:27 a.m. | #34
> >

> > > However, it looks a bit

> > > complicated, and I was thinking if one simpler approach can be

> > > figured

> > out.

> >

> > I was thinking your original approach is simple, but if you think some

> > other simple approach I can test as part of these series.

> > BTW, I am still not getting why you think your original approach is

> > not good design.

>

> It is still not straightforward enough or simple enough for proving its

> correctness, even though the implementation isn't complicated.


Ming -

I noted your comments.

I have completed testing and this particular latest performance issue on
Volume is outstanding.
Currently it is 20-25% performance drop in IOPs and we want that to be
closed before shared host tag is enabled for <megaraid_sas> driver.
Just for my understanding - What will be the next steps on this ?

I can validate any new approach/patch for this issue.

Kashyap

>

> >

> > >

> > > >
John Garry Aug. 4, 2020, 5 p.m. | #35
On 28/07/2020 09:45, Ming Lei wrote:
>> OK, so dynamically allocating the sbitmap could be good. I was thinking

>> previously that we still allocate for nr_cpus size, and search a limited

>> range - but this would have heavier runtime overhead.

>>

>> So if you really think that this may have some value, then let me know, so

>> we can look to take it forward.


Hi Ming,

> Forget to mention, the in-tree code has been this shape for long

> time, please see sbitmap_resize() called from blk_mq_map_swqueue().


So after the resize, even if we are only checking a single word and a 
few bits within that word, we still need 2x 64b loads - 1x for .word and 
1x for .cleared. Seems a bit inefficient for caching when we have a 1:1 
mapping or similar. For 1:1 case only, how about a ctx_map per queue for 
all hctx, with a single bit per hctx? I do realize that it makes the 
code more complicated, but it could be more efficient.

Another thing to consider is that for ctx_map, we don't do deferred bit 
clear, so we don't ever really need to check .cleared there. I think.

Thanks,
John
Ming Lei Aug. 5, 2020, 2:56 a.m. | #36
On Tue, Aug 04, 2020 at 06:00:52PM +0100, John Garry wrote:
> On 28/07/2020 09:45, Ming Lei wrote:

> > > OK, so dynamically allocating the sbitmap could be good. I was thinking

> > > previously that we still allocate for nr_cpus size, and search a limited

> > > range - but this would have heavier runtime overhead.

> > > 

> > > So if you really think that this may have some value, then let me know, so

> > > we can look to take it forward.

> 

> Hi Ming,

> 

> > Forget to mention, the in-tree code has been this shape for long

> > time, please see sbitmap_resize() called from blk_mq_map_swqueue().

> 

> So after the resize, even if we are only checking a single word and a few

> bits within that word, we still need 2x 64b loads - 1x for .word and 1x for

> .cleared. Seems a bit inefficient for caching when we have a 1:1 mapping or

> similar. For 1:1 case only, how about a ctx_map per queue for all hctx, with

> a single bit per hctx? I do realize that it makes the code more complicated,

> but it could be more efficient.


IMO, the cost for accessing one bit and one word is basically same.

> 

> Another thing to consider is that for ctx_map, we don't do deferred bit

> clear, so we don't ever really need to check .cleared there. I think.


That looks true.

However, in case of MQ, the normal code path is direct issue, not sure if
we need this kind of optimization.

BTW, no matter if hostwide tags is used or not, the problem is in always
run-queue from scsi_end_request(). As we discussed, the re-run queue is
only needed in case of budget contention.


Thanks,
Ming
Ming Lei Aug. 5, 2020, 8:40 a.m. | #37
On Tue, Aug 04, 2020 at 02:57:52PM +0530, Kashyap Desai wrote:
> > >

> > > > However, it looks a bit

> > > > complicated, and I was thinking if one simpler approach can be

> > > > figured

> > > out.

> > >

> > > I was thinking your original approach is simple, but if you think some

> > > other simple approach I can test as part of these series.

> > > BTW, I am still not getting why you think your original approach is

> > > not good design.

> >

> > It is still not straightforward enough or simple enough for proving its

> > correctness, even though the implementation isn't complicated.

> 

> Ming -

> 

> I noted your comments.

> 

> I have completed testing and this particular latest performance issue on

> Volume is outstanding.

> Currently it is 20-25% performance drop in IOPs and we want that to be

> closed before shared host tag is enabled for <megaraid_sas> driver.

> Just for my understanding - What will be the next steps on this ?

> 

> I can validate any new approach/patch for this issue.

> 


Hello,

What do you think of the following patch?

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index c866a4f33871..49f0fc5c7a63 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -552,8 +552,24 @@ static void scsi_run_queue_async(struct scsi_device *sdev)
 	if (scsi_target(sdev)->single_lun ||
 	    !list_empty(&sdev->host->starved_list))
 		kblockd_schedule_work(&sdev->requeue_work);
-	else
-		blk_mq_run_hw_queues(sdev->request_queue, true);
+	else {
+		/*
+		 * smp_mb() implied in either rq->end_io or blk_mq_free_request
+		 * is for ordering writing .device_busy in scsi_device_unbusy()
+		 * and reading sdev->restarts.
+		 */
+		int old = atomic_read(&sdev->restarts);
+
+		if (old) {
+			blk_mq_run_hw_queues(sdev->request_queue, true);
+
+			/*
+			 * ->restarts has to be kept as non-zero if there is
+			 *  new budget contention comes.
+			 */
+			atomic_cmpxchg(&sdev->restarts, old, 0);
+		}
+	}
 }
 
 /* Returns false when no more bytes to process, true if there are more */
@@ -1612,8 +1628,34 @@ static void scsi_mq_put_budget(struct request_queue *q)
 static bool scsi_mq_get_budget(struct request_queue *q)
 {
 	struct scsi_device *sdev = q->queuedata;
+	int ret = scsi_dev_queue_ready(q, sdev);
 
-	return scsi_dev_queue_ready(q, sdev);
+	if (ret)
+		return true;
+
+	/*
+	 * If all in-flight requests originated from this LUN are completed
+	 * before setting .restarts, sdev->device_busy will be observed as
+	 * zero, then blk_mq_delay_run_hw_queue() will dispatch this request
+	 * soon. Otherwise, completion of one of these request will observe
+	 * the .restarts flag, and the request queue will be run for handling
+	 * this request, see scsi_end_request().
+	 */
+	atomic_inc(&sdev->restarts);
+
+	/*
+	 * Order writing .restarts and reading .device_busy, and make sure
+	 * .restarts is visible to scsi_end_request(). Its pair is implied by
+	 * __blk_mq_end_request() in scsi_end_request() for ordering
+	 * writing .device_busy in scsi_device_unbusy() and reading .restarts.
+	 *
+	 */
+	smp_mb__after_atomic();
+
+	if (unlikely(atomic_read(&sdev->device_busy) == 0 &&
+				!scsi_device_blocked(sdev)))
+		blk_mq_delay_run_hw_queues(sdev->request_queue, SCSI_QUEUE_DELAY);
+	return false;
 }
 
 static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index bc5909033d13..1a5c9a3df6d6 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -109,6 +109,7 @@ struct scsi_device {
 	atomic_t device_busy;		/* commands actually active on LLDD */
 	atomic_t device_blocked;	/* Device returned QUEUE_FULL. */
 
+	atomic_t restarts;
 	spinlock_t list_lock;
 	struct list_head starved_entry;
 	unsigned short queue_depth;	/* How deep of a queue we want */

Thanks, 
Ming
Kashyap Desai Aug. 6, 2020, 10:25 a.m. | #38
> > Ming -

> >

> > I noted your comments.

> >

> > I have completed testing and this particular latest performance issue

> > on Volume is outstanding.

> > Currently it is 20-25% performance drop in IOPs and we want that to be

> > closed before shared host tag is enabled for <megaraid_sas> driver.

> > Just for my understanding - What will be the next steps on this ?

> >

> > I can validate any new approach/patch for this issue.

> >

>

> Hello,

>

> What do you think of the following patch?


I tested this patch. I still see IO hang.

>

> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index

> c866a4f33871..49f0fc5c7a63 100644

> --- a/drivers/scsi/scsi_lib.c

> +++ b/drivers/scsi/scsi_lib.c

> @@ -552,8 +552,24 @@ static void scsi_run_queue_async(struct scsi_device

> *sdev)

>  	if (scsi_target(sdev)->single_lun ||

>  	    !list_empty(&sdev->host->starved_list))

>  		kblockd_schedule_work(&sdev->requeue_work);

> -	else

> -		blk_mq_run_hw_queues(sdev->request_queue, true);

> +	else {

> +		/*

> +		 * smp_mb() implied in either rq->end_io or

> blk_mq_free_request

> +		 * is for ordering writing .device_busy in

scsi_device_unbusy()
> +		 * and reading sdev->restarts.

> +		 */

> +		int old = atomic_read(&sdev->restarts);

> +

> +		if (old) {

> +			blk_mq_run_hw_queues(sdev->request_queue, true);

> +

> +			/*

> +			 * ->restarts has to be kept as non-zero if there

is
> +			 *  new budget contention comes.

> +			 */

> +			atomic_cmpxchg(&sdev->restarts, old, 0);

> +		}

> +	}

>  }

>

>  /* Returns false when no more bytes to process, true if there are more

*/
> @@ -1612,8 +1628,34 @@ static void scsi_mq_put_budget(struct

> request_queue *q)  static bool scsi_mq_get_budget(struct request_queue

*q)
> {

>  	struct scsi_device *sdev = q->queuedata;

> +	int ret = scsi_dev_queue_ready(q, sdev);

>

> -	return scsi_dev_queue_ready(q, sdev);

> +	if (ret)

> +		return true;

> +

> +	/*

> +	 * If all in-flight requests originated from this LUN are

completed
> +	 * before setting .restarts, sdev->device_busy will be observed as

> +	 * zero, then blk_mq_delay_run_hw_queue() will dispatch this

request
> +	 * soon. Otherwise, completion of one of these request will

observe
> +	 * the .restarts flag, and the request queue will be run for

handling
> +	 * this request, see scsi_end_request().

> +	 */

> +	atomic_inc(&sdev->restarts);

> +

> +	/*

> +	 * Order writing .restarts and reading .device_busy, and make sure

> +	 * .restarts is visible to scsi_end_request(). Its pair is implied

by
> +	 * __blk_mq_end_request() in scsi_end_request() for ordering

> +	 * writing .device_busy in scsi_device_unbusy() and reading

.restarts.
> +	 *

> +	 */

> +	smp_mb__after_atomic();

> +

> +	if (unlikely(atomic_read(&sdev->device_busy) == 0 &&

> +				!scsi_device_blocked(sdev)))

> +		blk_mq_delay_run_hw_queues(sdev->request_queue,

> SCSI_QUEUE_DELAY);


Hi Ming -

There is still some race which is not handled.  Take a case of IO is not
able to get budget and it has already marked <restarts> flag.
<restarts> flag will be seen non-zero in completion path and completion
path will attempt h/w queue run. (But this particular IO is still not in
s/w queue.).
Attempt of running h/w queue from completion path will not flush any IO
since there is no IO in s/w queue.

I think above code is added assuming it should manage this particular
case, but this code also does not help. If some IO in between submitted
directly to the h/w queue then sdev->device_busy will be non-zero.
	
If I move above section of the code into completion path, IO hang is
resolved.  I also verify performance -

Multi Drive R0 	1 workers per VD gives	662K prior to this patch and now
It scale to 1.1M IOPs. (90% improvement)
Multi Drive R0  4 workers per VD gives 1.9M prior to this patch and now It
scale to 3.1M IOPs. (50% improvement)

Here is modified patch -

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 6f50e5c..dcdc5f6 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -594,8 +594,26 @@ static bool scsi_end_request(struct request *req,
blk_status_t error,
        if (scsi_target(sdev)->single_lun ||
            !list_empty(&sdev->host->starved_list))
                kblockd_schedule_work(&sdev->requeue_work);
-       else
-               blk_mq_run_hw_queues(q, true);
+       else {
+               /*
+                * smp_mb() implied in either rq->end_io or
blk_mq_free_request
+                * is for ordering writing .device_busy in
scsi_device_unbusy()
+                * and reading sdev->restarts.
+                */
+               int old = atomic_read(&sdev->restarts);
+
+               if (old) {
+                       blk_mq_run_hw_queues(sdev->request_queue, true);
+
+                       /*
+                        * ->restarts has to be kept as non-zero if there
is
+                        *  new budget contention comes.
+                        */
+                       atomic_cmpxchg(&sdev->restarts, old, 0);
+               } else if (unlikely(atomic_read(&sdev->device_busy) == 0
&&
+                                       !scsi_device_blocked(sdev)))
+                       blk_mq_delay_run_hw_queues(sdev->request_queue,
SCSI_QUEUE_DELAY);
+       }

        percpu_ref_put(&q->q_usage_counter);
        return false;
@@ -1615,8 +1633,31 @@ static bool scsi_mq_get_budget(struct blk_mq_hw_ctx
*hctx)
 {
        struct request_queue *q = hctx->queue;
        struct scsi_device *sdev = q->queuedata;
+       int ret = scsi_dev_queue_ready(q, sdev);
+
+       if (ret)
+               return true;

-       return scsi_dev_queue_ready(q, sdev);
+       /*
+        * If all in-flight requests originated from this LUN are
completed
+        * before setting .restarts, sdev->device_busy will be observed as
+        * zero, then blk_mq_delay_run_hw_queue() will dispatch this
request
+        * soon. Otherwise, completion of one of these request will
observe
+        * the .restarts flag, and the request queue will be run for
handling
+        * this request, see scsi_end_request().
+        */
+       atomic_inc(&sdev->restarts);
+
+       /*
+        * Order writing .restarts and reading .device_busy, and make sure
+        * .restarts is visible to scsi_end_request(). Its pair is implied
by
+        * __blk_mq_end_request() in scsi_end_request() for ordering
+        * writing .device_busy in scsi_device_unbusy() and reading
.restarts.
+        *
+        */
+       smp_mb__after_atomic();
+
+       return false;
 }

 static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index bc59090..ac45058 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -108,7 +108,8 @@ struct scsi_device {

        atomic_t device_busy;           /* commands actually active on
LLDD */
        atomic_t device_blocked;        /* Device returned QUEUE_FULL. */
-
+
+       atomic_t restarts;
        spinlock_t list_lock;
        struct list_head starved_entry;
        unsigned short queue_depth;     /* How deep of a queue we want */
Ming Lei Aug. 6, 2020, 1:38 p.m. | #39
On Thu, Aug 06, 2020 at 03:55:50PM +0530, Kashyap Desai wrote:
> > > Ming -

> > >

> > > I noted your comments.

> > >

> > > I have completed testing and this particular latest performance issue

> > > on Volume is outstanding.

> > > Currently it is 20-25% performance drop in IOPs and we want that to be

> > > closed before shared host tag is enabled for <megaraid_sas> driver.

> > > Just for my understanding - What will be the next steps on this ?

> > >

> > > I can validate any new approach/patch for this issue.

> > >

> >

> > Hello,

> >

> > What do you think of the following patch?

> 

> I tested this patch. I still see IO hang.

> 

> >

> > diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index

> > c866a4f33871..49f0fc5c7a63 100644

> > --- a/drivers/scsi/scsi_lib.c

> > +++ b/drivers/scsi/scsi_lib.c

> > @@ -552,8 +552,24 @@ static void scsi_run_queue_async(struct scsi_device

> > *sdev)

> >  	if (scsi_target(sdev)->single_lun ||

> >  	    !list_empty(&sdev->host->starved_list))

> >  		kblockd_schedule_work(&sdev->requeue_work);

> > -	else

> > -		blk_mq_run_hw_queues(sdev->request_queue, true);

> > +	else {

> > +		/*

> > +		 * smp_mb() implied in either rq->end_io or

> > blk_mq_free_request

> > +		 * is for ordering writing .device_busy in

> scsi_device_unbusy()

> > +		 * and reading sdev->restarts.

> > +		 */

> > +		int old = atomic_read(&sdev->restarts);

> > +

> > +		if (old) {

> > +			blk_mq_run_hw_queues(sdev->request_queue, true);

> > +

> > +			/*

> > +			 * ->restarts has to be kept as non-zero if there

> is

> > +			 *  new budget contention comes.

> > +			 */

> > +			atomic_cmpxchg(&sdev->restarts, old, 0);

> > +		}

> > +	}

> >  }

> >

> >  /* Returns false when no more bytes to process, true if there are more

> */

> > @@ -1612,8 +1628,34 @@ static void scsi_mq_put_budget(struct

> > request_queue *q)  static bool scsi_mq_get_budget(struct request_queue

> *q)

> > {

> >  	struct scsi_device *sdev = q->queuedata;

> > +	int ret = scsi_dev_queue_ready(q, sdev);

> >

> > -	return scsi_dev_queue_ready(q, sdev);

> > +	if (ret)

> > +		return true;

> > +

> > +	/*

> > +	 * If all in-flight requests originated from this LUN are

> completed

> > +	 * before setting .restarts, sdev->device_busy will be observed as

> > +	 * zero, then blk_mq_delay_run_hw_queue() will dispatch this

> request

> > +	 * soon. Otherwise, completion of one of these request will

> observe

> > +	 * the .restarts flag, and the request queue will be run for

> handling

> > +	 * this request, see scsi_end_request().

> > +	 */

> > +	atomic_inc(&sdev->restarts);

> > +

> > +	/*

> > +	 * Order writing .restarts and reading .device_busy, and make sure

> > +	 * .restarts is visible to scsi_end_request(). Its pair is implied

> by

> > +	 * __blk_mq_end_request() in scsi_end_request() for ordering

> > +	 * writing .device_busy in scsi_device_unbusy() and reading

> .restarts.

> > +	 *

> > +	 */

> > +	smp_mb__after_atomic();

> > +

> > +	if (unlikely(atomic_read(&sdev->device_busy) == 0 &&

> > +				!scsi_device_blocked(sdev)))

> > +		blk_mq_delay_run_hw_queues(sdev->request_queue,

> > SCSI_QUEUE_DELAY);

> 

> Hi Ming -

> 

> There is still some race which is not handled.  Take a case of IO is not

> able to get budget and it has already marked <restarts> flag.

> <restarts> flag will be seen non-zero in completion path and completion

> path will attempt h/w queue run. (But this particular IO is still not in

> s/w queue.).

> Attempt of running h/w queue from completion path will not flush any IO

> since there is no IO in s/w queue.


Then where is the IO to be submitted in case of running out of budget?

Any IO request which is going to be added to hctx->dispatch, the queue will be
re-run via blk-mq core.

Any IO request being issued directly when running out of budget will be
insert to hctx->dispatch or sw/scheduler queue, will be run in the
submission path.


Thanks, 
Ming
Kashyap Desai Aug. 6, 2020, 2:37 p.m. | #40
> > Hi Ming -

> >

> > There is still some race which is not handled.  Take a case of IO is

> > not able to get budget and it has already marked <restarts> flag.

> > <restarts> flag will be seen non-zero in completion path and

> > completion path will attempt h/w queue run. (But this particular IO is

> > still not in s/w queue.).

> > Attempt of running h/w queue from completion path will not flush any

> > IO since there is no IO in s/w queue.

>

> Then where is the IO to be submitted in case of running out of budget?


Typical race in your latest patch is - (Lets consider command A,B and C)
Command A did not receive budget. Command B completed  (which was already
submitted earlier) at the same time and it make sdev->device_busy = 0 from
" scsi_finish_command".
Command B has still not called "scsi_end_request". Command C get the
budget and it will make sdev->device_busy = 1. Now, Command A set  set
sdev->restarts flags but will not run h/w queue since sdev->device_busy =
1.
Command B run h/w queue (make sdev->restart = 0) from completion path, but
command -A is still not in the s/w queue. Command-A is in now in s/w
queue. Command-C completed but it will not run h/w queue because
sdev->restarts = 0.


>

> Any IO request which is going to be added to hctx->dispatch, the queue

will be
> re-run via blk-mq core.

>

> Any IO request being issued directly when running out of budget will be

insert
> to hctx->dispatch or sw/scheduler queue, will be run in the submission

path.

I have *not* included below changes we discussed in my testing - If I
include below patch, it is correct that queue will be run in submission
path (at least the path which is impacted in my testing). You have already
mentioned that most of the submission path has fix now in latest kernel
w.r.t running h/w queue from submission path.  Below path is missing for
running h/w queue from submission path.

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index
54f9015..bcfd33a 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -173,8 +173,10 @@ static int blk_mq_do_dispatch_ctx(struct
blk_mq_hw_ctx *hctx)
                if (!sbitmap_any_bit_set(&hctx->ctx_map))
                        break;

-               if (!blk_mq_get_dispatch_budget(hctx))
+               if (!blk_mq_get_dispatch_budget(hctx)) {
+                       blk_mq_delay_run_hw_queue(hctx,
+ BLK_MQ_BUDGET_DELAY);
                        break;
+               }

                rq = blk_mq_dequeue_from_ctx(hctx, ctx);
                if (!rq) {

Are you saying above fix should be included along with your latest patch ?

>

>

> Thanks,

> Ming
Ming Lei Aug. 6, 2020, 3:29 p.m. | #41
On Thu, Aug 06, 2020 at 08:07:38PM +0530, Kashyap Desai wrote:
> > > Hi Ming -

> > >

> > > There is still some race which is not handled.  Take a case of IO is

> > > not able to get budget and it has already marked <restarts> flag.

> > > <restarts> flag will be seen non-zero in completion path and

> > > completion path will attempt h/w queue run. (But this particular IO is

> > > still not in s/w queue.).

> > > Attempt of running h/w queue from completion path will not flush any

> > > IO since there is no IO in s/w queue.

> >

> > Then where is the IO to be submitted in case of running out of budget?

> 

> Typical race in your latest patch is - (Lets consider command A,B and C)

> Command A did not receive budget. Command B completed  (which was already


Command A doesn't get budget, and A is still in sw/scheduler queue
because we try to acquire budget before dequeuing request from sw/scheduler queue,
see __blk_mq_do_dispatch_sched() and blk_mq_do_dispatch_ctx().

Not consider direct issue, because the hw queue will be run explicitly
when not getting budget, see __blk_mq_try_issue_directly.

Not consider command A being added to hctx->dispatch too, because blk-mq will
re-run the queue, see blk_mq_dispatch_rq_list().


> submitted earlier) at the same time and it make sdev->device_busy = 0 from

> " scsi_finish_command".

> Command B has still not called "scsi_end_request". Command C get the

> budget and it will make sdev->device_busy = 1. Now, Command A set  set

> sdev->restarts flags but will not run h/w queue since sdev->device_busy =

> 1.


Right.

> Command B run h/w queue (make sdev->restart = 0) from completion path, but

> command -A is still not in the s/w queue.


Then you didn't answer my question about where A is, did you?

> Command-A is in now in s/w queue. Command-C completed but it will not run h/w queue because

> sdev->restarts = 0.


Why does command-A become in sw/queue now?

> 

> 

> >

> > Any IO request which is going to be added to hctx->dispatch, the queue

> will be

> > re-run via blk-mq core.

> >

> > Any IO request being issued directly when running out of budget will be

> insert

> > to hctx->dispatch or sw/scheduler queue, will be run in the submission

> path.

> 

> I have *not* included below changes we discussed in my testing - If I

> include below patch, it is correct that queue will be run in submission

> path (at least the path which is impacted in my testing). You have already

> mentioned that most of the submission path has fix now in latest kernel

> w.r.t running h/w queue from submission path.  Below path is missing for

> running h/w queue from submission path.

> 

> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index

> 54f9015..bcfd33a 100644

> --- a/block/blk-mq-sched.c

> +++ b/block/blk-mq-sched.c

> @@ -173,8 +173,10 @@ static int blk_mq_do_dispatch_ctx(struct

> blk_mq_hw_ctx *hctx)

>                 if (!sbitmap_any_bit_set(&hctx->ctx_map))

>                         break;

> 

> -               if (!blk_mq_get_dispatch_budget(hctx))

> +               if (!blk_mq_get_dispatch_budget(hctx)) {

> +                       blk_mq_delay_run_hw_queue(hctx,

> + BLK_MQ_BUDGET_DELAY);

>                         break;

> +               }

> 

>                 rq = blk_mq_dequeue_from_ctx(hctx, ctx);

>                 if (!rq) {

> 

> Are you saying above fix should be included along with your latest patch ?


No.


Thanks,
Ming
Kashyap Desai Aug. 8, 2020, 7:05 p.m. | #42
> On Thu, Aug 06, 2020 at 08:07:38PM +0530, Kashyap Desai wrote:

> > > > Hi Ming -

> > > >

> > > > There is still some race which is not handled.  Take a case of IO

> > > > is not able to get budget and it has already marked <restarts>

flag.
> > > > <restarts> flag will be seen non-zero in completion path and

> > > > completion path will attempt h/w queue run. (But this particular

> > > > IO is still not in s/w queue.).

> > > > Attempt of running h/w queue from completion path will not flush

> > > > any IO since there is no IO in s/w queue.

> > >

> > > Then where is the IO to be submitted in case of running out of

budget?
> >

> > Typical race in your latest patch is - (Lets consider command A,B and

> > C) Command A did not receive budget. Command B completed  (which was

> > already

>

> Command A doesn't get budget, and A is still in sw/scheduler queue

because
> we try to acquire budget before dequeuing request from sw/scheduler

queue,
> see __blk_mq_do_dispatch_sched() and blk_mq_do_dispatch_ctx().

>

> Not consider direct issue, because the hw queue will be run explicitly

when
> not getting budget, see __blk_mq_try_issue_directly.

>

> Not consider command A being added to hctx->dispatch too, because blk-mq

> will re-run the queue, see blk_mq_dispatch_rq_list().


Ming -

After going through your comment (I noted your comment and thanks for
correcting my understanding.) and block layer code, I realize that it is a
different race condition. My previous explanation was not accurate.
I debug further and figure out what is actually happening - Consider below
scenario/sequence -

Thread -1 - Detected budget contention. Set restarts = 1.
Thread -2 - old restarts = 1. start hw queue.
Thread -3 - old restarts = 1. start hw queue.
Thread -2 - move restarts = 0.
In my testing, I noticed that both thread-2 and thread-3 started h/w queue
but there was no work for them to do. It is possible because some other
context of h/w queue run might have done that job.
It means, IO of thread-1 is already posted.
Thread -4 - Detected budget contention. Set restart = 1 (because thread-2
has move restarts=0).
Thread -3 - move restarts = 0 (because this thread see old value = 1 but
that is actually updated one more time by thread-4 and theread-4 actually
wanted to run h/w queues). IO of Thread-4 will not be scheduled.

We have to make sure that completion IO path do atomic_cmpxchng of
restarts flag before running the h/w queue.  Below code change - (main fix
is sequence of atomic_cmpxchg and blk_mq_run_hw_queues) fix the issue.

--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -594,8 +594,27 @@ static bool scsi_end_request(struct request *req,
blk_status_t error,
        if (scsi_target(sdev)->single_lun ||
            !list_empty(&sdev->host->starved_list))
                kblockd_schedule_work(&sdev->requeue_work);
-       else
-               blk_mq_run_hw_queues(q, true);
+       else {
+               /*
+                * smp_mb() implied in either rq->end_io or
blk_mq_free_request
+                * is for ordering writing .device_busy in
scsi_device_unbusy()
+                * and reading sdev->restarts.
+                */
+               int old = atomic_read(&sdev->restarts);
+
+               if (old) {
+                       /*
+                        * ->restarts has to be kept as non-zero if there
is
+                        *  new budget contention comes.
+                        */
+                       atomic_cmpxchg(&sdev->restarts, old, 0);
+
+                       /* run the queue after restarts flag is updated
+                        * to avoid race condition with .get_budget
+                        */
+                       blk_mq_run_hw_queues(sdev->request_queue, true);
+               }
+       }

        percpu_ref_put(&q->q_usage_counter);
        return false;

Kashyap
Ming Lei Aug. 9, 2020, 2:16 a.m. | #43
On Sun, Aug 09, 2020 at 12:35:21AM +0530, Kashyap Desai wrote:
> > On Thu, Aug 06, 2020 at 08:07:38PM +0530, Kashyap Desai wrote:

> > > > > Hi Ming -

> > > > >

> > > > > There is still some race which is not handled.  Take a case of IO

> > > > > is not able to get budget and it has already marked <restarts>

> flag.

> > > > > <restarts> flag will be seen non-zero in completion path and

> > > > > completion path will attempt h/w queue run. (But this particular

> > > > > IO is still not in s/w queue.).

> > > > > Attempt of running h/w queue from completion path will not flush

> > > > > any IO since there is no IO in s/w queue.

> > > >

> > > > Then where is the IO to be submitted in case of running out of

> budget?

> > >

> > > Typical race in your latest patch is - (Lets consider command A,B and

> > > C) Command A did not receive budget. Command B completed  (which was

> > > already

> >

> > Command A doesn't get budget, and A is still in sw/scheduler queue

> because

> > we try to acquire budget before dequeuing request from sw/scheduler

> queue,

> > see __blk_mq_do_dispatch_sched() and blk_mq_do_dispatch_ctx().

> >

> > Not consider direct issue, because the hw queue will be run explicitly

> when

> > not getting budget, see __blk_mq_try_issue_directly.

> >

> > Not consider command A being added to hctx->dispatch too, because blk-mq

> > will re-run the queue, see blk_mq_dispatch_rq_list().

> 

> Ming -

> 

> After going through your comment (I noted your comment and thanks for

> correcting my understanding.) and block layer code, I realize that it is a

> different race condition. My previous explanation was not accurate.

> I debug further and figure out what is actually happening - Consider below

> scenario/sequence -

> 

> Thread -1 - Detected budget contention. Set restarts = 1.

> Thread -2 - old restarts = 1. start hw queue.

> Thread -3 - old restarts = 1. start hw queue.

> Thread -2 - move restarts = 0.

> In my testing, I noticed that both thread-2 and thread-3 started h/w queue

> but there was no work for them to do. It is possible because some other

> context of h/w queue run might have done that job.


It should be true, because there is other run queue somewhere, such as
blk-mq's restart or delay run queue.

> It means, IO of thread-1 is already posted.


OK.

> Thread -4 - Detected budget contention. Set restart = 1 (because thread-2

> has move restarts=0).


OK.

> Thread -3 - move restarts = 0 (because this thread see old value = 1 but

> that is actually updated one more time by thread-4 and theread-4 actually

> wanted to run h/w queues). IO of Thread-4 will not be scheduled.


Right.

> 

> We have to make sure that completion IO path do atomic_cmpxchng of

> restarts flag before running the h/w queue.  Below code change - (main fix

> is sequence of atomic_cmpxchg and blk_mq_run_hw_queues) fix the issue.

> 

> --- a/drivers/scsi/scsi_lib.c

> +++ b/drivers/scsi/scsi_lib.c

> @@ -594,8 +594,27 @@ static bool scsi_end_request(struct request *req,

> blk_status_t error,

>         if (scsi_target(sdev)->single_lun ||

>             !list_empty(&sdev->host->starved_list))

>                 kblockd_schedule_work(&sdev->requeue_work);

> -       else

> -               blk_mq_run_hw_queues(q, true);

> +       else {

> +               /*

> +                * smp_mb() implied in either rq->end_io or

> blk_mq_free_request

> +                * is for ordering writing .device_busy in

> scsi_device_unbusy()

> +                * and reading sdev->restarts.

> +                */

> +               int old = atomic_read(&sdev->restarts);

> +

> +               if (old) {

> +                       /*

> +                        * ->restarts has to be kept as non-zero if there

> is

> +                        *  new budget contention comes.

> +                        */

> +                       atomic_cmpxchg(&sdev->restarts, old, 0);

> +

> +                       /* run the queue after restarts flag is updated

> +                        * to avoid race condition with .get_budget

> +                        */

> +                       blk_mq_run_hw_queues(sdev->request_queue, true);

> +               }

> +       }

> 


I think the above change is right, and this patter is basically same with SCHED_RESTART
used in blk_mq_sched_restart().

BTW, could you run your function & performance test against the following new version?
Then I can include your test result in commit log for moving on.


From 06993ddf5c5dbe0e772cc38342919eb61a57bc50 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>

Date: Wed, 5 Aug 2020 16:35:53 +0800
Subject: [PATCH] scsi: core: only re-run queue in scsi_end_request() if device
 queue is busy

Now the request queue is run in scsi_end_request() unconditionally if both
target queue and host queue is ready. We should have re-run request queue
only after this device queue becomes busy for restarting this LUN only.

Recently Long Li reported that cost of run queue may be very heavy in
case of high queue depth. So improve this situation by only running
the request queue when this LUN is busy.

Cc: Jens Axboe <axboe@kernel.dk>
Cc: Ewan D. Milne <emilne@redhat.com>
Cc: Kashyap Desai <kashyap.desai@broadcom.com>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Bart Van Assche <bvanassche@acm.org>
Cc: Damien Le Moal <damien.lemoal@wdc.com>
Cc: Long Li <longli@microsoft.com>
Reported-by: Long Li <longli@microsoft.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>

---
V4:
	- not clear .restarts in get_budget(), instead clearing it
	after re-run queue is done; Kashyap figured out we have to
	update ->restarts before re-run queue in scsi_run_queue_async().

V3:
	- add one smp_mb() in scsi_mq_get_budget() and comment

V2:
	- commit log change, no any code change
	- add reported-by tag

Signed-off-by: Ming Lei <ming.lei@redhat.com>

---
 drivers/scsi/scsi_lib.c    | 51 +++++++++++++++++++++++++++++++++++---
 include/scsi/scsi_device.h |  1 +
 2 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index c866a4f33871..d083250f9518 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -552,8 +552,27 @@ static void scsi_run_queue_async(struct scsi_device *sdev)
 	if (scsi_target(sdev)->single_lun ||
 	    !list_empty(&sdev->host->starved_list))
 		kblockd_schedule_work(&sdev->requeue_work);
-	else
-		blk_mq_run_hw_queues(sdev->request_queue, true);
+	else {
+		/*
+		 * smp_mb() implied in either rq->end_io or blk_mq_free_request
+		 * is for ordering writing .device_busy in scsi_device_unbusy()
+		 * and reading sdev->restarts.
+		 */
+		int old = atomic_read(&sdev->restarts);
+
+		if (old) {
+			/*
+			 * ->restarts has to be kept as non-zero if there is
+			 *  new budget contention comes.
+			 *
+			 *  No need to run queue when either another re-run
+			 *  queue wins in updating ->restarts or one new budget
+			 *  contention comes.
+			 */
+			if (atomic_cmpxchg(&sdev->restarts, old, 0) == old)
+				blk_mq_run_hw_queues(sdev->request_queue, true);
+		}
+	}
 }
 
 /* Returns false when no more bytes to process, true if there are more */
@@ -1612,8 +1631,34 @@ static void scsi_mq_put_budget(struct request_queue *q)
 static bool scsi_mq_get_budget(struct request_queue *q)
 {
 	struct scsi_device *sdev = q->queuedata;
+	int ret = scsi_dev_queue_ready(q, sdev);
+
+	if (ret)
+		return true;
+
+	atomic_inc(&sdev->restarts);
 
-	return scsi_dev_queue_ready(q, sdev);
+	/*
+	 * Order writing .restarts and reading .device_busy, and make sure
+	 * .restarts is visible to scsi_end_request(). Its pair is implied by
+	 * __blk_mq_end_request() in scsi_end_request() for ordering
+	 * writing .device_busy in scsi_device_unbusy() and reading .restarts.
+	 *
+	 */
+	smp_mb__after_atomic();
+
+	/*
+	 * If all in-flight requests originated from this LUN are completed
+	 * before setting .restarts, sdev->device_busy will be observed as
+	 * zero, then blk_mq_delay_run_hw_queues() will dispatch this request
+	 * soon. Otherwise, completion of one of these request will observe
+	 * the .restarts flag, and the request queue will be run for handling
+	 * this request, see scsi_end_request().
+	 */
+	if (unlikely(atomic_read(&sdev->device_busy) == 0 &&
+				!scsi_device_blocked(sdev)))
+		blk_mq_delay_run_hw_queues(sdev->request_queue, SCSI_QUEUE_DELAY);
+	return false;
 }
 
 static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index bc5909033d13..1a5c9a3df6d6 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -109,6 +109,7 @@ struct scsi_device {
 	atomic_t device_busy;		/* commands actually active on LLDD */
 	atomic_t device_blocked;	/* Device returned QUEUE_FULL. */
 
+	atomic_t restarts;
 	spinlock_t list_lock;
 	struct list_head starved_entry;
 	unsigned short queue_depth;	/* How deep of a queue we want */
-- 
2.25.2



Thanks,
Ming
Kashyap Desai Aug. 10, 2020, 4:38 p.m. | #44
> On Sun, Aug 09, 2020 at 12:35:21AM +0530, Kashyap Desai wrote:

> > > On Thu, Aug 06, 2020 at 08:07:38PM +0530, Kashyap Desai wrote:

> > > > > > Hi Ming -

> > > > > >

> > > > > > There is still some race which is not handled.  Take a case of

> > > > > > IO is not able to get budget and it has already marked

> > > > > > <restarts>

> > flag.

> > > > > > <restarts> flag will be seen non-zero in completion path and

> > > > > > completion path will attempt h/w queue run. (But this

> > > > > > particular IO is still not in s/w queue.).

> > > > > > Attempt of running h/w queue from completion path will not

> > > > > > flush any IO since there is no IO in s/w queue.

> > > > >

> > > > > Then where is the IO to be submitted in case of running out of

> > budget?

> > > >

> > > > Typical race in your latest patch is - (Lets consider command A,B

> > > > and

> > > > C) Command A did not receive budget. Command B completed  (which

> > > > was already

> > >

> > > Command A doesn't get budget, and A is still in sw/scheduler queue

> > because

> > > we try to acquire budget before dequeuing request from sw/scheduler

> > queue,

> > > see __blk_mq_do_dispatch_sched() and blk_mq_do_dispatch_ctx().

> > >

> > > Not consider direct issue, because the hw queue will be run

> > > explicitly

> > when

> > > not getting budget, see __blk_mq_try_issue_directly.

> > >

> > > Not consider command A being added to hctx->dispatch too, because

> > > blk-mq will re-run the queue, see blk_mq_dispatch_rq_list().

> >

> > Ming -

> >

> > After going through your comment (I noted your comment and thanks for

> > correcting my understanding.) and block layer code, I realize that it

> > is a different race condition. My previous explanation was not

accurate.
> > I debug further and figure out what is actually happening - Consider

> > below scenario/sequence -

> >

> > Thread -1 - Detected budget contention. Set restarts = 1.

> > Thread -2 - old restarts = 1. start hw queue.

> > Thread -3 - old restarts = 1. start hw queue.

> > Thread -2 - move restarts = 0.

> > In my testing, I noticed that both thread-2 and thread-3 started h/w

> > queue but there was no work for them to do. It is possible because

> > some other context of h/w queue run might have done that job.

>

> It should be true, because there is other run queue somewhere, such as

blk-
> mq's restart or delay run queue.

>

> > It means, IO of thread-1 is already posted.

>

> OK.

>

> > Thread -4 - Detected budget contention. Set restart = 1 (because

> > thread-2 has move restarts=0).

>

> OK.

>

> > Thread -3 - move restarts = 0 (because this thread see old value = 1

> > but that is actually updated one more time by thread-4 and theread-4

> > actually wanted to run h/w queues). IO of Thread-4 will not be

scheduled.
>

> Right.

>

> >

> > We have to make sure that completion IO path do atomic_cmpxchng of

> > restarts flag before running the h/w queue.  Below code change - (main

> > fix is sequence of atomic_cmpxchg and blk_mq_run_hw_queues) fix the

> issue.

> >

> > --- a/drivers/scsi/scsi_lib.c

> > +++ b/drivers/scsi/scsi_lib.c

> > @@ -594,8 +594,27 @@ static bool scsi_end_request(struct request *req,

> > blk_status_t error,

> >         if (scsi_target(sdev)->single_lun ||

> >             !list_empty(&sdev->host->starved_list))

> >                 kblockd_schedule_work(&sdev->requeue_work);

> > -       else

> > -               blk_mq_run_hw_queues(q, true);

> > +       else {

> > +               /*

> > +                * smp_mb() implied in either rq->end_io or

> > blk_mq_free_request

> > +                * is for ordering writing .device_busy in

> > scsi_device_unbusy()

> > +                * and reading sdev->restarts.

> > +                */

> > +               int old = atomic_read(&sdev->restarts);

> > +

> > +               if (old) {

> > +                       /*

> > +                        * ->restarts has to be kept as non-zero if

> > + there

> > is

> > +                        *  new budget contention comes.

> > +                        */

> > +                       atomic_cmpxchg(&sdev->restarts, old, 0);

> > +

> > +                       /* run the queue after restarts flag is

updated
> > +                        * to avoid race condition with .get_budget

> > +                        */

> > +                       blk_mq_run_hw_queues(sdev->request_queue,

true);
> > +               }

> > +       }

> >

>

> I think the above change is right, and this patter is basically same

with
> SCHED_RESTART used in blk_mq_sched_restart().

>

> BTW, could you run your function & performance test against the

following
> new version?

> Then I can include your test result in commit log for moving on.


Ming  - I completed both functional and performance test.

System used for the test -
Manufacturer: Supermicro
Product Name: X11DPG-QT

lscpu <snippet>
CPU(s):                72
On-line CPU(s) list:   0-71
Thread(s) per core:    2
Core(s) per socket:    18
Socket(s):             2
NUMA node(s):          2
Model name:            Intel(R) Xeon(R) Gold 6150 CPU @ 2.70GHz

Controller used -
MegaRAID 9560-16i

Total 24 SAS driver of model "WDC      WUSTM3240ASS200"

Total 3 VD created each VD consist of 8 SAS Drives.

Performance testing -

Fio script -
[global]
ioengine=libaio
direct=1
sync=0
ramp_time=20
runtime=60
cpus_allowed=18,19
bs=4k
rw=randread
ioscheduler=none
iodepth=128

[seqprecon]
filename=/dev/sdc
[seqprecon]
filename=/dev/sdd
[seqprecon]
filename=/dev/sde

Without this patch - 602K IOPS. Perf top snippet -(Note - Usage of
blk_mq_run_hw_queues -> blk_mq_run_hw_queue is very high. It consume more
CPU which leads to less performance.)

     8.70%  [kernel]        [k] blk_mq_run_hw_queue
     5.24%  [megaraid_sas]  [k] complete_cmd_fusion
     4.65%  [kernel]        [k] sbitmap_any_bit_set
     3.93%  [kernel]        [k] irq_entries_start
     3.58%  perf            [.] __symbols__insert
     2.21%  [megaraid_sas]  [k] megasas_build_and_issue_cmd_fusion
     1.91%  [kernel]        [k] blk_mq_run_hw_queues

With this patch - 1110K IOPS. Perf top snippet -

    8.05%  [megaraid_sas]  [k] complete_cmd_fusion
     4.10%  [kernel]        [k] irq_entries_start
     3.71%  [megaraid_sas]  [k] megasas_build_and_issue_cmd_fusion
     2.85%  [kernel]        [k] read_tsc
     2.83%  [kernel]        [k] io_submit_one
     2.26%  [kernel]        [k] entry_SYSCALL_64
     2.08%  [megaraid_sas]  [k] megasas_queue_command


Functional Test -

I cover overnight IO testing using <fio> script which sends 4K rand read,
read, rand write and write IOs to the 24 SAS JBOD drives.
Some of the JBOD has ioscheduler=none and some of the JBOD has
ioscheduler=mq-deadline
I used additional script which change sdev->queue_depth of each device
from 2 to 16 range at the interval of 5 seconds.
I used additional script which toggle "rq_affinity=1" and "rq_affinity=2"
at the interval of 5 seconds.

I did not noticed any IO hang.

Thanks, Kashyap

>

>

> From 06993ddf5c5dbe0e772cc38342919eb61a57bc50 Mon Sep 17 00:00:00

> 2001

> From: Ming Lei <ming.lei@redhat.com>

> Date: Wed, 5 Aug 2020 16:35:53 +0800

> Subject: [PATCH] scsi: core: only re-run queue in scsi_end_request() if

device
> queue is busy

>

> Now the request queue is run in scsi_end_request() unconditionally if

both
> target queue and host queue is ready. We should have re-run request

queue
> only after this device queue becomes busy for restarting this LUN only.

>

> Recently Long Li reported that cost of run queue may be very heavy in

case of
> high queue depth. So improve this situation by only running the request

queue
> when this LUN is busy.

>

> Cc: Jens Axboe <axboe@kernel.dk>

> Cc: Ewan D. Milne <emilne@redhat.com>

> Cc: Kashyap Desai <kashyap.desai@broadcom.com>

> Cc: Hannes Reinecke <hare@suse.de>

> Cc: Bart Van Assche <bvanassche@acm.org>

> Cc: Damien Le Moal <damien.lemoal@wdc.com>

> Cc: Long Li <longli@microsoft.com>

> Reported-by: Long Li <longli@microsoft.com>

> Signed-off-by: Ming Lei <ming.lei@redhat.com>

> ---

> V4:

> 	- not clear .restarts in get_budget(), instead clearing it

> 	after re-run queue is done; Kashyap figured out we have to

> 	update ->restarts before re-run queue in scsi_run_queue_async().

>

> V3:

> 	- add one smp_mb() in scsi_mq_get_budget() and comment

>

> V2:

> 	- commit log change, no any code change

> 	- add reported-by tag

>

> Signed-off-by: Ming Lei <ming.lei@redhat.com>

> ---

>  drivers/scsi/scsi_lib.c    | 51 +++++++++++++++++++++++++++++++++++---

>  include/scsi/scsi_device.h |  1 +

>  2 files changed, 49 insertions(+), 3 deletions(-)

>

> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index

> c866a4f33871..d083250f9518 100644

> --- a/drivers/scsi/scsi_lib.c

> +++ b/drivers/scsi/scsi_lib.c

> @@ -552,8 +552,27 @@ static void scsi_run_queue_async(struct scsi_device

> *sdev)

>  	if (scsi_target(sdev)->single_lun ||

>  	    !list_empty(&sdev->host->starved_list))

>  		kblockd_schedule_work(&sdev->requeue_work);

> -	else

> -		blk_mq_run_hw_queues(sdev->request_queue, true);

> +	else {

> +		/*

> +		 * smp_mb() implied in either rq->end_io or

> blk_mq_free_request

> +		 * is for ordering writing .device_busy in

scsi_device_unbusy()
> +		 * and reading sdev->restarts.

> +		 */

> +		int old = atomic_read(&sdev->restarts);

> +

> +		if (old) {

> +			/*

> +			 * ->restarts has to be kept as non-zero if there

is
> +			 *  new budget contention comes.

> +			 *

> +			 *  No need to run queue when either another

re-run
> +			 *  queue wins in updating ->restarts or one new

> budget

> +			 *  contention comes.

> +			 */

> +			if (atomic_cmpxchg(&sdev->restarts, old, 0) ==

old)
> +				blk_mq_run_hw_queues(sdev-

> >request_queue, true);

> +		}

> +	}

>  }

>

>  /* Returns false when no more bytes to process, true if there are more

*/
> @@ -1612,8 +1631,34 @@ static void scsi_mq_put_budget(struct

> request_queue *q)  static bool scsi_mq_get_budget(struct request_queue

*q)
> {

>  	struct scsi_device *sdev = q->queuedata;

> +	int ret = scsi_dev_queue_ready(q, sdev);

> +

> +	if (ret)

> +		return true;

> +

> +	atomic_inc(&sdev->restarts);

>

> -	return scsi_dev_queue_ready(q, sdev);

> +	/*

> +	 * Order writing .restarts and reading .device_busy, and make sure

> +	 * .restarts is visible to scsi_end_request(). Its pair is implied

by
> +	 * __blk_mq_end_request() in scsi_end_request() for ordering

> +	 * writing .device_busy in scsi_device_unbusy() and reading

.restarts.
> +	 *

> +	 */

> +	smp_mb__after_atomic();

> +

> +	/*

> +	 * If all in-flight requests originated from this LUN are

completed
> +	 * before setting .restarts, sdev->device_busy will be observed as

> +	 * zero, then blk_mq_delay_run_hw_queues() will dispatch this

> request

> +	 * soon. Otherwise, completion of one of these request will

observe
> +	 * the .restarts flag, and the request queue will be run for

handling
> +	 * this request, see scsi_end_request().

> +	 */

> +	if (unlikely(atomic_read(&sdev->device_busy) == 0 &&

> +				!scsi_device_blocked(sdev)))

> +		blk_mq_delay_run_hw_queues(sdev->request_queue,

> SCSI_QUEUE_DELAY);

> +	return false;

>  }

>

>  static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, diff

--git
> a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index

> bc5909033d13..1a5c9a3df6d6 100644

> --- a/include/scsi/scsi_device.h

> +++ b/include/scsi/scsi_device.h

> @@ -109,6 +109,7 @@ struct scsi_device {

>  	atomic_t device_busy;		/* commands actually active on

LLDD
> */

>  	atomic_t device_blocked;	/* Device returned QUEUE_FULL. */

>

> +	atomic_t restarts;

>  	spinlock_t list_lock;

>  	struct list_head starved_entry;

>  	unsigned short queue_depth;	/* How deep of a queue we want */

> --

> 2.25.2

>

>

>

> Thanks,

> Ming
John Garry Aug. 11, 2020, 8:09 a.m. | #45
On 10/08/2020 17:38, Kashyap Desai wrote:
> 

> Functional Test -

> 

> I cover overnight IO testing using <fio> script which sends 4K rand read,

> read, rand write and write IOs to the 24 SAS JBOD drives.

> Some of the JBOD has ioscheduler=none and some of the JBOD has

> ioscheduler=mq-deadline

> I used additional script which change sdev->queue_depth of each device

> from 2 to 16 range at the interval of 5 seconds.

> I used additional script which toggle "rq_affinity=1" and "rq_affinity=2"

> at the interval of 5 seconds.

> 

> I did not noticed any IO hang.

> 

> Thanks, Kashyap

> 


Nice work. I think v8 series can now be prepared, since this final 
performance issue reported looks resolved. But I still don't know what's 
going on for "[PATCHv6 00/21] scsi: enable reserved commands for LLDDs", 
which current hpsa patch relies on for basic functionality :(

>>

>>  From 06993ddf5c5dbe0e772cc38342919eb61a57bc50 Mon Sep 17 00:00:00

>> 2001

>> From: Ming Lei<ming.lei@redhat.com>

>> Date: Wed, 5 Aug 2020 16:35:53 +0800

>> Subject: [PATCH] scsi: core: only re-run queue in scsi_end_request() if

> device

>> queue is busy


Ming, I assume that you will send this directly to SCSI maintainers when 
the merge window closes.

>>

>> Now the request queue is run in scsi_end_request() unconditionally if

> both

>> target queue and host queue is ready. We should have re-run request

> queue

>> only after this device queue becomes busy for restarting this LUN only.

>>

>> Recently Long Li reported that cost of run queue may be very heavy in

> case of

>> high queue depth. So improve this situation by only running the request

> queue

>> when this LUN is busy.

Patch

diff --git a/drivers/scsi/megaraid/megaraid_sas.h b/drivers/scsi/megaraid/megaraid_sas.h
index af2c7a2a9565..b27a34a5f5de 100644
--- a/drivers/scsi/megaraid/megaraid_sas.h
+++ b/drivers/scsi/megaraid/megaraid_sas.h
@@ -2261,7 +2261,6 @@  enum MR_PERF_MODE {
 
 struct megasas_instance {
 
-	unsigned int *reply_map;
 	__le32 *producer;
 	dma_addr_t producer_h;
 	__le32 *consumer;
diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c
index 00668335c2af..e6bb2a64d51c 100644
--- a/drivers/scsi/megaraid/megaraid_sas_base.c
+++ b/drivers/scsi/megaraid/megaraid_sas_base.c
@@ -37,6 +37,7 @@ 
 #include <linux/poll.h>
 #include <linux/vmalloc.h>
 #include <linux/irq_poll.h>
+#include <linux/blk-mq-pci.h>
 
 #include <scsi/scsi.h>
 #include <scsi/scsi_cmnd.h>
@@ -3115,6 +3116,19 @@  megasas_bios_param(struct scsi_device *sdev, struct block_device *bdev,
 	return 0;
 }
 
+static int megasas_map_queues(struct Scsi_Host *shost)
+{
+	struct megasas_instance *instance;
+
+	instance = (struct megasas_instance *)shost->hostdata;
+
+	if (!instance->smp_affinity_enable)
+		return 0;
+
+	return blk_mq_pci_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT],
+			instance->pdev, instance->low_latency_index_start);
+}
+
 static void megasas_aen_polling(struct work_struct *work);
 
 /**
@@ -3423,8 +3437,10 @@  static struct scsi_host_template megasas_template = {
 	.eh_timed_out = megasas_reset_timer,
 	.shost_attrs = megaraid_host_attrs,
 	.bios_param = megasas_bios_param,
+	.map_queues = megasas_map_queues,
 	.change_queue_depth = scsi_change_queue_depth,
 	.max_segment_size = 0xffffffff,
+	.host_tagset = 1,
 };
 
 /**
@@ -5708,34 +5724,6 @@  megasas_setup_jbod_map(struct megasas_instance *instance)
 		instance->use_seqnum_jbod_fp = false;
 }
 
-static void megasas_setup_reply_map(struct megasas_instance *instance)
-{
-	const struct cpumask *mask;
-	unsigned int queue, cpu, low_latency_index_start;
-
-	low_latency_index_start = instance->low_latency_index_start;
-
-	for (queue = low_latency_index_start; queue < instance->msix_vectors; queue++) {
-		mask = pci_irq_get_affinity(instance->pdev, queue);
-		if (!mask)
-			goto fallback;
-
-		for_each_cpu(cpu, mask)
-			instance->reply_map[cpu] = queue;
-	}
-	return;
-
-fallback:
-	queue = low_latency_index_start;
-	for_each_possible_cpu(cpu) {
-		instance->reply_map[cpu] = queue;
-		if (queue == (instance->msix_vectors - 1))
-			queue = low_latency_index_start;
-		else
-			queue++;
-	}
-}
-
 /**
  * megasas_get_device_list -	Get the PD and LD device list from FW.
  * @instance:			Adapter soft state
@@ -6158,8 +6146,6 @@  static int megasas_init_fw(struct megasas_instance *instance)
 			goto fail_init_adapter;
 	}
 
-	megasas_setup_reply_map(instance);
-
 	dev_info(&instance->pdev->dev,
 		"current msix/online cpus\t: (%d/%d)\n",
 		instance->msix_vectors, (unsigned int)num_online_cpus());
@@ -6793,6 +6779,9 @@  static int megasas_io_attach(struct megasas_instance *instance)
 	host->max_id = MEGASAS_MAX_DEV_PER_CHANNEL;
 	host->max_lun = MEGASAS_MAX_LUN;
 	host->max_cmd_len = 16;
+	if (instance->adapter_type != MFI_SERIES && instance->msix_vectors > 0)
+		host->nr_hw_queues = instance->msix_vectors -
+			instance->low_latency_index_start;
 
 	/*
 	 * Notify the mid-layer about the new controller
@@ -6960,11 +6949,6 @@  static inline int megasas_alloc_mfi_ctrl_mem(struct megasas_instance *instance)
  */
 static int megasas_alloc_ctrl_mem(struct megasas_instance *instance)
 {
-	instance->reply_map = kcalloc(nr_cpu_ids, sizeof(unsigned int),
-				      GFP_KERNEL);
-	if (!instance->reply_map)
-		return -ENOMEM;
-
 	switch (instance->adapter_type) {
 	case MFI_SERIES:
 		if (megasas_alloc_mfi_ctrl_mem(instance))
@@ -6981,8 +6965,6 @@  static int megasas_alloc_ctrl_mem(struct megasas_instance *instance)
 
 	return 0;
  fail:
-	kfree(instance->reply_map);
-	instance->reply_map = NULL;
 	return -ENOMEM;
 }
 
@@ -6995,7 +6977,6 @@  static int megasas_alloc_ctrl_mem(struct megasas_instance *instance)
  */
 static inline void megasas_free_ctrl_mem(struct megasas_instance *instance)
 {
-	kfree(instance->reply_map);
 	if (instance->adapter_type == MFI_SERIES) {
 		if (instance->producer)
 			dma_free_coherent(&instance->pdev->dev, sizeof(u32),
@@ -7683,8 +7664,6 @@  megasas_resume(struct pci_dev *pdev)
 			goto fail_reenable_msix;
 	}
 
-	megasas_setup_reply_map(instance);
-
 	if (instance->adapter_type != MFI_SERIES) {
 		megasas_reset_reply_desc(instance);
 		if (megasas_ioc_init_fusion(instance)) {
diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c
index 319f241da4b6..8e25b700988e 100644
--- a/drivers/scsi/megaraid/megaraid_sas_fusion.c
+++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c
@@ -373,24 +373,24 @@  megasas_get_msix_index(struct megasas_instance *instance,
 {
 	int sdev_busy;
 
-	/* nr_hw_queue = 1 for MegaRAID */
-	struct blk_mq_hw_ctx *hctx =
-		scmd->device->request_queue->queue_hw_ctx[0];
+	struct blk_mq_hw_ctx *hctx = scmd->request->mq_hctx;
 
 	sdev_busy = atomic_read(&hctx->nr_active);
 
 	if (instance->perf_mode == MR_BALANCED_PERF_MODE &&
-	    sdev_busy > (data_arms * MR_DEVICE_HIGH_IOPS_DEPTH))
+	    sdev_busy > (data_arms * MR_DEVICE_HIGH_IOPS_DEPTH)) {
 		cmd->request_desc->SCSIIO.MSIxIndex =
 			mega_mod64((atomic64_add_return(1, &instance->high_iops_outstanding) /
 					MR_HIGH_IOPS_BATCH_COUNT), instance->low_latency_index_start);
-	else if (instance->msix_load_balance)
+	} else if (instance->msix_load_balance) {
 		cmd->request_desc->SCSIIO.MSIxIndex =
 			(mega_mod64(atomic64_add_return(1, &instance->total_io_count),
 				instance->msix_vectors));
-	else
-		cmd->request_desc->SCSIIO.MSIxIndex =
-			instance->reply_map[raw_smp_processor_id()];
+	} else {
+		u32 tag = blk_mq_unique_tag(scmd->request);
+
+		cmd->request_desc->SCSIIO.MSIxIndex = blk_mq_unique_tag_to_hwq(tag) + instance->low_latency_index_start;
+	}
 }
 
 /**
@@ -3326,7 +3326,7 @@  megasas_build_and_issue_cmd_fusion(struct megasas_instance *instance,
 {
 	struct megasas_cmd_fusion *cmd, *r1_cmd = NULL;
 	union MEGASAS_REQUEST_DESCRIPTOR_UNION *req_desc;
-	u32 index;
+	u32 index, blk_tag, unique_tag;
 
 	if ((megasas_cmd_type(scmd) == READ_WRITE_LDIO) &&
 		instance->ldio_threshold &&
@@ -3342,7 +3342,9 @@  megasas_build_and_issue_cmd_fusion(struct megasas_instance *instance,
 		return SCSI_MLQUEUE_HOST_BUSY;
 	}
 
-	cmd = megasas_get_cmd_fusion(instance, scmd->request->tag);
+	unique_tag = blk_mq_unique_tag(scmd->request);
+	blk_tag = blk_mq_unique_tag_to_tag(unique_tag);
+	cmd = megasas_get_cmd_fusion(instance, blk_tag);
 
 	if (!cmd) {
 		atomic_dec(&instance->fw_outstanding);
@@ -3383,7 +3385,7 @@  megasas_build_and_issue_cmd_fusion(struct megasas_instance *instance,
 	 */
 	if (cmd->r1_alt_dev_handle != MR_DEVHANDLE_INVALID) {
 		r1_cmd = megasas_get_cmd_fusion(instance,
-				(scmd->request->tag + instance->max_fw_cmds));
+				(blk_tag + instance->max_fw_cmds));
 		megasas_prepare_secondRaid1_IO(instance, cmd, r1_cmd);
 	}