diff mbox series

aacraid: reply queue mapping to CPUs based on IRQ affinity

Message ID 20230328214124.26419-1-sagar.biradar@microchip.com
State New
Headers show
Series aacraid: reply queue mapping to CPUs based on IRQ affinity | expand

Commit Message

Sagar Biradar March 28, 2023, 9:41 p.m. UTC
Fix the IO hang that arises because the MSI-x vector does not
have a mapped online CPU upon receiving completion.
This patch sets up a reply queue mapping to CPUs based on the
IRQ affinity retrieved using pci_irq_get_affinity() API.

Reviewed-by: Gilbert Wu <gilbert.wu@microchip.com>
Signed-off-by: Sagar Biradar <Sagar.Biradar@microchip.com>
---
 drivers/scsi/aacraid/aacraid.h  |  1 +
 drivers/scsi/aacraid/comminit.c | 25 +++++++++++++++++++++++++
 drivers/scsi/aacraid/linit.c    | 11 +++++++++++
 drivers/scsi/aacraid/src.c      |  2 +-
 4 files changed, 38 insertions(+), 1 deletion(-)

Comments

Sagar Biradar April 10, 2023, 9:17 p.m. UTC | #1
-----Original Message-----
From: John Garry <john.g.garry@oracle.com> 
Sent: Wednesday, March 29, 2023 12:08 AM
To: Sagar Biradar - C34249 <Sagar.Biradar@microchip.com>; Don Brace - C33706 <Don.Brace@microchip.com>; Gilbert Wu - C33504 <Gilbert.Wu@microchip.com>; linux-scsi@vger.kernel.org; Martin Petersen <martin.petersen@oracle.com>; James Bottomley <jejb@linux.ibm.com>; Brian King <brking@linux.vnet.ibm.com>; stable@vger.kernel.org; Tom White - C33503 <Tom.White@microchip.com>
Subject: Re: [PATCH] aacraid: reply queue mapping to CPUs based of IRQ affinity

EXTERNAL EMAIL: Do not click links or open attachments unless you know the content is safe

On 28/03/2023 22:41, Sagar Biradar wrote:
> Fix the IO hang that arises because of MSIx vector not having a mapped 
> online CPU upon receiving completion.

What about if the CPU targeted goes offline while the IO is in-flight?

> This patch sets up a reply queue mapping to CPUs based on the IRQ 
> affinity retrieved using pci_irq_get_affinity() API.
>

blk-mq already does what you want here, including handling for the case I mention above. It maintains a CPU -> HW queue mapping, and using a reply map in the LLD is the old way of doing this.

Could you instead follow the example in commit 664f0dce2058 ("scsi:
mpt3sas: Add support for shared host tagset for CPU hotplug"), and expose the HW queues to the upper layer? You can alternatively check the example of any SCSI driver which sets shost->host_tagset for this.

Thanks,
John
[Sagar Biradar] 

***What about if the CPU targeted goes offline while the IO is in-flight?
We ran multiple random cases with the IOs running in parallel and disabling load-bearing CPUs. We saw that the load was transferred to the other online CPUs successfully every time.
The same was tested at vendor and their customer site - they did not see any issues too.


***blk-mq already does what you want here, including handling for the case I mention above. It maintains a CPU -> HW queue mapping, and using a reply map in the LLD is the old way of doing this.
We also tried implementing the blk-mq mechanism in the driver and we saw command timeouts. 
The firmware has limitation of fixed number of queues per vector and the blk-mq changes would saturate that limit.
That answers the possible command timeout. 

Also this is EOL product and there will be no firmware code changes. Given this, we have decided to stick to the reply_map mechanism.
(https://storage.microsemi.com/en-us/support/series8/index.php)

Thank you for your review comments and we hope you will reconsider the original patch.

Thanks
Sagar

> Reviewed-by: Gilbert Wu <gilbert.wu@microchip.com>
> Signed-off-by: Sagar Biradar <Sagar.Biradar@microchip.com>
> ---
>   drivers/scsi/aacraid/aacraid.h  |  1 +
>   drivers/scsi/aacraid/comminit.c | 25 +++++++++++++++++++++++++
>   drivers/scsi/aacraid/linit.c    | 11 +++++++++++
>   drivers/scsi/aacraid/src.c      |  2 +-
>   4 files changed, 38 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/scsi/aacraid/aacraid.h 
> b/drivers/scsi/aacraid/aacraid.h index 5e115e8b2ba4..4a23f9fab61f 
> 100644
> --- a/drivers/scsi/aacraid/aacraid.h
> +++ b/drivers/scsi/aacraid/aacraid.h
> @@ -1678,6 +1678,7 @@ struct aac_dev
>       u32                     handle_pci_error;
>       bool                    init_reset;
>       u8                      soft_reset_support;
> +     unsigned int *reply_map;
>   };
>
>   #define aac_adapter_interrupt(dev) \ diff --git 
> a/drivers/scsi/aacraid/comminit.c b/drivers/scsi/aacraid/comminit.c 
> index bd99c5492b7d..6fc323844a31 100644
> --- a/drivers/scsi/aacraid/comminit.c
> +++ b/drivers/scsi/aacraid/comminit.c
> @@ -33,6 +33,8 @@
>
>   #include "aacraid.h"
>
> +void aac_setup_reply_map(struct aac_dev *dev);
> +
>   struct aac_common aac_config = {
>       .irq_mod = 1
>   };
> @@ -630,6 +632,9 @@ struct aac_dev *aac_init_adapter(struct aac_dev 
> *dev)
>
>       if (aac_is_src(dev))
>               aac_define_int_mode(dev);
> +
> +     aac_setup_reply_map(dev);
> +
>       /*
>        *      Ok now init the communication subsystem
>        */
> @@ -658,3 +663,23 @@ struct aac_dev *aac_init_adapter(struct aac_dev *dev)
>       return dev;
>   }
>
> +void aac_setup_reply_map(struct aac_dev *dev) {
> +     const struct cpumask *mask;
> +     unsigned int i, cpu = 1;
> +
> +     for (i = 1; i < dev->max_msix; i++) {
> +             mask = pci_irq_get_affinity(dev->pdev, i);
> +             if (!mask)
> +                     goto fallback;
> +
> +             for_each_cpu(cpu, mask) {
> +                     dev->reply_map[cpu] = i;
> +             }
> +     }
> +     return;
> +
> +fallback:
> +     for_each_possible_cpu(cpu)
> +             dev->reply_map[cpu] = 0; }
> diff --git a/drivers/scsi/aacraid/linit.c 
> b/drivers/scsi/aacraid/linit.c index 5ba5c18b77b4..af60c7d26407 100644
> --- a/drivers/scsi/aacraid/linit.c
> +++ b/drivers/scsi/aacraid/linit.c
> @@ -1668,6 +1668,14 @@ static int aac_probe_one(struct pci_dev *pdev, const struct pci_device_id *id)
>               goto out_free_host;
>       }
>
> +     aac->reply_map = kzalloc(sizeof(unsigned int) * nr_cpu_ids,
> +                             GFP_KERNEL);
> +     if (!aac->reply_map) {
> +             error = -ENOMEM;
> +             dev_err(&pdev->dev, "reply_map allocation failed\n");
> +             goto out_free_host;
> +     }
> +
>       spin_lock_init(&aac->fib_lock);
>
>       mutex_init(&aac->ioctl_mutex);
> @@ -1797,6 +1805,8 @@ static int aac_probe_one(struct pci_dev *pdev, const struct pci_device_id *id)
>                                 aac->comm_addr, aac->comm_phys);
>       kfree(aac->queues);
>       aac_adapter_ioremap(aac, 0);
> +     /* By now we should have configured the reply_map */
> +     kfree(aac->reply_map);
>       kfree(aac->fibs);
>       kfree(aac->fsa_dev);
>    out_free_host:
> @@ -1918,6 +1928,7 @@ static void aac_remove_one(struct pci_dev *pdev)
>
>       aac_adapter_ioremap(aac, 0);
>
> +     kfree(aac->reply_map);
>       kfree(aac->fibs);
>       kfree(aac->fsa_dev);
>
> diff --git a/drivers/scsi/aacraid/src.c b/drivers/scsi/aacraid/src.c 
> index 11ef58204e96..e84ec60a655b 100644
> --- a/drivers/scsi/aacraid/src.c
> +++ b/drivers/scsi/aacraid/src.c
> @@ -506,7 +506,7 @@ static int aac_src_deliver_message(struct fib *fib)
>                       && dev->sa_firmware)
>                       vector_no = aac_get_vector(dev);
>               else
> -                     vector_no = fib->vector_no;
> +                     vector_no = 
> + dev->reply_map[raw_smp_processor_id()];
>
>               if (native_hba) {
>                       if (fib->flags & 
> FIB_CONTEXT_FLAG_NATIVE_HBA_TMF) {
John Garry April 12, 2023, 9:38 a.m. UTC | #2
On 10/04/2023 22:17, Sagar.Biradar@microchip.com wrote:
> On 28/03/2023 22:41, Sagar Biradar wrote:
>> Fix the IO hang that arises because of MSIx vector not having a mapped
>> online CPU upon receiving completion.
> What about if the CPU targeted goes offline while the IO is in-flight?
> 
>> This patch sets up a reply queue mapping to CPUs based on the IRQ
>> affinity retrieved using pci_irq_get_affinity() API.
>>
> blk-mq already does what you want here, including handling for the case I mention above. It maintains a CPU -> HW queue mapping, and using a reply map in the LLD is the old way of doing this.
> 
> Could you instead follow the example in commit 664f0dce2058 ("scsi:
> mpt3sas: Add support for shared host tagset for CPU hotplug"), and expose the HW queues to the upper layer? You can alternatively check the example of any SCSI driver which sets shost->host_tagset for this.
> 
> Thanks,
> John
> [Sagar Biradar]
> 
> ***What about if the CPU targeted goes offline while the IO is in-flight?
> We ran multiple random cases with the IO's running in parallel and disabling load-bearing CPU's. We saw that the load was transferred to the other online CPUs successfully every time.
> The same was tested at vendor and their customer site - they did not see any issues too.

You need to ensure that all CPUs associated with the HW queue are 
offline and stay offline until any IO may timeout, which would be 30 
seconds according to SCSI sd default timeout. I am not sure if you were 
doing that exactly.

> 
> 
> ***blk-mq already does what you want here, including handling for the case I mention above. It maintains a CPU -> HW queue mapping, and using a reply map in the LLD is the old way of doing this.
> We also tried implementing the blk-mq mechanism in the driver and we saw command timeouts.
> The firmware has limitation of fixed number of queues per vector and the blk-mq changes would saturate that limit.
> That answers the possible command timeout.
> 
> Also this is EOL product and there will be no firmware code changes. Given this, we have decided to stick to the reply_map mechanism.
> (https://urldefense.com/v3/__https://storage.microsemi.com/en-us/support/series8/index.php__;!!ACWV5N9M2RV99hQ!PLrbfoEBvEGxw2CvahCL0AP5c4f5cQ8gT0ahXVgB0mSbyqxWJ8pdtYY0JwRL8xZ59k0NHJhXCBbMtVWlq5pYMeOEHmw7ww$  )
> 
> Thank you for your review comments and we hope you will reconsider the original patch.

I've been checking the driver a bit more and this drivers uses some 
"reserved" commands, right? That would be internal commands which the 
driver sends to the adapter which does not have a scsi_cmnd associated. 
If so, it gets a bit more tricky to use blk-mq support for HW queues, as 
we need to manually find a HW queue for those "reserved commands", like
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/scsi/hisi_sas/hisi_sas_main.c?h=v6.3-rc6#n532

Anyway, it's not up to me ...

Thanks,
John
Sagar Biradar April 12, 2023, 7:29 p.m. UTC | #3
-----Original Message-----
From: John Garry <john.g.garry@oracle.com> 
Sent: Wednesday, April 12, 2023 2:38 AM
To: Sagar Biradar - C34249 <Sagar.Biradar@microchip.com>; Don Brace - C33706 <Don.Brace@microchip.com>; Gilbert Wu - C33504 <Gilbert.Wu@microchip.com>; linux-scsi@vger.kernel.org; martin.petersen@oracle.com; jejb@linux.ibm.com; brking@linux.vnet.ibm.com; stable@vger.kernel.org; Tom White - C33503 <Tom.White@microchip.com>
Subject: Re: [PATCH] aacraid: reply queue mapping to CPUs based of IRQ affinity

EXTERNAL EMAIL: Do not click links or open attachments unless you know the content is safe

On 10/04/2023 22:17, Sagar.Biradar@microchip.com wrote:
> On 28/03/2023 22:41, Sagar Biradar wrote:
>> Fix the IO hang that arises because of MSIx vector not having a 
>> mapped online CPU upon receiving completion.
> What about if the CPU targeted goes offline while the IO is in-flight?
>
>> This patch sets up a reply queue mapping to CPUs based on the IRQ 
>> affinity retrieved using pci_irq_get_affinity() API.
>>
> blk-mq already does what you want here, including handling for the case I mention above. It maintains a CPU -> HW queue mapping, and using a reply map in the LLD is the old way of doing this.
>
> Could you instead follow the example in commit 664f0dce2058 ("scsi:
> mpt3sas: Add support for shared host tagset for CPU hotplug"), and expose the HW queues to the upper layer? You can alternatively check the example of any SCSI driver which sets shost->host_tagset for this.
>
> Thanks,
> John
> [Sagar Biradar]
>
> ***What about if the CPU targeted goes offline while the IO is in-flight?
> We ran multiple random cases with the IO's running in parallel and disabling load-bearing CPU's. We saw that the load was transferred to the other online CPUs successfully every time.
> The same was tested at vendor and their customer site - they did not see any issues too.

You need to ensure that all CPUs associated with the HW queue are offline and stay offline until any IO may timeout, which would be 30 seconds according to SCSI sd default timeout. I am not sure if you were doing that exactly.

>
>
> ***blk-mq already does what you want here, including handling for the case I mention above. It maintains a CPU -> HW queue mapping, and using a reply map in the LLD is the old way of doing this.
> We also tried implementing the blk-mq mechanism in the driver and we saw command timeouts.
> The firmware has limitation of fixed number of queues per vector and the blk-mq changes would saturate that limit.
> That answers the possible command timeout.
>
> Also this is EOL product and there will be no firmware code changes. Given this, we have decided to stick to the reply_map mechanism.
> (https://urldefense.com/v3/__https://storage.microsemi.com/en-us/suppo
> rt/series8/index.php__;!!ACWV5N9M2RV99hQ!PLrbfoEBvEGxw2CvahCL0AP5c4f5c
> Q8gT0ahXVgB0mSbyqxWJ8pdtYY0JwRL8xZ59k0NHJhXCBbMtVWlq5pYMeOEHmw7ww$  )
>
> Thank you for your review comments and we hope you will reconsider the original patch.

I've been checking the driver a bit more and this drivers uses some "reserved" commands, right? That would be internal commands which the driver sends to the adapter which does not have a scsi_cmnd associated.
If so, it gets a bit more tricky to use blk-mq support for HW queues, as we need to manually find a HW queue for those "reserved commands", like
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/scsi/hisi_sas/hisi_sas_main.c?h=v6.3-rc6#n532

Anyway, it's not up to me ...

Thanks,
John

[Sagar Biradar] 
I thank you for your time and review comments.

***You need to ensure that all CPUs associated with the HW queue are offline and stay offline until any IO may timeout, which would be 30 seconds according to SCSI sd default timeout. I am not sure if you were doing that exactly.

We disabled 14 out of 16 CPUs and each time we saw the interrupts migrated to the other CPUs.
The CPUs remained offline for varying times, each of which were more than 30 seconds.
We monitored proper behavior of the threads running on CPUs and observed them migrating to other CPUs as they were disabled.
We, along with the vendor/customer, did not observe any command timeouts in these experiments. 
In case any commands time out - the driver will resort to the error handling mechanism.

***I've been checking the driver a bit more and this drivers uses some "reserved" commands, right? That would be internal commands which the driver sends to the adapter which does not have a scsi_cmnd associated.
If so, it gets a bit more tricky to use blk-mq support for HW queues, as we need to manually find a HW queue for those "reserved commands", like
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/scsi/hisi_sas/hisi_sas_main.c?h=v6.3-rc6#n532
Anyway, it's not up to me ...

Yes we have reserved commands, that originate from within the driver.
We rely on the reply_map mechanism (from the original patch) to get interrupt vector for the reserved commands too.


Thanks
Sagar
Sagar Biradar April 13, 2023, 9:52 p.m. UTC | #4
-----Original Message-----
From: Sagar Biradar - C34249 
Sent: Wednesday, April 12, 2023 12:30 PM
To: John Garry <john.g.garry@oracle.com>; Don Brace - C33706 <Don.Brace@microchip.com>; Gilbert Wu - C33504 <Gilbert.Wu@microchip.com>; linux-scsi@vger.kernel.org; martin.petersen@oracle.com; jejb@linux.ibm.com; brking@linux.vnet.ibm.com; stable@vger.kernel.org; Tom White - C33503 <Tom.White@microchip.com>
Subject: RE: [PATCH] aacraid: reply queue mapping to CPUs based of IRQ affinity



-----Original Message-----
From: John Garry <john.g.garry@oracle.com>
Sent: Wednesday, April 12, 2023 2:38 AM
To: Sagar Biradar - C34249 <Sagar.Biradar@microchip.com>; Don Brace - C33706 <Don.Brace@microchip.com>; Gilbert Wu - C33504 <Gilbert.Wu@microchip.com>; linux-scsi@vger.kernel.org; martin.petersen@oracle.com; jejb@linux.ibm.com; brking@linux.vnet.ibm.com; stable@vger.kernel.org; Tom White - C33503 <Tom.White@microchip.com>
Subject: Re: [PATCH] aacraid: reply queue mapping to CPUs based of IRQ affinity

EXTERNAL EMAIL: Do not click links or open attachments unless you know the content is safe

On 10/04/2023 22:17, Sagar.Biradar@microchip.com wrote:
> On 28/03/2023 22:41, Sagar Biradar wrote:
>> Fix the IO hang that arises because of MSIx vector not having a 
>> mapped online CPU upon receiving completion.
> What about if the CPU targeted goes offline while the IO is in-flight?
>
>> This patch sets up a reply queue mapping to CPUs based on the IRQ 
>> affinity retrieved using pci_irq_get_affinity() API.
>>
> blk-mq already does what you want here, including handling for the case I mention above. It maintains a CPU -> HW queue mapping, and using a reply map in the LLD is the old way of doing this.
>
> Could you instead follow the example in commit 664f0dce2058 ("scsi:
> mpt3sas: Add support for shared host tagset for CPU hotplug"), and expose the HW queues to the upper layer? You can alternatively check the example of any SCSI driver which sets shost->host_tagset for this.
>
> Thanks,
> John
> [Sagar Biradar]
>
> ***What about if the CPU targeted goes offline while the IO is in-flight?
> We ran multiple random cases with the IO's running in parallel and disabling load-bearing CPU's. We saw that the load was transferred to the other online CPUs successfully every time.
> The same was tested at vendor and their customer site - they did not see any issues too.

You need to ensure that all CPUs associated with the HW queue are offline and stay offline until any IO may timeout, which would be 30 seconds according to SCSI sd default timeout. I am not sure if you were doing that exactly.

>
>
> ***blk-mq already does what you want here, including handling for the case I mention above. It maintains a CPU -> HW queue mapping, and using a reply map in the LLD is the old way of doing this.
> We also tried implementing the blk-mq mechanism in the driver and we saw command timeouts.
> The firmware has limitation of fixed number of queues per vector and the blk-mq changes would saturate that limit.
> That answers the possible command timeout.
>
> Also this is EOL product and there will be no firmware code changes. Given this, we have decided to stick to the reply_map mechanism.
> (https://urldefense.com/v3/__https://storage.microsemi.com/en-us/suppo
> rt/series8/index.php__;!!ACWV5N9M2RV99hQ!PLrbfoEBvEGxw2CvahCL0AP5c4f5c
> Q8gT0ahXVgB0mSbyqxWJ8pdtYY0JwRL8xZ59k0NHJhXCBbMtVWlq5pYMeOEHmw7ww$  )
>
> Thank you for your review comments and we hope you will reconsider the original patch.

I've been checking the driver a bit more and this drivers uses some "reserved" commands, right? That would be internal commands which the driver sends to the adapter which does not have a scsi_cmnd associated.
If so, it gets a bit more tricky to use blk-mq support for HW queues, as we need to manually find a HW queue for those "reserved commands", like
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/scsi/hisi_sas/hisi_sas_main.c?h=v6.3-rc6#n532

Anyway, it's not up to me ...

Thanks,
John

[Sagar Biradar]
I thank you for your time and review comments.

***You need to ensure that all CPUs associated with the HW queue are offline and stay offline until any IO may timeout, which would be 30 seconds according to SCSI sd default timeout. I am not sure if you were doing that exactly.

We disabled 14 out of 16 CPUs and each time we saw the interrupts migrated to the other CPUs.
The CPUs remained offline for varying times, each of which were more than 30 seconds.
We monitored proper behavior of the threads running on CPUs and observed them migrating to other CPUs as they were disabled.
We, along with the vendor/customer, did not observe any command timeouts in these experiments. 
In case any commands time out - the driver will resort to the error handling mechanism.

***I've been checking the driver a bit more and this drivers uses some "reserved" commands, right? That would be internal commands which the driver sends to the adapter which does not have a scsi_cmnd associated.
If so, it gets a bit more tricky to use blk-mq support for HW queues, as we need to manually find a HW queue for those "reserved commands", like
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/scsi/hisi_sas/hisi_sas_main.c?h=v6.3-rc6#n532
Anyway, it's not up to me ...

Yes we have reserved commands, that originate from within the driver.
We rely on the reply_map mechanism (from the original patch) to get interrupt vector for the reserved commands too.


Thanks
Sagar




Also, there is this patch which addresses the concerns John Garry raised.
https://lore.kernel.org/lkml/20220929033428.25948-1-mj0123.lee@samsung.com/T/

This patch explains how the coordination happens when a CPU goes offline.
IPI can be read as Inter-Processor Interrupt.
The request shall be completed from the CPU where it is running when the original CPU goes offline.

Thanks
Sagar
John Garry April 14, 2023, 7:20 a.m. UTC | #5
On 13/04/2023 22:52, Sagar.Biradar@microchip.com wrote:

[snip]

> Yes we have reserved commands, that originate from within the driver.
> We rely on the reply_map mechanism (from the original patch) to get interrupt vector for the reserved commands too.
> 
> 
> Thanks
> Sagar
> 
> 
> 
> 
> Also, there is this patch which addresses the concerns John Garry raised.
> https://urldefense.com/v3/__https://lore.kernel.org/lkml/20220929033428.25948-1-mj0123.lee@samsung.com/T/__;!!ACWV5N9M2RV99hQ!Jjh5jXadoTBK0R4UONFNOssLSRfzaOA9yV2ENIlArRzHEe6ylDxDEIwIzs9nzUOkLmgVC-B2Nfd_sjeho995VACy5O0qoA$  
> 
> This patch explains how the coordination happens when a CPU goes offline.
> IPI can be read InterProcessor Interrupt.
> The request shall be completed from the CPU where it is running when the original CPU goes offline.
> 
> Thanks
> Sagar

Can you please use standard mailing list practices in your replies, i.e. 
quote original mail in the reply here? I don't know what was added to 
the thread in this latest reply.

Thanks,
John
Sagar Biradar April 14, 2023, 8:07 p.m. UTC | #6
-----Original Message-----
From: John Garry <john.g.garry@oracle.com> 
Sent: Wednesday, April 12, 2023 2:38 AM
To: Sagar Biradar - C34249 <Sagar.Biradar@microchip.com>; Don Brace - C33706 <Don.Brace@microchip.com>; Gilbert Wu - C33504 <Gilbert.Wu@microchip.com>; linux-scsi@vger.kernel.org; martin.petersen@oracle.com; jejb@linux.ibm.com; brking@linux.vnet.ibm.com; stable@vger.kernel.org; Tom White - C33503 <Tom.White@microchip.com>
Subject: Re: [PATCH] aacraid: reply queue mapping to CPUs based of IRQ affinity

EXTERNAL EMAIL: Do not click links or open attachments unless you know the content is safe

On 10/04/2023 22:17, Sagar.Biradar@microchip.com wrote:
> On 28/03/2023 22:41, Sagar Biradar wrote:
>> Fix the IO hang that arises because of MSIx vector not having a 
>> mapped online CPU upon receiving completion.
> What about if the CPU targeted goes offline while the IO is in-flight?
>
>> This patch sets up a reply queue mapping to CPUs based on the IRQ 
>> affinity retrieved using pci_irq_get_affinity() API.
>>
> blk-mq already does what you want here, including handling for the case I mention above. It maintains a CPU -> HW queue mapping, and using a reply map in the LLD is the old way of doing this.
>
> Could you instead follow the example in commit 664f0dce2058 ("scsi:
> mpt3sas: Add support for shared host tagset for CPU hotplug"), and expose the HW queues to the upper layer? You can alternatively check the example of any SCSI driver which sets shost->host_tagset for this.
>
> Thanks,
> John
> [Sagar Biradar]
>
> ***What about if the CPU targeted goes offline while the IO is in-flight?
> We ran multiple random cases with the IO's running in parallel and disabling load-bearing CPU's. We saw that the load was transferred to the other online CPUs successfully every time.
> The same was tested at vendor and their customer site - they did not see any issues too.

You need to ensure that all CPUs associated with the HW queue are offline and stay offline until any IO may timeout, which would be 30 seconds according to SCSI sd default timeout. I am not sure if you were doing that exactly. 

[Sagar Biradar] 
Responding again inline to the original thread to avoid confusion . . . 

We disabled 14 out of 16 CPUs and each time we saw the interrupts migrated to the other CPUs.
The CPUs remained offline for varying times, each of which were more than 30 seconds.
We monitored proper behavior of the threads running on CPUs and observed them migrating to other CPUs as they were disabled.
We, along with the vendor/customer, did not observe any command timeouts in these experiments. 
In case any commands time out - the driver will resort to the error handling mechanism.

Also, there is this patch which addresses the concerns John Garry raised.
https://lore.kernel.org/lkml/20220929033428.25948-1-mj0123.lee@samsung.com/T/

This patch explains how the coordination happens when a CPU goes offline.
IPI can be read as Inter-Processor Interrupt.
The request shall be completed from the CPU where it is running when the original CPU goes offline.

>
>
> ***blk-mq already does what you want here, including handling for the case I mention above. It maintains a CPU -> HW queue mapping, and using a reply map in the LLD is the old way of doing this.
> We also tried implementing the blk-mq mechanism in the driver and we saw command timeouts.
> The firmware has limitation of fixed number of queues per vector and the blk-mq changes would saturate that limit.
> That answers the possible command timeout.
>
> Also this is EOL product and there will be no firmware code changes. Given this, we have decided to stick to the reply_map mechanism.
> (https://urldefense.com/v3/__https://storage.microsemi.com/en-us/suppo
> rt/series8/index.php__;!!ACWV5N9M2RV99hQ!PLrbfoEBvEGxw2CvahCL0AP5c4f5c
> Q8gT0ahXVgB0mSbyqxWJ8pdtYY0JwRL8xZ59k0NHJhXCBbMtVWlq5pYMeOEHmw7ww$  )
>
> Thank you for your review comments and we hope you will reconsider the original patch.

I've been checking the driver a bit more and this drivers uses some "reserved" commands, right? That would be internal commands which the driver sends to the adapter which does not have a scsi_cmnd associated.
If so, it gets a bit more tricky to use blk-mq support for HW queues, as we need to manually find a HW queue for those "reserved commands", like
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/scsi/hisi_sas/hisi_sas_main.c?h=v6.3-rc6#n532

Anyway, it's not up to me ... 

[Sagar Biradar] 
Yes, we have reserved commands, that originate from within the driver.
We rely on the reply_map mechanism (from the original patch) to get interrupt vector for the reserved commands too.

Thanks,
John
Sagar Biradar April 18, 2023, 4:36 p.m. UTC | #7
-----Original Message-----
From: Sagar Biradar - C34249 
Sent: Friday, April 14, 2023 1:07 PM
To: John Garry <john.g.garry@oracle.com>; Don Brace - C33706 <Don.Brace@microchip.com>; Gilbert Wu - C33504 <Gilbert.Wu@microchip.com>; linux-scsi@vger.kernel.org; martin.petersen@oracle.com; jejb@linux.ibm.com; brking@linux.vnet.ibm.com; stable@vger.kernel.org; Tom White - C33503 <Tom.White@microchip.com>
Subject: RE: [PATCH] aacraid: reply queue mapping to CPUs based of IRQ affinity

[Sagar Biradar] Just checking if anyone get a chance to review this patch?
Thanks, in advance.


-----Original Message-----
From: John Garry <john.g.garry@oracle.com>
Sent: Wednesday, April 12, 2023 2:38 AM
To: Sagar Biradar - C34249 <Sagar.Biradar@microchip.com>; Don Brace - C33706 <Don.Brace@microchip.com>; Gilbert Wu - C33504 <Gilbert.Wu@microchip.com>; linux-scsi@vger.kernel.org; martin.petersen@oracle.com; jejb@linux.ibm.com; brking@linux.vnet.ibm.com; stable@vger.kernel.org; Tom White - C33503 <Tom.White@microchip.com>
Subject: Re: [PATCH] aacraid: reply queue mapping to CPUs based of IRQ affinity

EXTERNAL EMAIL: Do not click links or open attachments unless you know the content is safe

On 10/04/2023 22:17, Sagar.Biradar@microchip.com wrote:
> On 28/03/2023 22:41, Sagar Biradar wrote:
>> Fix the IO hang that arises because of MSIx vector not having a 
>> mapped online CPU upon receiving completion.
> What about if the CPU targeted goes offline while the IO is in-flight?
>
>> This patch sets up a reply queue mapping to CPUs based on the IRQ 
>> affinity retrieved using pci_irq_get_affinity() API.
>>
> blk-mq already does what you want here, including handling for the case I mention above. It maintains a CPU -> HW queue mapping, and using a reply map in the LLD is the old way of doing this.
>
> Could you instead follow the example in commit 664f0dce2058 ("scsi:
> mpt3sas: Add support for shared host tagset for CPU hotplug"), and expose the HW queues to the upper layer? You can alternatively check the example of any SCSI driver which sets shost->host_tagset for this.
>
> Thanks,
> John
> [Sagar Biradar]
>
> ***What about if the CPU targeted goes offline while the IO is in-flight?
> We ran multiple random cases with the IO's running in parallel and disabling load-bearing CPU's. We saw that the load was transferred to the other online CPUs successfully every time.
> The same was tested at vendor and their customer site - they did not see any issues too.

You need to ensure that all CPUs associated with the HW queue are offline and stay offline until any IO may timeout, which would be 30 seconds according to SCSI sd default timeout. I am not sure if you were doing that exactly. 

[Sagar Biradar]
Responding again inline to the original thread to avoid confusion . . . 

We disabled 14 out of 16 CPUs and each time we saw the interrupts migrated to the other CPUs.
The CPUs remained offline for varying times, each of which were more than 30 seconds.
We monitored proper behavior of the threads running on CPUs and observed them migrating to other CPUs as they were disabled.
We, along with the vendor/customer, did not observe any command timeouts in these experiments. 
In case any commands time out - the driver will resort to the error handling mechanism.

Also, there is this patch which addresses the concerns John Garry raised.
https://lore.kernel.org/lkml/20220929033428.25948-1-mj0123.lee@samsung.com/T/

This patch explains how the coordination happens when a CPU goes offline.
IPI can be read as Inter-Processor Interrupt.
The request shall be completed from the CPU where it is running when the original CPU goes offline.

>
>
> ***blk-mq already does what you want here, including handling for the case I mention above. It maintains a CPU -> HW queue mapping, and using a reply map in the LLD is the old way of doing this.
> We also tried implementing the blk-mq mechanism in the driver and we saw command timeouts.
> The firmware has limitation of fixed number of queues per vector and the blk-mq changes would saturate that limit.
> That answers the possible command timeout.
>
> Also this is EOL product and there will be no firmware code changes. Given this, we have decided to stick to the reply_map mechanism.
> (https://urldefense.com/v3/__https://storage.microsemi.com/en-us/suppo
> rt/series8/index.php__;!!ACWV5N9M2RV99hQ!PLrbfoEBvEGxw2CvahCL0AP5c4f5c
> Q8gT0ahXVgB0mSbyqxWJ8pdtYY0JwRL8xZ59k0NHJhXCBbMtVWlq5pYMeOEHmw7ww$  )
>
> Thank you for your review comments and we hope you will reconsider the original patch.

I've been checking the driver a bit more and this drivers uses some "reserved" commands, right? That would be internal commands which the driver sends to the adapter which does not have a scsi_cmnd associated.
If so, it gets a bit more tricky to use blk-mq support for HW queues, as we need to manually find a HW queue for those "reserved commands", like
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/scsi/hisi_sas/hisi_sas_main.c?h=v6.3-rc6#n532

Anyway, it's not up to me ... 

[Sagar Biradar]
Yes, we have reserved commands, that originate from within the driver.
We rely on the reply_map mechanism (from the original patch) to get interrupt vector for the reserved commands too.

Thanks,
John
Sagar Biradar April 18, 2023, 11:55 p.m. UTC | #8
-----Original Message-----
From: James Bottomley <jejb@linux.ibm.com> 
Sent: Tuesday, April 18, 2023 10:13 AM
To: Sagar Biradar - C34249 <Sagar.Biradar@microchip.com>; john.g.garry@oracle.com; Don Brace - C33706 <Don.Brace@microchip.com>; Gilbert Wu - C33504 <Gilbert.Wu@microchip.com>; linux-scsi@vger.kernel.org; martin.petersen@oracle.com; brking@linux.vnet.ibm.com; stable@vger.kernel.org; Tom White - C33503 <Tom.White@microchip.com>
Subject: Re: [PATCH] aacraid: reply queue mapping to CPUs based of IRQ affinity

EXTERNAL EMAIL: Do not click links or open attachments unless you know the content is safe

[I'm with Jon: your email style makes digging information out of the emails very hard, which is why I only quote this section] On Mon, 2023-04-10 at 21:17 +0000, Sagar.Biradar@microchip.com wrote:
> ***blk-mq already does what you want here, including handling for the 
> case I mention above. It maintains a CPU -> HW queue mapping, and 
> using a reply map in the LLD is the old way of doing this.
>

> We also tried implementing the blk-mq mechanism in the driver and we 
> saw command timeouts.
> The firmware has limitation of fixed number of queues per vector and 
> the blk-mq changes would saturate that limit.
> That answers the possible command timeout.

Could we have more details on this, please?  The problem is that this is a very fragile area of the kernel, so you rolling your own special snowflake implementation in the driver is going to be an ongoing maintenance problem (and the fact that you need this at all indicates you have long tail customers who will be around for a while yet).  If the only issue is limiting the number of queues per vector, we can look at getting the block layer to do that.  Although I was under the impression that you can do it yourself with the ->map_queues() callback.  Can you say a bit about why this didn't work?

[Sagar Biradar] 
Thank you for your response.
We did venture trying into something like what you pointed to.
We mapped the hardware queues, and we still see command timeout. This change doesn’t work for us at this stage. 
Also, we observed that the load is not balanced across all the CPUs.
I am pasting the code snippets for better understanding.


During the probe, we assigned the hardware queues.
shost->nr_hw_queues = shost->can_queue; //inside aac_probe_one().

We also wrote a new routine "blk_mq_pci_map_queues" (mapped to .map_queues in scsi_host_template).
static void aac_map_queues(struct Scsi_Host *shost)
{
                struct aac_dev *aac = (struct aac_dev *)shost->hostdata;
                blk_mq_pci_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT],
                                                      aac->pdev, 0);
}

With the above changes, we see command timeouts in the firmware space and the commands never return to the driver.
This may need some changes in the firmware, but the firmware changes are restricted (since this is an EOL product).
Also, we saw that the load was entirely upon one CPU and it was not balanced across other CPUs.

We have had this reply_queue mechanism (patch) in our Out Of Box driver (OOB) for more than three years.
We(vendors/customers included) have not observed any issues.


James
John Garry April 19, 2023, 7:42 a.m. UTC | #9
On 19/04/2023 00:55, Sagar.Biradar@microchip.com wrote:
> [I'm with Jon: your email style makes digging information out of the emails very hard, which is why I only quote this section] On Mon, 2023-04-10 at 21:17 +0000,Sagar.Biradar@microchip.com  wrote:

I stopped replying as this wasn't fixed... and still isn't :((

>> ***blk-mq already does what you want here, including handling for the
>> case I mention above. It maintains a CPU -> HW queue mapping, and
>> using a reply map in the LLD is the old way of doing this.
>>
>> We also tried implementing the blk-mq mechanism in the driver and we
>> saw command timeouts.
>> The firmware has limitation of fixed number of queues per vector and
>> the blk-mq changes would saturate that limit.
>> That answers the possible command timeout.
> Could we have more details on this, please?  The problem is that this is a very fragile area of the kernel, so you rolling your own special snowflake implementation in the driver is going to be an ongoing maintenance problem (and the fact that you need this at all indicates you have long tail customers who will be around for a while yet).  If the only issue is limiting the number of queues per vector, we can look at getting the block layer to do that.  Although I was under the impression that you can do it yourself with the ->map_queues() callback.  Can you say a bit about why this didn't work?
> 
> [Sagar Biradar]
> Thank you for your response.
> We did venture trying into something like what you pointed to.
> We mapped the hardware queues, and we still see command timeout. This change doesn’t work for us at this stage.
> Also, we observed that the load is not balanced across all the CPUs.
> I am pasting the code snippets for better understanding.
> 
> 
> During the probe, we assigned the hardware queues.
> shost->nr_hw_queues = shost->can_queue; //inside aac_probe_one().

That is wrong - .can_queue is the number of IOs which the scsi host may 
be sent at any given time, while .nr_hw_queues should be the number of 
MSI(X) vectors returned from pci_alloc_irq_vectors()

You should also set shost->host_tagset = 1.

Since the driver has reserved commands, the .can_queue should be reduced 
by the amount of reserved commands and the driver needs to manually 
choose which HW queue to send those reserved commands on - see example 
in other driver here: 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/scsi/hisi_sas/hisi_sas_main.c?h=v6.3-rc7#n523

JFYI, We have tried to add reserved command support to SCSI ML, but it 
still has not been finished - see 
https://lore.kernel.org/linux-scsi/20211125151048.103910-1-hare@suse.de/

> 
> We also wrote a new routine "blk_mq_pci_map_queues" (mapped to .map_queues in scsi_host_template).
> static void aac_map_queues(struct Scsi_Host *shost)
> {
>                  struct aac_dev *aac = (struct aac_dev *)shost->hostdata;
>                  blk_mq_pci_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT],
>                                                        aac->pdev, 0);
> }

This looks ok.

> 
> With the above changes, we see command timeouts in the firmware space and the commands never return to the driver.
> This may need some changes in the firmware, but the firmware changes are restricted (since this is EOL product).
> Also, we saw that the load was entirely upon one CPU and it was not balanced across other CPUs.
> 
> We have had this reply_queue mechanism (patch) in our Out Of Box driver (OOB) for more than three years.
> We(vendors/customers included) have not observed any issues.
diff mbox series

Patch

diff --git a/drivers/scsi/aacraid/aacraid.h b/drivers/scsi/aacraid/aacraid.h
index 5e115e8b2ba4..4a23f9fab61f 100644
--- a/drivers/scsi/aacraid/aacraid.h
+++ b/drivers/scsi/aacraid/aacraid.h
@@ -1678,6 +1678,7 @@  struct aac_dev
 	u32			handle_pci_error;
 	bool			init_reset;
 	u8			soft_reset_support;
+	unsigned int *reply_map;
 };
 
 #define aac_adapter_interrupt(dev) \
diff --git a/drivers/scsi/aacraid/comminit.c b/drivers/scsi/aacraid/comminit.c
index bd99c5492b7d..6fc323844a31 100644
--- a/drivers/scsi/aacraid/comminit.c
+++ b/drivers/scsi/aacraid/comminit.c
@@ -33,6 +33,8 @@ 
 
 #include "aacraid.h"
 
+void aac_setup_reply_map(struct aac_dev *dev);
+
 struct aac_common aac_config = {
 	.irq_mod = 1
 };
@@ -630,6 +632,9 @@  struct aac_dev *aac_init_adapter(struct aac_dev *dev)
 
 	if (aac_is_src(dev))
 		aac_define_int_mode(dev);
+
+	aac_setup_reply_map(dev);
+
 	/*
 	 *	Ok now init the communication subsystem
 	 */
@@ -658,3 +663,23 @@  struct aac_dev *aac_init_adapter(struct aac_dev *dev)
 	return dev;
 }
 
+void aac_setup_reply_map(struct aac_dev *dev)
+{
+	const struct cpumask *mask;
+	unsigned int i, cpu = 1;
+
+	for (i = 1; i < dev->max_msix; i++) {
+		mask = pci_irq_get_affinity(dev->pdev, i);
+		if (!mask)
+			goto fallback;
+
+		for_each_cpu(cpu, mask) {
+			dev->reply_map[cpu] = i;
+		}
+	}
+	return;
+
+fallback:
+	for_each_possible_cpu(cpu)
+		dev->reply_map[cpu] = 0;
+}
diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c
index 5ba5c18b77b4..af60c7d26407 100644
--- a/drivers/scsi/aacraid/linit.c
+++ b/drivers/scsi/aacraid/linit.c
@@ -1668,6 +1668,14 @@  static int aac_probe_one(struct pci_dev *pdev, const struct pci_device_id *id)
 		goto out_free_host;
 	}
 
+	aac->reply_map = kzalloc(sizeof(unsigned int) * nr_cpu_ids,
+				GFP_KERNEL);
+	if (!aac->reply_map) {
+		error = -ENOMEM;
+		dev_err(&pdev->dev, "reply_map allocation failed\n");
+		goto out_free_host;
+	}
+
 	spin_lock_init(&aac->fib_lock);
 
 	mutex_init(&aac->ioctl_mutex);
@@ -1797,6 +1805,8 @@  static int aac_probe_one(struct pci_dev *pdev, const struct pci_device_id *id)
 				  aac->comm_addr, aac->comm_phys);
 	kfree(aac->queues);
 	aac_adapter_ioremap(aac, 0);
+	/* By now we should have configured the reply_map */
+	kfree(aac->reply_map);
 	kfree(aac->fibs);
 	kfree(aac->fsa_dev);
  out_free_host:
@@ -1918,6 +1928,7 @@  static void aac_remove_one(struct pci_dev *pdev)
 
 	aac_adapter_ioremap(aac, 0);
 
+	kfree(aac->reply_map);
 	kfree(aac->fibs);
 	kfree(aac->fsa_dev);
 
diff --git a/drivers/scsi/aacraid/src.c b/drivers/scsi/aacraid/src.c
index 11ef58204e96..e84ec60a655b 100644
--- a/drivers/scsi/aacraid/src.c
+++ b/drivers/scsi/aacraid/src.c
@@ -506,7 +506,7 @@  static int aac_src_deliver_message(struct fib *fib)
 			&& dev->sa_firmware)
 			vector_no = aac_get_vector(dev);
 		else
-			vector_no = fib->vector_no;
+			vector_no = dev->reply_map[raw_smp_processor_id()];
 
 		if (native_hba) {
 			if (fib->flags & FIB_CONTEXT_FLAG_NATIVE_HBA_TMF) {