diff mbox series

[v2,11/13] scsi: target: Treat CMD_T_FABRIC_STOP like CMD_T_STOP

Message ID 20230112030832.110143-12-michael.christie@oracle.com
State Superseded
Headers show
Series target task management fixes | expand

Commit Message

Mike Christie Jan. 12, 2023, 3:08 a.m. UTC
iscsit will set CMD_T_FABRIC_STOP on running commands when its transport
connection is down and it can't send/recv IO (tx/rx threads are killed
or the cleanup thread is run from the one thats up). It will then loop
over running commands and wait for LIO core to complete them or clean
them up if they were on an internal queue waiting to be sent or ackd.

Currently, CMD_T_FABRIC_STOP only stops TMRs from operating on the
command but for isert we need to prevent LIO core from calling into
iscsit callouts when the connection is being brought down. If LIO core
queues commands to iscsit and it ends up adding to an internal queue
instead of passing back to the driver then we can end up hanging waiting
on command completion that never occurs because it's stuck on the internal
list (the tx thread is stopped at this time, so it will never loop over
the response list and call into isert). We also want to sync up on a
point where we no longer call into isert so it can cleanup it's structs.

This has LIO core treat CMD_T_FABRIC_STOP like CMD_T_STOP during
command execution and also fixes the locking around the
target_cmd_interrupted calls so we don't have a case where a command
is marked CMD_T_COMPLETE and CMD_T_STOP|CMD_T_FABRIC_STOP at the same
time.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 drivers/target/target_core_sbc.c       |  2 +-
 drivers/target/target_core_transport.c | 27 +++++++++++++++-----------
 2 files changed, 17 insertions(+), 12 deletions(-)

Comments

Dmitry Bogdanov Jan. 13, 2023, 2:15 p.m. UTC | #1
On Wed, Jan 11, 2023 at 09:08:30PM -0600, Mike Christie wrote:
> 
> iscsit will set CMD_T_FABRIC_STOP on running commands when its transport
> connection is down and it can't send/recv IO (tx/rx threads are killed
> or the cleanup thread is run from the one thats up). It will then loop
> over running commands and wait for LIO core to complete them or clean
> them up if they were on an internal queue waiting to be sent or ackd.
> 
> Currently, CMD_T_FABRIC_STOP only stops TMRs from operating on the
> command but for isert we need to prevent LIO core from calling into
> iscsit callouts when the connection is being brought down. If LIO core
> queues commands to iscsit and it ends up adding to an internal queue
> instead of passing back to the driver then we can end up hanging waiting
> on command completion that never occurs because it's stuck on the internal
> list (the tx thread is stopped at this time, so it will never loop over
> the response list and call into isert). We also want to sync up on a
> point where we no longer call into isert so it can cleanup it's structs.
> 
> This has LIO core treat CMD_T_FABRIC_STOP like CMD_T_STOP during
> command execution and also fixes the locking around the
> target_cmd_interrupted calls so we don't have a case where a command
> is marked CMD_T_COMPLETE and CMD_T_STOP|CMD_T_FABRIC_STOP at the same
> time.
> 
> Signed-off-by: Mike Christie <michael.christie@oracle.com>
> ---
>  drivers/target/target_core_sbc.c       |  2 +-
>  drivers/target/target_core_transport.c | 27 +++++++++++++++-----------
>  2 files changed, 17 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/target/target_core_sbc.c b/drivers/target/target_core_sbc.c
> index 7536ca797606..56136613767f 100644
> --- a/drivers/target/target_core_sbc.c
> +++ b/drivers/target/target_core_sbc.c
> @@ -459,7 +459,7 @@ static sense_reason_t compare_and_write_callback(struct se_cmd *cmd, bool succes
>                  * we don't have to perform the write operation.
>                  */
>                 WARN_ON(!(cmd->transport_state &
> -                       (CMD_T_ABORTED | CMD_T_STOP)));
> +                       (CMD_T_ABORTED | CMD_T_STOP | CMD_T_FABRIC_STOP)));
>                 goto out;
>         }
>         /*
> diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
> index cb3fdc81ba3b..02a9476945dc 100644
> --- a/drivers/target/target_core_transport.c
> +++ b/drivers/target/target_core_transport.c
> @@ -737,8 +737,8 @@ static int transport_cmd_check_stop_to_fabric(struct se_cmd *cmd)
>          * Determine if frontend context caller is requesting the stopping of
>          * this command for frontend exceptions.
>          */
> -       if (cmd->transport_state & CMD_T_STOP) {
> -               pr_debug("%s:%d CMD_T_STOP for ITT: 0x%08llx\n",
> +       if (cmd->transport_state & (CMD_T_STOP | CMD_T_FABRIC_STOP)) {
> +               pr_debug("%s:%d CMD_T_STOP|CMD_T_FABRIC_STOP for ITT: 0x%08llx\n",
>                         __func__, __LINE__, cmd->tag);
> 
>                 spin_unlock_irqrestore(&cmd->t_state_lock, flags);
> @@ -889,7 +889,7 @@ static bool target_cmd_interrupted(struct se_cmd *cmd)
>                 INIT_WORK(&cmd->work, target_abort_work);
>                 queue_work(target_completion_wq, &cmd->work);
>                 return true;
> -       } else if (cmd->transport_state & CMD_T_STOP) {
> +       } else if (cmd->transport_state & (CMD_T_STOP | CMD_T_FABRIC_STOP)) {
>                 if (cmd->transport_complete_callback)
>                         cmd->transport_complete_callback(cmd, false, &post_ret);
>                 complete_all(&cmd->t_transport_stop_comp);
> @@ -907,13 +907,15 @@ void target_complete_cmd_with_sense(struct se_cmd *cmd, u8 scsi_status,
>         int success, cpu;
>         unsigned long flags;
> 
> -       if (target_cmd_interrupted(cmd))
> +       spin_lock_irqsave(&cmd->t_state_lock, flags);

That leads to a hardlock because
target_cmd_interrupted() => cmd->transport_complete_callback() also tooks
cmd->t_state_lock.

> +       if (target_cmd_interrupted(cmd)) {
> +               spin_unlock_irqrestore(&cmd->t_state_lock, flags);
>                 return;
> +       }
> 
>         cmd->scsi_status = scsi_status;
>         cmd->sense_reason = sense_reason;
> 
> -       spin_lock_irqsave(&cmd->t_state_lock, flags);
>         switch (cmd->scsi_status) {
>         case SAM_STAT_CHECK_CONDITION:
>                 if (cmd->se_cmd_flags & SCF_TRANSPORT_TASK_SENSE)
> @@ -2277,10 +2279,12 @@ void target_execute_cmd(struct se_cmd *cmd)
>          *
>          * If the received CDB has already been aborted stop processing it here.
>          */
> -       if (target_cmd_interrupted(cmd))
> +       spin_lock_irq(&cmd->t_state_lock);
> +       if (target_cmd_interrupted(cmd)) {
> +               spin_unlock_irq(&cmd->t_state_lock);
>                 return;
> +       }
> 
> -       spin_lock_irq(&cmd->t_state_lock);
>         cmd->t_state = TRANSPORT_PROCESSING;
>         cmd->transport_state |= CMD_T_ACTIVE | CMD_T_SENT;
>         spin_unlock_irq(&cmd->t_state_lock);
> @@ -2847,9 +2851,9 @@ transport_generic_new_cmd(struct se_cmd *cmd)
>          * Determine if frontend context caller is requesting the stopping of
>          * this command for frontend exceptions.
>          */
> -       if (cmd->transport_state & CMD_T_STOP &&
> +       if (cmd->transport_state & (CMD_T_STOP | CMD_T_FABRIC_STOP) &&
>             !cmd->se_tfo->write_pending_must_be_called) {
> -               pr_debug("%s:%d CMD_T_STOP for ITT: 0x%08llx\n",
> +               pr_debug("%s:%d CMD_T_STOP|CMD_T_FABRIC_STOPfor ITT: 0x%08llx\n",
>                          __func__, __LINE__, cmd->tag);
> 
>                 spin_unlock_irqrestore(&cmd->t_state_lock, flags);
> @@ -2880,11 +2884,12 @@ static void transport_write_pending_qf(struct se_cmd *cmd)
>         bool stop;
> 
>         spin_lock_irqsave(&cmd->t_state_lock, flags);
> -       stop = (cmd->transport_state & (CMD_T_STOP | CMD_T_ABORTED));
> +       stop = (cmd->transport_state &
> +               (CMD_T_STOP | CMD_T_FABRIC_STOP | CMD_T_ABORTED));
>         spin_unlock_irqrestore(&cmd->t_state_lock, flags);
> 
>         if (stop) {
> -               pr_debug("%s:%d CMD_T_STOP|CMD_T_ABORTED for ITT: 0x%08llx\n",
> +               pr_debug("%s:%d CMD_T_STOP|CMD_T_FABRIC_STOP|CMD_T_ABORTED for ITT: 0x%08llx\n",
>                         __func__, __LINE__, cmd->tag);
>                 complete_all(&cmd->t_transport_stop_comp);
>                 return;
> --
> 2.31.1
> 
>
Dmitry Bogdanov Jan. 17, 2023, 11:52 a.m. UTC | #2
On Fri, Jan 13, 2023 at 05:15:12PM +0300, Dmitry Bogdanov wrote:
> On Wed, Jan 11, 2023 at 09:08:30PM -0600, Mike Christie wrote:
> > 
> > iscsit will set CMD_T_FABRIC_STOP on running commands when its transport
> > connection is down and it can't send/recv IO (tx/rx threads are killed
> > or the cleanup thread is run from the one thats up). It will then loop
> > over running commands and wait for LIO core to complete them or clean
> > them up if they were on an internal queue waiting to be sent or ackd.
> > 
> > Currently, CMD_T_FABRIC_STOP only stops TMRs from operating on the
> > command but for isert we need to prevent LIO core from calling into
> > iscsit callouts when the connection is being brought down. If LIO core
> > queues commands to iscsit and it ends up adding to an internal queue
> > instead of passing back to the driver then we can end up hanging waiting
> > on command completion that never occurs because it's stuck on the internal
> > list (the tx thread is stopped at this time, so it will never loop over
> > the response list and call into isert). We also want to sync up on a
> > point where we no longer call into isert so it can cleanup it's structs.
> > 
> > This has LIO core treat CMD_T_FABRIC_STOP like CMD_T_STOP during
> > command execution and also fixes the locking around the
> > target_cmd_interrupted calls so we don't have a case where a command
> > is marked CMD_T_COMPLETE and CMD_T_STOP|CMD_T_FABRIC_STOP at the same
> > time.
> > 
> > Signed-off-by: Mike Christie <michael.christie@oracle.com>
> > ---
> >  drivers/target/target_core_sbc.c       |  2 +-
> >  drivers/target/target_core_transport.c | 27 +++++++++++++++-----------
> >  2 files changed, 17 insertions(+), 12 deletions(-)
> > 
> > diff --git a/drivers/target/target_core_sbc.c b/drivers/target/target_core_sbc.c
> > index 7536ca797606..56136613767f 100644
> > --- a/drivers/target/target_core_sbc.c
> > +++ b/drivers/target/target_core_sbc.c
> > @@ -459,7 +459,7 @@ static sense_reason_t compare_and_write_callback(struct se_cmd *cmd, bool succes
> >                  * we don't have to perform the write operation.
> >                  */
> >                 WARN_ON(!(cmd->transport_state &
> > -                       (CMD_T_ABORTED | CMD_T_STOP)));
> > +                       (CMD_T_ABORTED | CMD_T_STOP | CMD_T_FABRIC_STOP)));
> >                 goto out;
> >         }
> >         /*
> > diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
> > index cb3fdc81ba3b..02a9476945dc 100644
> > --- a/drivers/target/target_core_transport.c
> > +++ b/drivers/target/target_core_transport.c
> > @@ -737,8 +737,8 @@ static int transport_cmd_check_stop_to_fabric(struct se_cmd *cmd)
> >          * Determine if frontend context caller is requesting the stopping of
> >          * this command for frontend exceptions.
> >          */
> > -       if (cmd->transport_state & CMD_T_STOP) {
> > -               pr_debug("%s:%d CMD_T_STOP for ITT: 0x%08llx\n",
> > +       if (cmd->transport_state & (CMD_T_STOP | CMD_T_FABRIC_STOP)) {
> > +               pr_debug("%s:%d CMD_T_STOP|CMD_T_FABRIC_STOP for ITT: 0x%08llx\n",
> >                         __func__, __LINE__, cmd->tag);
> > 
> >                 spin_unlock_irqrestore(&cmd->t_state_lock, flags);
> > @@ -889,7 +889,7 @@ static bool target_cmd_interrupted(struct se_cmd *cmd)
> >                 INIT_WORK(&cmd->work, target_abort_work);
> >                 queue_work(target_completion_wq, &cmd->work);
> >                 return true;
> > -       } else if (cmd->transport_state & CMD_T_STOP) {
> > +       } else if (cmd->transport_state & (CMD_T_STOP | CMD_T_FABRIC_STOP)) {
> >                 if (cmd->transport_complete_callback)
> >                         cmd->transport_complete_callback(cmd, false, &post_ret);
> >                 complete_all(&cmd->t_transport_stop_comp);
> > @@ -907,13 +907,15 @@ void target_complete_cmd_with_sense(struct se_cmd *cmd, u8 scsi_status,
> >         int success, cpu;
> >         unsigned long flags;
> > 
> > -       if (target_cmd_interrupted(cmd))
> > +       spin_lock_irqsave(&cmd->t_state_lock, flags);
> 
> That leads to a hardlock because
> target_cmd_interrupted() => cmd->transport_complete_callback() also tooks
> cmd->t_state_lock.
But the taking the lock for read+write of cmd->t*_state is absolutelly right!
It would be great if you manage to move transport_complete_callback()
into other thread/job.

> > +       if (target_cmd_interrupted(cmd)) {
> > +               spin_unlock_irqrestore(&cmd->t_state_lock, flags);
> >                 return;
> > +       }
> > 
> >         cmd->scsi_status = scsi_status;
> >         cmd->sense_reason = sense_reason;
> > 
> > -       spin_lock_irqsave(&cmd->t_state_lock, flags);
> >         switch (cmd->scsi_status) {
> >         case SAM_STAT_CHECK_CONDITION:
> >                 if (cmd->se_cmd_flags & SCF_TRANSPORT_TASK_SENSE)
Mike Christie Jan. 17, 2023, 6:45 p.m. UTC | #3
On 1/17/23 05:52, Dmitry Bogdanov wrote:
> On Fri, Jan 13, 2023 at 05:15:12PM +0300, Dmitry Bogdanov wrote:
>> On Wed, Jan 11, 2023 at 09:08:30PM -0600, Mike Christie wrote:
>>>
>>> iscsit will set CMD_T_FABRIC_STOP on running commands when its transport
>>> connection is down and it can't send/recv IO (tx/rx threads are killed
>>> or the cleanup thread is run from the one thats up). It will then loop
>>> over running commands and wait for LIO core to complete them or clean
>>> them up if they were on an internal queue waiting to be sent or ackd.
>>>
>>> Currently, CMD_T_FABRIC_STOP only stops TMRs from operating on the
>>> command but for isert we need to prevent LIO core from calling into
>>> iscsit callouts when the connection is being brought down. If LIO core
>>> queues commands to iscsit and it ends up adding to an internal queue
>>> instead of passing back to the driver then we can end up hanging waiting
>>> on command completion that never occurs because it's stuck on the internal
>>> list (the tx thread is stopped at this time, so it will never loop over
>>> the response list and call into isert). We also want to sync up on a
>>> point where we no longer call into isert so it can cleanup it's structs.
>>>
>>> This has LIO core treat CMD_T_FABRIC_STOP like CMD_T_STOP during
>>> command execution and also fixes the locking around the
>>> target_cmd_interrupted calls so we don't have a case where a command
>>> is marked CMD_T_COMPLETE and CMD_T_STOP|CMD_T_FABRIC_STOP at the same
>>> time.
>>>
>>> Signed-off-by: Mike Christie <michael.christie@oracle.com>
>>> ---
>>>  drivers/target/target_core_sbc.c       |  2 +-
>>>  drivers/target/target_core_transport.c | 27 +++++++++++++++-----------
>>>  2 files changed, 17 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/drivers/target/target_core_sbc.c b/drivers/target/target_core_sbc.c
>>> index 7536ca797606..56136613767f 100644
>>> --- a/drivers/target/target_core_sbc.c
>>> +++ b/drivers/target/target_core_sbc.c
>>> @@ -459,7 +459,7 @@ static sense_reason_t compare_and_write_callback(struct se_cmd *cmd, bool succes
>>>                  * we don't have to perform the write operation.
>>>                  */
>>>                 WARN_ON(!(cmd->transport_state &
>>> -                       (CMD_T_ABORTED | CMD_T_STOP)));
>>> +                       (CMD_T_ABORTED | CMD_T_STOP | CMD_T_FABRIC_STOP)));
>>>                 goto out;
>>>         }
>>>         /*
>>> diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
>>> index cb3fdc81ba3b..02a9476945dc 100644
>>> --- a/drivers/target/target_core_transport.c
>>> +++ b/drivers/target/target_core_transport.c
>>> @@ -737,8 +737,8 @@ static int transport_cmd_check_stop_to_fabric(struct se_cmd *cmd)
>>>          * Determine if frontend context caller is requesting the stopping of
>>>          * this command for frontend exceptions.
>>>          */
>>> -       if (cmd->transport_state & CMD_T_STOP) {
>>> -               pr_debug("%s:%d CMD_T_STOP for ITT: 0x%08llx\n",
>>> +       if (cmd->transport_state & (CMD_T_STOP | CMD_T_FABRIC_STOP)) {
>>> +               pr_debug("%s:%d CMD_T_STOP|CMD_T_FABRIC_STOP for ITT: 0x%08llx\n",
>>>                         __func__, __LINE__, cmd->tag);
>>>
>>>                 spin_unlock_irqrestore(&cmd->t_state_lock, flags);
>>> @@ -889,7 +889,7 @@ static bool target_cmd_interrupted(struct se_cmd *cmd)
>>>                 INIT_WORK(&cmd->work, target_abort_work);
>>>                 queue_work(target_completion_wq, &cmd->work);
>>>                 return true;
>>> -       } else if (cmd->transport_state & CMD_T_STOP) {
>>> +       } else if (cmd->transport_state & (CMD_T_STOP | CMD_T_FABRIC_STOP)) {
>>>                 if (cmd->transport_complete_callback)
>>>                         cmd->transport_complete_callback(cmd, false, &post_ret);
>>>                 complete_all(&cmd->t_transport_stop_comp);
>>> @@ -907,13 +907,15 @@ void target_complete_cmd_with_sense(struct se_cmd *cmd, u8 scsi_status,
>>>         int success, cpu;
>>>         unsigned long flags;
>>>
>>> -       if (target_cmd_interrupted(cmd))
>>> +       spin_lock_irqsave(&cmd->t_state_lock, flags);
>>
>> That leads to a hardlock because
>> target_cmd_interrupted() => cmd->transport_complete_callback() also tooks
>> cmd->t_state_lock.

We shouldn't lock up because for the failure cases the transport_complete_callback
functions don't take the lock. They just cleanup and return.

> But the taking the lock for read+write of cmd->t*_state is absolutelly right!
> It would be great if you manage to move transport_complete_callback()
> into other thread/job.

I'm not sure why we want to do that since none of the transport_complete_callback
sleep on failure. It seems to just add more complexity and we only have the 2
transport_complete_callback uses now, so it seems overkill.
Dmitry Bogdanov Jan. 17, 2023, 7:55 p.m. UTC | #4
On Tue, Jan 17, 2023 at 12:45:56PM -0600, Mike Christie wrote:
> 
> On 1/17/23 05:52, Dmitry Bogdanov wrote:
> > On Fri, Jan 13, 2023 at 05:15:12PM +0300, Dmitry Bogdanov wrote:
> >> On Wed, Jan 11, 2023 at 09:08:30PM -0600, Mike Christie wrote:
> >>>
> >>> iscsit will set CMD_T_FABRIC_STOP on running commands when its transport
> >>> connection is down and it can't send/recv IO (tx/rx threads are killed
> >>> or the cleanup thread is run from the one thats up). It will then loop
> >>> over running commands and wait for LIO core to complete them or clean
> >>> them up if they were on an internal queue waiting to be sent or ackd.
> >>>
> >>> Currently, CMD_T_FABRIC_STOP only stops TMRs from operating on the
> >>> command but for isert we need to prevent LIO core from calling into
> >>> iscsit callouts when the connection is being brought down. If LIO core
> >>> queues commands to iscsit and it ends up adding to an internal queue
> >>> instead of passing back to the driver then we can end up hanging waiting
> >>> on command completion that never occurs because it's stuck on the internal
> >>> list (the tx thread is stopped at this time, so it will never loop over
> >>> the response list and call into isert). We also want to sync up on a
> >>> point where we no longer call into isert so it can cleanup it's structs.
> >>>
> >>> This has LIO core treat CMD_T_FABRIC_STOP like CMD_T_STOP during
> >>> command execution and also fixes the locking around the
> >>> target_cmd_interrupted calls so we don't have a case where a command
> >>> is marked CMD_T_COMPLETE and CMD_T_STOP|CMD_T_FABRIC_STOP at the same
> >>> time.
> >>>
> >>> Signed-off-by: Mike Christie <michael.christie@oracle.com>
> >>> ---
> >>>  drivers/target/target_core_sbc.c       |  2 +-
> >>>  drivers/target/target_core_transport.c | 27 +++++++++++++++-----------
> >>>  2 files changed, 17 insertions(+), 12 deletions(-)
> >>>
> >>> diff --git a/drivers/target/target_core_sbc.c b/drivers/target/target_core_sbc.c
> >>> index 7536ca797606..56136613767f 100644
> >>> --- a/drivers/target/target_core_sbc.c
> >>> +++ b/drivers/target/target_core_sbc.c
> >>> @@ -459,7 +459,7 @@ static sense_reason_t compare_and_write_callback(struct se_cmd *cmd, bool succes
> >>>                  * we don't have to perform the write operation.
> >>>                  */
> >>>                 WARN_ON(!(cmd->transport_state &
> >>> -                       (CMD_T_ABORTED | CMD_T_STOP)));
> >>> +                       (CMD_T_ABORTED | CMD_T_STOP | CMD_T_FABRIC_STOP)));
> >>>                 goto out;
> >>>         }
> >>>         /*
> >>> diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
> >>> index cb3fdc81ba3b..02a9476945dc 100644
> >>> --- a/drivers/target/target_core_transport.c
> >>> +++ b/drivers/target/target_core_transport.c
> >>> @@ -737,8 +737,8 @@ static int transport_cmd_check_stop_to_fabric(struct se_cmd *cmd)
> >>>          * Determine if frontend context caller is requesting the stopping of
> >>>          * this command for frontend exceptions.
> >>>          */
> >>> -       if (cmd->transport_state & CMD_T_STOP) {
> >>> -               pr_debug("%s:%d CMD_T_STOP for ITT: 0x%08llx\n",
> >>> +       if (cmd->transport_state & (CMD_T_STOP | CMD_T_FABRIC_STOP)) {
> >>> +               pr_debug("%s:%d CMD_T_STOP|CMD_T_FABRIC_STOP for ITT: 0x%08llx\n",
> >>>                         __func__, __LINE__, cmd->tag);
> >>>
> >>>                 spin_unlock_irqrestore(&cmd->t_state_lock, flags);
> >>> @@ -889,7 +889,7 @@ static bool target_cmd_interrupted(struct se_cmd *cmd)
> >>>                 INIT_WORK(&cmd->work, target_abort_work);
> >>>                 queue_work(target_completion_wq, &cmd->work);
> >>>                 return true;
> >>> -       } else if (cmd->transport_state & CMD_T_STOP) {
> >>> +       } else if (cmd->transport_state & (CMD_T_STOP | CMD_T_FABRIC_STOP)) {
> >>>                 if (cmd->transport_complete_callback)
> >>>                         cmd->transport_complete_callback(cmd, false, &post_ret);
> >>>                 complete_all(&cmd->t_transport_stop_comp);
> >>> @@ -907,13 +907,15 @@ void target_complete_cmd_with_sense(struct se_cmd *cmd, u8 scsi_status,
> >>>         int success, cpu;
> >>>         unsigned long flags;
> >>>
> >>> -       if (target_cmd_interrupted(cmd))
> >>> +       spin_lock_irqsave(&cmd->t_state_lock, flags);
> >>
> >> That leads to a hardlock because
> >> target_cmd_interrupted() => cmd->transport_complete_callback() also tooks
> >> cmd->t_state_lock.
> 
> We shouldn't lock up because for the failure cases the transport_complete_callback
> functions don't take the lock. They just cleanup and return.

The second callback (compare_and_write_post) does take the lock always.
Although to be honest, that lock there has no sense.

> > But the taking the lock for read+write of cmd->t*_state is absolutelly right!
> > It would be great if you manage to move transport_complete_callback()
> > into other thread/job.
> 
> I'm not sure why we want to do that since none of the transport_complete_callback
> sleep on failure. It seems to just add more complexity and we only have the 2
> transport_complete_callback uses now, so it seems overkill.

I have a personal interest on that :) Of course, it is just one of the
ways to solve it.

BR,
 Dmitry
Mike Christie Jan. 18, 2023, 7:12 p.m. UTC | #5
On 1/17/23 13:55, Dmitry Bogdanov wrote
>>>>> -       if (target_cmd_interrupted(cmd))
>>>>> +       spin_lock_irqsave(&cmd->t_state_lock, flags);
>>>>
>>>> That leads to a hardlock because
>>>> target_cmd_interrupted() => cmd->transport_complete_callback() also tooks
>>>> cmd->t_state_lock.
>>
>> We shouldn't lock up because for the failure cases the transport_complete_callback
>> functions don't take the lock. They just cleanup and return.
> 
> The second callback (compare_and_write_post) does take the lock always.
> Although to be honest, that lock there has no sense.

You're right. I messed up. I removed it in my tree in a prep patch
but never posted it.

> 
>>> But the taking the lock for read+write of cmd->t*_state is absolutelly right!
>>> It would be great if you manage to move transport_complete_callback()
>>> into other thread/job.
>>
>> I'm not sure why we want to do that since none of the transport_complete_callback
>> sleep on failure. It seems to just add more complexity and we only have the 2
>> transport_complete_callback uses now, so it seems overkill.
> 
> I have a personal interest on that :) Of course, it is just one of the
> ways to solve it.

Ah ok. I'm going to do the more standard process and code for the what we
have right now since we don't know when your code will be pushed upstream.
It's also the more simple approach so we can always change it and make it more
complicated later on. We just really have to get these regressions fixed.
diff mbox series

Patch

diff --git a/drivers/target/target_core_sbc.c b/drivers/target/target_core_sbc.c
index 7536ca797606..56136613767f 100644
--- a/drivers/target/target_core_sbc.c
+++ b/drivers/target/target_core_sbc.c
@@ -459,7 +459,7 @@  static sense_reason_t compare_and_write_callback(struct se_cmd *cmd, bool succes
 		 * we don't have to perform the write operation.
 		 */
 		WARN_ON(!(cmd->transport_state &
-			(CMD_T_ABORTED | CMD_T_STOP)));
+			(CMD_T_ABORTED | CMD_T_STOP | CMD_T_FABRIC_STOP)));
 		goto out;
 	}
 	/*
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index cb3fdc81ba3b..02a9476945dc 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -737,8 +737,8 @@  static int transport_cmd_check_stop_to_fabric(struct se_cmd *cmd)
 	 * Determine if frontend context caller is requesting the stopping of
 	 * this command for frontend exceptions.
 	 */
-	if (cmd->transport_state & CMD_T_STOP) {
-		pr_debug("%s:%d CMD_T_STOP for ITT: 0x%08llx\n",
+	if (cmd->transport_state & (CMD_T_STOP | CMD_T_FABRIC_STOP)) {
+		pr_debug("%s:%d CMD_T_STOP|CMD_T_FABRIC_STOP for ITT: 0x%08llx\n",
 			__func__, __LINE__, cmd->tag);
 
 		spin_unlock_irqrestore(&cmd->t_state_lock, flags);
@@ -889,7 +889,7 @@  static bool target_cmd_interrupted(struct se_cmd *cmd)
 		INIT_WORK(&cmd->work, target_abort_work);
 		queue_work(target_completion_wq, &cmd->work);
 		return true;
-	} else if (cmd->transport_state & CMD_T_STOP) {
+	} else if (cmd->transport_state & (CMD_T_STOP | CMD_T_FABRIC_STOP)) {
 		if (cmd->transport_complete_callback)
 			cmd->transport_complete_callback(cmd, false, &post_ret);
 		complete_all(&cmd->t_transport_stop_comp);
@@ -907,13 +907,15 @@  void target_complete_cmd_with_sense(struct se_cmd *cmd, u8 scsi_status,
 	int success, cpu;
 	unsigned long flags;
 
-	if (target_cmd_interrupted(cmd))
+	spin_lock_irqsave(&cmd->t_state_lock, flags);
+	if (target_cmd_interrupted(cmd)) {
+		spin_unlock_irqrestore(&cmd->t_state_lock, flags);
 		return;
+	}
 
 	cmd->scsi_status = scsi_status;
 	cmd->sense_reason = sense_reason;
 
-	spin_lock_irqsave(&cmd->t_state_lock, flags);
 	switch (cmd->scsi_status) {
 	case SAM_STAT_CHECK_CONDITION:
 		if (cmd->se_cmd_flags & SCF_TRANSPORT_TASK_SENSE)
@@ -2277,10 +2279,12 @@  void target_execute_cmd(struct se_cmd *cmd)
 	 *
 	 * If the received CDB has already been aborted stop processing it here.
 	 */
-	if (target_cmd_interrupted(cmd))
+	spin_lock_irq(&cmd->t_state_lock);
+	if (target_cmd_interrupted(cmd)) {
+		spin_unlock_irq(&cmd->t_state_lock);
 		return;
+	}
 
-	spin_lock_irq(&cmd->t_state_lock);
 	cmd->t_state = TRANSPORT_PROCESSING;
 	cmd->transport_state |= CMD_T_ACTIVE | CMD_T_SENT;
 	spin_unlock_irq(&cmd->t_state_lock);
@@ -2847,9 +2851,9 @@  transport_generic_new_cmd(struct se_cmd *cmd)
 	 * Determine if frontend context caller is requesting the stopping of
 	 * this command for frontend exceptions.
 	 */
-	if (cmd->transport_state & CMD_T_STOP &&
+	if (cmd->transport_state & (CMD_T_STOP | CMD_T_FABRIC_STOP) &&
 	    !cmd->se_tfo->write_pending_must_be_called) {
-		pr_debug("%s:%d CMD_T_STOP for ITT: 0x%08llx\n",
+		pr_debug("%s:%d CMD_T_STOP|CMD_T_FABRIC_STOPfor ITT: 0x%08llx\n",
 			 __func__, __LINE__, cmd->tag);
 
 		spin_unlock_irqrestore(&cmd->t_state_lock, flags);
@@ -2880,11 +2884,12 @@  static void transport_write_pending_qf(struct se_cmd *cmd)
 	bool stop;
 
 	spin_lock_irqsave(&cmd->t_state_lock, flags);
-	stop = (cmd->transport_state & (CMD_T_STOP | CMD_T_ABORTED));
+	stop = (cmd->transport_state &
+		(CMD_T_STOP | CMD_T_FABRIC_STOP | CMD_T_ABORTED));
 	spin_unlock_irqrestore(&cmd->t_state_lock, flags);
 
 	if (stop) {
-		pr_debug("%s:%d CMD_T_STOP|CMD_T_ABORTED for ITT: 0x%08llx\n",
+		pr_debug("%s:%d CMD_T_STOP|CMD_T_FABRIC_STOP|CMD_T_ABORTED for ITT: 0x%08llx\n",
 			__func__, __LINE__, cmd->tag);
 		complete_all(&cmd->t_transport_stop_comp);
 		return;