Message ID | 20240503195606.13120-1-mwilck@suse.com |
---|---|
State | New |
Headers | show |
Series | [v2] I/O errors for ALUA state transitions | expand |
> -static enum scsi_disposition alua_check_sense(struct scsi_device *sdev, > - struct scsi_sense_hdr *sense_hdr) > +static enum scsi_disposition alua_handle_state_transition(struct scsi_device *sdev) > { > struct alua_dh_data *h = sdev->handler_data; > struct alua_port_group *pg; > > + /* > + * LUN Not Accessible - ALUA state transition > + */ > + rcu_read_lock(); > + pg = rcu_dereference(h->pg); > + if (pg) > + pg->state = SCSI_ACCESS_STATE_TRANSITIONING; > + rcu_read_unlock(); > + alua_check(sdev, false); > + return NEEDS_RETRY; This always returns NEEDS_RETRY, so you can drop the return value entirely and handle this in the callers.
On 5/3/24 2:56 PM, Martin Wilck wrote: > case UNIT_ATTENTION: > + if (sense_hdr->asc == 0x04 && sense_hdr->ascq == 0x0a) Do you need to add this check in alua_tur() as well? There, we are only checking for the NOT_READY case.
On Mon, 2024-05-06 at 16:48 -0500, Mike Christie wrote: > On 5/3/24 2:56 PM, Martin Wilck wrote: > > case UNIT_ATTENTION: > > + if (sense_hdr->asc == 0x04 && sense_hdr->ascq == > > 0x0a) > > Do you need to add this check in alua_tur as well? We are checking > for > the NOT_READY case. Good point. I'll add the check, I suppose it can't hurt. But I notice that scsi_test_unit_ready() tries to "eat" UA conditions and alua_tur() calls it with ALUA_FAILOVER_RETRIES (5) retries, so checking the sense key in alua_tur() probably won't make much of a difference, either. [Side note: I am wondering if it makes sense to have scsi_test_unit_ready() retry on UA when called from alua_tur(). After all, alua_tur() is only called to check whether another RTPG must be scheduled. @Hannes?] Regards, Martin
On Mon, 2024-05-06 at 07:54 +0200, Christoph Hellwig wrote: > > -static enum scsi_disposition alua_check_sense(struct scsi_device > > *sdev, > > - struct > > scsi_sense_hdr *sense_hdr) > > +static enum scsi_disposition alua_handle_state_transition(struct > > scsi_device *sdev) > > { > > struct alua_dh_data *h = sdev->handler_data; > > struct alua_port_group *pg; > > > > + /* > > + * LUN Not Accessible - ALUA state transition > > + */ > > + rcu_read_lock(); > > + pg = rcu_dereference(h->pg); > > + if (pg) > > + pg->state = SCSI_ACCESS_STATE_TRANSITIONING; > > + rcu_read_unlock(); > > + alua_check(sdev, false); > > + return NEEDS_RETRY; > > This always returns NEEDS_RETRY, so you can drop the return value > entirely and handle this in the callers. > I liked being able to write "return alua_handle_state_transition(...)" in the caller. But np, I'll change it. Martin
On 5/4/24 04:56, Martin Wilck wrote: > When a host is configured with a few LUNs and IO is running, > injecting FC faults repeatedly leads to path recovery problems. > The LUNs have 4 paths each and 3 of them come back active after > say an FC fault which makes two of the paths go down, instead of > all 4. This happens after several iterations of continuous FC faults. > > Reason here is that we're returning an I/O error whenever we're > encountering sense code 06/04/0a (LOGICAL UNIT NOT ACCESSIBLE, > ASYMMETRIC ACCESS STATE TRANSITION) instead of retrying. > > mwilck: Resending a modified version of this patch, which was originally > authored by Rajashekhar M A from Netapp, and submitted in 2021. > Moved the changes to alua_check_sense() as suggested by Mike Christie [1]. > Evan Milne had raised the question whether pg->state should be set to > transitioning in the UA case [2]. I believe that doing this is > correct. SCSI_ACCESS_STATE_TRANSITIONING by itself doesn't cause I/O > errors. Our handler schedules an RTPG, which will only result in an I/O > error condition if the transitioning timeout expires. 
> > [1] https://lore.kernel.org/all/0bc96e82-fdda-4187-148d-5b34f81d4942@oracle.com/ > [2] https://lore.kernel.org/all/CAGtn9r=kicnTDE2o7Gt5Y=yoidHYD7tG8XdMHEBJTBraVEoOCw@mail.gmail.com/ > > Signed-off-by: Hannes Reinecke <hare@suse.de> > Signed-off-by: Martin Wilck <mwilck@suse.com> > Co-authored-by: Rajashekhar M A <rajs@netapp.com> > --- > drivers/scsi/device_handler/scsi_dh_alua.c | 34 +++++++++++++--------- > 1 file changed, 20 insertions(+), 14 deletions(-) > > diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c > index a226dc1b65d7..682d5bb53d14 100644 > --- a/drivers/scsi/device_handler/scsi_dh_alua.c > +++ b/drivers/scsi/device_handler/scsi_dh_alua.c > @@ -414,28 +414,34 @@ static char print_alua_state(unsigned char state) > } > } > > -static enum scsi_disposition alua_check_sense(struct scsi_device *sdev, > - struct scsi_sense_hdr *sense_hdr) > +static enum scsi_disposition alua_handle_state_transition(struct scsi_device *sdev) > { > struct alua_dh_data *h = sdev->handler_data; > struct alua_port_group *pg; > > + /* > + * LUN Not Accessible - ALUA state transition > + */ > + rcu_read_lock(); > + pg = rcu_dereference(h->pg); > + if (pg) > + pg->state = SCSI_ACCESS_STATE_TRANSITIONING; > + rcu_read_unlock(); > + alua_check(sdev, false); > + return NEEDS_RETRY; > +} > + > +static enum scsi_disposition alua_check_sense(struct scsi_device *sdev, > + struct scsi_sense_hdr *sense_hdr) > +{ > switch (sense_hdr->sense_key) { > case NOT_READY: > - if (sense_hdr->asc == 0x04 && sense_hdr->ascq == 0x0a) { > - /* > - * LUN Not Accessible - ALUA state transition > - */ > - rcu_read_lock(); > - pg = rcu_dereference(h->pg); > - if (pg) > - pg->state = SCSI_ACCESS_STATE_TRANSITIONING; > - rcu_read_unlock(); > - alua_check(sdev, false); > - return NEEDS_RETRY; > - } > + if (sense_hdr->asc == 0x04 && sense_hdr->ascq == 0x0a) Please keep the comment that spells out what this asc/ascq is. 
> + return alua_handle_state_transition(sdev); > break; > case UNIT_ATTENTION: > + if (sense_hdr->asc == 0x04 && sense_hdr->ascq == 0x0a) > + return alua_handle_state_transition(sdev); > if (sense_hdr->asc == 0x29 && sense_hdr->ascq == 0x00) { > /* > * Power On, Reset, or Bus Device Reset.
diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c index a226dc1b65d7..682d5bb53d14 100644 --- a/drivers/scsi/device_handler/scsi_dh_alua.c +++ b/drivers/scsi/device_handler/scsi_dh_alua.c @@ -414,28 +414,34 @@ static char print_alua_state(unsigned char state) } } -static enum scsi_disposition alua_check_sense(struct scsi_device *sdev, - struct scsi_sense_hdr *sense_hdr) +static enum scsi_disposition alua_handle_state_transition(struct scsi_device *sdev) { struct alua_dh_data *h = sdev->handler_data; struct alua_port_group *pg; + /* + * LUN Not Accessible - ALUA state transition + */ + rcu_read_lock(); + pg = rcu_dereference(h->pg); + if (pg) + pg->state = SCSI_ACCESS_STATE_TRANSITIONING; + rcu_read_unlock(); + alua_check(sdev, false); + return NEEDS_RETRY; +} + +static enum scsi_disposition alua_check_sense(struct scsi_device *sdev, + struct scsi_sense_hdr *sense_hdr) +{ switch (sense_hdr->sense_key) { case NOT_READY: - if (sense_hdr->asc == 0x04 && sense_hdr->ascq == 0x0a) { - /* - * LUN Not Accessible - ALUA state transition - */ - rcu_read_lock(); - pg = rcu_dereference(h->pg); - if (pg) - pg->state = SCSI_ACCESS_STATE_TRANSITIONING; - rcu_read_unlock(); - alua_check(sdev, false); - return NEEDS_RETRY; - } + if (sense_hdr->asc == 0x04 && sense_hdr->ascq == 0x0a) + return alua_handle_state_transition(sdev); break; case UNIT_ATTENTION: + if (sense_hdr->asc == 0x04 && sense_hdr->ascq == 0x0a) + return alua_handle_state_transition(sdev); if (sense_hdr->asc == 0x29 && sense_hdr->ascq == 0x00) { /* * Power On, Reset, or Bus Device Reset.