diff mbox series

[v2,2/4] PM: hibernate: call wait_for_device_probe() without system_transition_mutex held

Message ID 48d01ce7-e028-c103-ea7f-5a4ea4c8930b@I-love.SAKURA.ne.jp
State New
Headers show
Series [v2,1/4] char: misc: allow calling open() callback without misc_mtx held | expand

Commit Message

Tetsuo Handa July 10, 2022, 2:24 a.m. UTC
syzbot is reporting hung task at misc_open() [1], for there is a race
window of AB-BA deadlock which involves probe_count variable.

Even with "char: misc: allow calling open() callback without misc_mtx
held", wait_for_device_probe() (w_f_d_p() afterward) from
snapshot_open() can sleep forever if probe_count cannot become 0.

w_f_d_p() in snapshot_open() was added by commit c751085943362143
("PM/Hibernate: Wait for SCSI devices scan to complete during resume"),

   "In addition, if the resume from hibernation is userland-driven, it's
    better to wait for all device probes in the kernel to complete before
    attempting to open the resume device."

but that commit did not take into account possibility of unresponsive
hardware, for the timeout is supposed to come from the SCSI layer in the
general case. syzbot is reporting that USB storage, which is a very tiny
wrapper around the whole SCSI protocol, is failing to apply timeout.

Fortunately, holding system_transition_mutex is not required when waiting
for device probe. Therefore, as one of steps for making it possible to
recover from such situation, this patch changes snapshot_open() to call
w_f_d_p() before calling lock_system_sleep().

Note that the problem that w_f_d_p() can sleep too long to wait remains.
But how to fix that part deserves different patches.

Link: https://syzkaller.appspot.com/bug?extid=358c9ab4c93da7b7238c [1]
Reported-by: syzbot <syzbot+358c9ab4c93da7b7238c@syzkaller.appspotmail.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Oliver Neukum <oneukum@suse.com>
Cc: Wedson Almeida Filho <wedsonaf@google.com>
Cc: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Arjan van de Ven <arjan@linux.intel.com>
---
 kernel/power/user.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

Comments

Greg Kroah-Hartman July 11, 2022, 8:12 a.m. UTC | #1
On Sun, Jul 10, 2022 at 11:25:08AM +0900, Tetsuo Handa wrote:
> syzbot is reporting hung task at misc_open() [1], for there is a race
> window of AB-BA deadlock which involves probe_count variable.
> 
> Even with "char: misc: allow calling open() callback without misc_mtx
> held" and "PM: hibernate: call wait_for_device_probe() without
> system_transition_mutex held", wait_for_device_probe() from snapshot_open()
> can sleep forever if probe_count cannot become 0.
> 
> Since snapshot_open() is a userland-driven hibernation/resume request,
> it should be acceptable to fail if something is wrong. Users would not
> want to wait for hours if device stopped responding. Therefore, introduce
> killable version of wait_for_device_probe() with timeout.
> 
> According to Oliver Neukum, there are SCSI commands that can run for more
> than 60 seconds. Therefore, this patch choose 5 minutes for timeout.
> 
> Link: https://syzkaller.appspot.com/bug?extid=358c9ab4c93da7b7238c [1]
> Reported-by: syzbot <syzbot+358c9ab4c93da7b7238c@syzkaller.appspotmail.com>
> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> Cc: Greg KH <gregkh@linuxfoundation.org>
> Cc: Oliver Neukum <oneukum@suse.com>
> Cc: Wedson Almeida Filho <wedsonaf@google.com>
> Cc: Rafael J. Wysocki <rjw@sisk.pl>
> Cc: Arjan van de Ven <arjan@linux.intel.com>
> ---
>  drivers/base/dd.c             | 14 ++++++++++++++
>  include/linux/device/driver.h |  1 +
>  kernel/power/user.c           |  9 +++++++--
>  3 files changed, 22 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/base/dd.c b/drivers/base/dd.c
> index 70f79fc71539..3136b8403bb0 100644
> --- a/drivers/base/dd.c
> +++ b/drivers/base/dd.c
> @@ -724,6 +724,20 @@ void wait_for_device_probe(void)
>  }
>  EXPORT_SYMBOL_GPL(wait_for_device_probe);
>  
> +void wait_for_device_probe_killable_timeout(unsigned long timeout)
> +{
> +	/* wait for probe timeout */
> +	wait_event(probe_timeout_waitqueue, !driver_deferred_probe_timeout);
> +
> +	/* wait for the deferred probe workqueue to finish */
> +	flush_work(&deferred_probe_work);
> +
> +	/* wait for the known devices to complete their probing */
> +	wait_event_killable_timeout(probe_waitqueue,
> +				    atomic_read(&probe_count) == 0, timeout);
> +	async_synchronize_full();
> +}
> +
>  static int __driver_probe_device(struct device_driver *drv, struct device *dev)
>  {
>  	int ret = 0;
> diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h
> index 7acaabde5396..4ee909144470 100644
> --- a/include/linux/device/driver.h
> +++ b/include/linux/device/driver.h
> @@ -129,6 +129,7 @@ extern struct device_driver *driver_find(const char *name,
>  					 struct bus_type *bus);
>  extern int driver_probe_done(void);
>  extern void wait_for_device_probe(void);
> +extern void wait_for_device_probe_killable_timeout(unsigned long timeout);
>  void __init wait_for_init_devices_probe(void);
>  
>  /* sysfs interface for exporting driver attributes */
> diff --git a/kernel/power/user.c b/kernel/power/user.c
> index db98a028dfdd..32dd5a855e8c 100644
> --- a/kernel/power/user.c
> +++ b/kernel/power/user.c
> @@ -58,8 +58,13 @@ static int snapshot_open(struct inode *inode, struct file *filp)
>  		/* The image device should be already ready. */
>  		break;
>  	default: /* Resuming */
> -		/* We may need to wait for the image device to appear. */
> -		wait_for_device_probe();
> +		/*
> +		 * Since the image device might not be ready, try to wait up to
> +		 * 5 minutes. We should not wait forever, for we might get stuck
> +		 * due to unresponsive devices and/or new probe events which
> +		 * are irrelevant to the image device keep coming in.
> +		 */
> +		wait_for_device_probe_killable_timeout(300 * HZ);

5 minutes is a total random choice.  anything that calls
wait_for_device_probe() feels wrong for other reasons, creating a
locking loop like this should be resolved first, not just papering over
it by allowing 5 minutes to pass before it resolves itself.  5 minutes
is forever and any sane watchdog detector would have rebooted the
machine by then.

thanks,

greg k-h
Tetsuo Handa July 11, 2022, 10:44 a.m. UTC | #2
On 2022/07/11 17:12, Greg KH wrote:
>                                                        creating a
> locking loop like this should be resolved first,

Rafael and Arjan, can we agree with removing wait_for_device_probe() from snapshot_open() ?
Arjan van de Ven July 11, 2022, 5:58 p.m. UTC | #3
On 7/11/2022 3:44 AM, Tetsuo Handa wrote:
> On 2022/07/11 17:12, Greg KH wrote:
>>                                                         creating a
>> locking loop like this should be resolved first,
> 
> Rafael and Arjan, can we agree with removing wait_for_device_probe() from snapshot_open() ?
> 

we can probably remove it. the "fun" is then that devices you need might not be ready once you remove it.
so if we otherwise would panic, we should at least try again after some delay...
(since a panic() is very nasty to debug for all the obvious reasons.. especially is the screen isn't on yet)
Rafael J. Wysocki July 11, 2022, 6:14 p.m. UTC | #4
On Mon, Jul 11, 2022 at 1:21 PM Tetsuo Handa
<penguin-kernel@i-love.sakura.ne.jp> wrote:
>
> On 2022/07/11 17:12, Greg KH wrote:
> >                                                        creating a
> > locking loop like this should be resolved first,
>
> Rafael and Arjan, can we agree with removing wait_for_device_probe() from snapshot_open() ?

No, we can't.
Tetsuo Handa July 12, 2022, 1:52 a.m. UTC | #5
On 2022/07/12 3:14, Rafael J. Wysocki wrote:
> On Mon, Jul 11, 2022 at 1:21 PM Tetsuo Handa
> <penguin-kernel@i-love.sakura.ne.jp> wrote:
>>
>> On 2022/07/11 17:12, Greg KH wrote:
>>>                                                        creating a
>>> locking loop like this should be resolved first,
>>
>> Rafael and Arjan, can we agree with removing wait_for_device_probe() from snapshot_open() ?
> 
> No, we can't.

Then, can we defer wait_for_device_probe() till first write()/ioctl()
which is called without locks?

 kernel/power/user.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/kernel/power/user.c b/kernel/power/user.c
index a00a728ddfc1..92aecb989c76 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -26,6 +26,7 @@
 
 #include "power.h"
 
+static bool need_wait;
 
 static struct snapshot_data {
 	struct snapshot_handle handle;
@@ -78,7 +79,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
 		 * Resuming.  We may need to wait for the image device to
 		 * appear.
 		 */
-		wait_for_device_probe();
+		need_wait = true;
 
 		data->swap = -1;
 		data->mode = O_WRONLY;
@@ -168,6 +169,11 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
 	ssize_t res;
 	loff_t pg_offp = *offp & ~PAGE_MASK;
 
+	if (need_wait) {
+		wait_for_device_probe();
+		need_wait = false;
+	}
+
 	lock_system_sleep();
 
 	data = filp->private_data;
@@ -244,6 +250,11 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
 	loff_t size;
 	sector_t offset;
 
+	if (need_wait) {
+		wait_for_device_probe();
+		need_wait = false;
+	}
+
 	if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
 		return -ENOTTY;
 	if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR)
Rafael J. Wysocki July 14, 2022, 6:48 p.m. UTC | #6
On Tue, Jul 12, 2022 at 3:52 AM Tetsuo Handa
<penguin-kernel@i-love.sakura.ne.jp> wrote:
>
> On 2022/07/12 3:14, Rafael J. Wysocki wrote:
> > On Mon, Jul 11, 2022 at 1:21 PM Tetsuo Handa
> > <penguin-kernel@i-love.sakura.ne.jp> wrote:
> >>
> >> On 2022/07/11 17:12, Greg KH wrote:
> >>>                                                        creating a
> >>> locking loop like this should be resolved first,
> >>
> >> Rafael and Arjan, can we agree with removing wait_for_device_probe() from snapshot_open() ?
> >
> > No, we can't.
>
> Then, can we defer wait_for_device_probe() till first write()/ioctl()
> which is called without locks?

Yes, wait_for_device_probe() can be deferred to right before the first
actual access.

>  kernel/power/user.c | 13 ++++++++++++-
>  1 file changed, 12 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/power/user.c b/kernel/power/user.c
> index a00a728ddfc1..92aecb989c76 100644
> --- a/kernel/power/user.c
> +++ b/kernel/power/user.c
> @@ -26,6 +26,7 @@
>
>  #include "power.h"
>
> +static bool need_wait;
>
>  static struct snapshot_data {
>         struct snapshot_handle handle;
> @@ -78,7 +79,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
>                  * Resuming.  We may need to wait for the image device to
>                  * appear.
>                  */
> -               wait_for_device_probe();
> +               need_wait = true;
>
>                 data->swap = -1;
>                 data->mode = O_WRONLY;
> @@ -168,6 +169,11 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
>         ssize_t res;
>         loff_t pg_offp = *offp & ~PAGE_MASK;
>
> +       if (need_wait) {
> +               wait_for_device_probe();
> +               need_wait = false;
> +       }
> +
>         lock_system_sleep();
>
>         data = filp->private_data;
> @@ -244,6 +250,11 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
>         loff_t size;
>         sector_t offset;
>
> +       if (need_wait) {
> +               wait_for_device_probe();
> +               need_wait = false;
> +       }
> +
>         if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
>                 return -ENOTTY;
>         if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR)
> --
> 2.18.4
Rafael J. Wysocki July 15, 2022, 5:42 p.m. UTC | #7
On Sun, Jul 10, 2022 at 4:25 AM Tetsuo Handa
<penguin-kernel@i-love.sakura.ne.jp> wrote:
>
> syzbot is reporting hung task at misc_open() [1], for there is a race
> window of AB-BA deadlock which involves probe_count variable.
>
> Even with "char: misc: allow calling open() callback without misc_mtx
> held", wait_for_device_probe() (w_f_d_p() afterward) from
> snapshot_open() can sleep forever if probe_count cannot become 0.
>
> w_f_d_p() in snapshot_open() was added by commit c751085943362143
> ("PM/Hibernate: Wait for SCSI devices scan to complete during resume"),
>
>    "In addition, if the resume from hibernation is userland-driven, it's
>     better to wait for all device probes in the kernel to complete before
>     attempting to open the resume device."
>
> but that commit did not take into account possibility of unresponsive
> hardware, for the timeout is supposed to come from the SCSI layer in the
> general case. syzbot is reporting that USB storage, which is a very tiny
> wrapper around the whole SCSI protocol, is failing to apply timeout.
>
> Fortunately, holding system_transition_mutex is not required when waiting
> for device probe. Therefore, as one of steps for making it possible to
> recover from such situation, this patch changes snapshot_open() to call
> w_f_d_p() before calling lock_system_sleep().
>
> Note that the problem that w_f_d_p() can sleep too long to wait remains.
> But how to fix that part deserves different patches.
>
> Link: https://syzkaller.appspot.com/bug?extid=358c9ab4c93da7b7238c [1]
> Reported-by: syzbot <syzbot+358c9ab4c93da7b7238c@syzkaller.appspotmail.com>
> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> Cc: Greg KH <gregkh@linuxfoundation.org>
> Cc: Oliver Neukum <oneukum@suse.com>
> Cc: Wedson Almeida Filho <wedsonaf@google.com>
> Cc: Rafael J. Wysocki <rjw@sisk.pl>
> Cc: Arjan van de Ven <arjan@linux.intel.com>
> ---
>  kernel/power/user.c | 24 ++++++++++++------------
>  1 file changed, 12 insertions(+), 12 deletions(-)
>
> diff --git a/kernel/power/user.c b/kernel/power/user.c
> index 59912060109f..db98a028dfdd 100644
> --- a/kernel/power/user.c
> +++ b/kernel/power/user.c
> @@ -51,6 +51,18 @@ static int snapshot_open(struct inode *inode, struct file *filp)
>         if (!hibernation_available())
>                 return -EPERM;
>
> +       switch (filp->f_flags & O_ACCMODE) {
> +       case O_RDWR: /* Can't do both at the same time. */
> +               return -ENOSYS;

if ((filp->f_flags & O_ACCMODE) == O_RDWR)
              return -ENOSYS;

/* On resume, we may need to wait for the image device to appear. */
if ((filp->f_flags & O_ACCMODE) == O_WRONLY)
              wait_for_device_probe();

> +       case O_RDONLY: /* Hibernating */
> +               /* The image device should be already ready. */
> +               break;
> +       default: /* Resuming */
> +               /* We may need to wait for the image device to appear. */
> +               wait_for_device_probe();
> +               break;
> +       }
> +
>         lock_system_sleep();
>
>         if (!hibernate_acquire()) {
> @@ -58,28 +70,16 @@ static int snapshot_open(struct inode *inode, struct file *filp)
>                 goto Unlock;
>         }
>
> -       if ((filp->f_flags & O_ACCMODE) == O_RDWR) {
> -               hibernate_release();
> -               error = -ENOSYS;
> -               goto Unlock;
> -       }
>         nonseekable_open(inode, filp);
>         data = &snapshot_state;
>         filp->private_data = data;
>         memset(&data->handle, 0, sizeof(struct snapshot_handle));
>         if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
> -               /* Hibernating.  The image device should be accessible. */

No need to remove this comment.

>                 data->swap = swap_type_of(swsusp_resume_device, 0);
>                 data->mode = O_RDONLY;
>                 data->free_bitmaps = false;
>                 error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION);
>         } else {
> -               /*
> -                * Resuming.  We may need to wait for the image device to
> -                * appear.
> -                */
> -               wait_for_device_probe();
> -
>                 data->swap = -1;
>                 data->mode = O_WRONLY;
>                 error = pm_notifier_call_chain_robust(PM_RESTORE_PREPARE, PM_POST_RESTORE);
> --
> 2.18.4
>
diff mbox series

Patch

diff --git a/kernel/power/user.c b/kernel/power/user.c
index 59912060109f..db98a028dfdd 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -51,6 +51,18 @@  static int snapshot_open(struct inode *inode, struct file *filp)
 	if (!hibernation_available())
 		return -EPERM;
 
+	switch (filp->f_flags & O_ACCMODE) {
+	case O_RDWR: /* Can't do both at the same time. */
+		return -ENOSYS;
+	case O_RDONLY: /* Hibernating */
+		/* The image device should be already ready. */
+		break;
+	default: /* Resuming */
+		/* We may need to wait for the image device to appear. */
+		wait_for_device_probe();
+		break;
+	}
+
 	lock_system_sleep();
 
 	if (!hibernate_acquire()) {
@@ -58,28 +70,16 @@  static int snapshot_open(struct inode *inode, struct file *filp)
 		goto Unlock;
 	}
 
-	if ((filp->f_flags & O_ACCMODE) == O_RDWR) {
-		hibernate_release();
-		error = -ENOSYS;
-		goto Unlock;
-	}
 	nonseekable_open(inode, filp);
 	data = &snapshot_state;
 	filp->private_data = data;
 	memset(&data->handle, 0, sizeof(struct snapshot_handle));
 	if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
-		/* Hibernating.  The image device should be accessible. */
 		data->swap = swap_type_of(swsusp_resume_device, 0);
 		data->mode = O_RDONLY;
 		data->free_bitmaps = false;
 		error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION);
 	} else {
-		/*
-		 * Resuming.  We may need to wait for the image device to
-		 * appear.
-		 */
-		wait_for_device_probe();
-
 		data->swap = -1;
 		data->mode = O_WRONLY;
 		error = pm_notifier_call_chain_robust(PM_RESTORE_PREPARE, PM_POST_RESTORE);