diff mbox series

hwrng: core - fix task hung in hwrng_fillfn

Message ID tencent_97BC6EC36EF24C91A7E7C6DFD2C106688906@qq.com
State New
Headers show
Series hwrng: core - fix task hung in hwrng_fillfn | expand

Commit Message

Edward Adam Davis Nov. 26, 2023, 7:07 a.m. UTC
[Syz repo]
INFO: task hwrng:749 blocked for more than 143 seconds.
      Not tainted 6.7.0-rc2-syzkaller #0
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:hwrng           state:D stack:29040 pid:749   tgid:749   ppid:2      flags:0x00004000
Call Trace:
 <TASK>
 context_switch kernel/sched/core.c:5376 [inline]
 __schedule+0xedb/0x5af0 kernel/sched/core.c:6688
 __schedule_loop kernel/sched/core.c:6763 [inline]
 schedule+0xe9/0x270 kernel/sched/core.c:6778
 schedule_preempt_disabled+0x13/0x20 kernel/sched/core.c:6835
 __mutex_lock_common kernel/locking/mutex.c:679 [inline]
 __mutex_lock+0x5b9/0x9d0 kernel/locking/mutex.c:747
 hwrng_fillfn+0x145/0x430 drivers/char/hw_random/core.c:504
 kthread+0x2c6/0x3a0 kernel/kthread.c:388
 ret_from_fork+0x45/0x80 arch/x86/kernel/process.c:147
 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:242
 </TASK>

...

RIP: 0010:rep_movs_alternative+0x57/0x70 arch/x86/lib/copy_user_64.S:80
Code: 00 66 90 48 8b 06 48 89 07 48 83 c6 08 48 83 c7 08 83 e9 08 74 df 83 f9 08 73 e8 eb c9 eb 01 c3 48 89 c8 48 c1 e9 03 83 e0 07 <f3> 48 a5 89 c1 85 c9 75 b3 c3 48 8d 0c c8 eb ac 66 0f 1f 84 00 00
RSP: 0018:ffffc90004427bb0 EFLAGS: 00050246
RAX: 0000000000000000 RBX: 0000000000000040 RCX: 0000000000000008
RDX: ffffed1028a4ab48 RSI: ffff888145255a00 RDI: 0000000020019980
RBP: 0000000020019980 R08: 0000000000000000 R09: ffffed1028a4ab47
R10: ffff888145255a3f R11: 0000000000000001 R12: ffff888145255a00
R13: 00000000200199c0 R14: 0000000000000000 R15: dffffc0000000000
 copy_user_generic arch/x86/include/asm/uaccess_64.h:112 [inline]
 raw_copy_to_user arch/x86/include/asm/uaccess_64.h:133 [inline]
 _copy_to_user lib/usercopy.c:41 [inline]
 _copy_to_user+0xa8/0xb0 lib/usercopy.c:34
 copy_to_user include/linux/uaccess.h:191 [inline]
 rng_dev_read+0x184/0x580 drivers/char/hw_random/core.c:255
 do_loop_readv_writev fs/read_write.c:755 [inline]
 do_loop_readv_writev fs/read_write.c:743 [inline]
 do_iter_read+0x567/0x830 fs/read_write.c:797
 vfs_readv+0x12d/0x1a0 fs/read_write.c:915
 do_preadv fs/read_write.c:1007 [inline]
 __do_sys_preadv fs/read_write.c:1057 [inline]
 __se_sys_preadv fs/read_write.c:1052 [inline]
 __x64_sys_preadv+0x228/0x300 fs/read_write.c:1052
 do_syscall_x64 arch/x86/entry/common.c:51 [inline]
 do_syscall_64+0x40/0x110 arch/x86/entry/common.c:82
 entry_SYSCALL_64_after_hwframe+0x63/0x6b

[Analysis]
The lock reading_mutex in rng_dev_read() has been occupied for too long, 
causing the thread callback function hwrng_fillfn() to wait for a timeout.

[Fix]
Move code that does not require this lock protection out of the critical area.

Reported-and-tested-by: syzbot+c52ab18308964d248092@syzkaller.appspotmail.com
Signed-off-by: Edward Adam Davis <eadavis@qq.com>
---
 drivers/char/hw_random/core.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

Comments

Herbert Xu Dec. 1, 2023, 9:56 a.m. UTC | #1
Edward Adam Davis <eadavis@qq.com> wrote:
>
> diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
> index 420f155d251f..7323ddc958ce 100644
> --- a/drivers/char/hw_random/core.c
> +++ b/drivers/char/hw_random/core.c
> @@ -225,17 +225,18 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf,
>                        goto out;
>                }
> 
> -               if (mutex_lock_interruptible(&reading_mutex)) {
> -                       err = -ERESTARTSYS;
> -                       goto out_put;
> -               }
>                if (!data_avail) {
> +                       if (mutex_lock_interruptible(&reading_mutex)) {
> +                               err = -ERESTARTSYS;
> +                               goto out_put;
> +                       }
>                        bytes_read = rng_get_data(rng, rng_buffer,
>                                rng_buffer_size(),
>                                !(filp->f_flags & O_NONBLOCK));
> +                       mutex_unlock(&reading_mutex);
>                        if (bytes_read < 0) {
>                                err = bytes_read;
> -                               goto out_unlock_reading;
> +                               goto out_put;
>                        }
>                        data_avail = bytes_read;
>                }

Does this change anything at all? Please explain why it was holding
this lock for 143 seconds in the first place.  If it's doing it in
rng_get_data, then your change has zero effect.

> @@ -501,7 +499,10 @@ static int hwrng_fillfn(void *unused)
>                rng = get_current_rng();
>                if (IS_ERR(rng) || !rng)
>                        break;
> -               mutex_lock(&reading_mutex);
> +               if (mutex_lock_interruptible(&reading_mutex)) {
> +                       put_rng(rng);
> +                       return -ERESTARTSYS;
> +               }

No this is just the symptom.  The real problem is why is the driver
spending 143 seconds in rng_get_data?

Cheers,
Edward Adam Davis Dec. 1, 2023, 11:37 a.m. UTC | #2
On Fri, 1 Dec 2023 17:56:59 +0800, Herbert Xu <herbert@gondor.apana.org.au> wrote:
> > diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
> > index 420f155d251f..7323ddc958ce 100644
> > --- a/drivers/char/hw_random/core.c
> > +++ b/drivers/char/hw_random/core.c
> > @@ -225,17 +225,18 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf,
> >                        goto out;
> >                }
> >
> > -               if (mutex_lock_interruptible(&reading_mutex)) {
> > -                       err = -ERESTARTSYS;
> > -                       goto out_put;
> > -               }
> >                if (!data_avail) {
> > +                       if (mutex_lock_interruptible(&reading_mutex)) {
> > +                               err = -ERESTARTSYS;
> > +                               goto out_put;
> > +                       }
> >                        bytes_read = rng_get_data(rng, rng_buffer,
> >                                rng_buffer_size(),
> >                                !(filp->f_flags & O_NONBLOCK));
> > +                       mutex_unlock(&reading_mutex);
> >                        if (bytes_read < 0) {
> >                                err = bytes_read;
> > -                               goto out_unlock_reading;
> > +                               goto out_put;
> >                        }
> >                        data_avail = bytes_read;
> >                }
> 
> Does this change anything at all? Please explain why it was holding
> this lock for 143 seconds in the first place.  If it's doing it in
> rng_get_data, then your change has zero effect.
Reduce the scope of critical zone protection.
The original critical zone contains a too large range, especially like 
copy_to_user() should not be included in the critical zone.
> 
> > @@ -501,7 +499,10 @@ static int hwrng_fillfn(void *unused)
> >                rng = get_current_rng();
> >                if (IS_ERR(rng) || !rng)
> >                        break;
> > -               mutex_lock(&reading_mutex);
> > +               if (mutex_lock_interruptible(&reading_mutex)) {
> > +                       put_rng(rng);
> > +                       return -ERESTARTSYS;
> > +               }
> 
> No this is just the symptom.  The real problem is why is the driver
> spending 143 seconds in rng_get_data?
In the second version of the patch, I have removed the fix in hwrng_fillfn().
But for some reason, the V2 patch did not appear in the mailing list.

I think it was due to consuming too much time while executing copy_to_user() 
that resulted in 143s.
So, I narrowed down the scope of the critical area and moved the code 
copy_to_user() out of the critical area.

Edward
diff mbox series

Patch

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index 420f155d251f..7323ddc958ce 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -225,17 +225,18 @@  static ssize_t rng_dev_read(struct file *filp, char __user *buf,
 			goto out;
 		}
 
-		if (mutex_lock_interruptible(&reading_mutex)) {
-			err = -ERESTARTSYS;
-			goto out_put;
-		}
 		if (!data_avail) {
+			if (mutex_lock_interruptible(&reading_mutex)) {
+				err = -ERESTARTSYS;
+				goto out_put;
+			}
 			bytes_read = rng_get_data(rng, rng_buffer,
 				rng_buffer_size(),
 				!(filp->f_flags & O_NONBLOCK));
+			mutex_unlock(&reading_mutex);
 			if (bytes_read < 0) {
 				err = bytes_read;
-				goto out_unlock_reading;
+				goto out_put;
 			}
 			data_avail = bytes_read;
 		}
@@ -243,7 +244,7 @@  static ssize_t rng_dev_read(struct file *filp, char __user *buf,
 		if (!data_avail) {
 			if (filp->f_flags & O_NONBLOCK) {
 				err = -EAGAIN;
-				goto out_unlock_reading;
+				goto out_put;
 			}
 		} else {
 			len = data_avail;
@@ -255,14 +256,13 @@  static ssize_t rng_dev_read(struct file *filp, char __user *buf,
 			if (copy_to_user(buf + ret, rng_buffer + data_avail,
 								len)) {
 				err = -EFAULT;
-				goto out_unlock_reading;
+				goto out_put;
 			}
 
 			size -= len;
 			ret += len;
 		}
 
-		mutex_unlock(&reading_mutex);
 		put_rng(rng);
 
 		if (need_resched())
@@ -276,8 +276,6 @@  static ssize_t rng_dev_read(struct file *filp, char __user *buf,
 out:
 	return ret ? : err;
 
-out_unlock_reading:
-	mutex_unlock(&reading_mutex);
 out_put:
 	put_rng(rng);
 	goto out;
@@ -501,7 +499,10 @@  static int hwrng_fillfn(void *unused)
 		rng = get_current_rng();
 		if (IS_ERR(rng) || !rng)
 			break;
-		mutex_lock(&reading_mutex);
+		if (mutex_lock_interruptible(&reading_mutex)) {
+			put_rng(rng);
+			return -ERESTARTSYS;
+		}
 		rc = rng_get_data(rng, rng_fillbuf,
 				  rng_buffer_size(), 1);
 		if (current_quality != rng->quality)