Message ID | 20210810041410.142035-2-desmondcheongzx@gmail.com |
---|---|
State | New |
Headers | show |
Series | Bluetooth: fix locking and socket killing in SCO and RFCOMM | expand |
On 8/9/21 9:14 PM, Desmond Cheong Zhi Xi wrote: > struct sock.sk_timer should be used as a sock cleanup timer. However, > SCO uses it to implement sock timeouts. > > This causes issues because struct sock.sk_timer's callback is run in > an IRQ context, and the timer callback function sco_sock_timeout takes > a spin lock on the socket. However, other functions such as > sco_conn_del and sco_conn_ready take the spin lock with interrupts > enabled. > > This inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} lock usage could > lead to deadlocks as reported by Syzbot [1]: > CPU0 > ---- > lock(slock-AF_BLUETOOTH-BTPROTO_SCO); > <Interrupt> > lock(slock-AF_BLUETOOTH-BTPROTO_SCO); > > To fix this, we use delayed work to implement SCO sock timeouts > instead. This allows us to avoid taking the spin lock on the socket in > an IRQ context, and corrects the misuse of struct sock.sk_timer. > > As a note, cancel_delayed_work is used instead of > cancel_delayed_work_sync in sco_sock_set_timer and > sco_sock_clear_timer to avoid a deadlock. In the future, the call to > bh_lock_sock inside sco_sock_timeout should be changed to lock_sock to > synchronize with other functions using lock_sock. However, since > sco_sock_set_timer and sco_sock_clear_timer are sometimes called under > the locked socket (in sco_connect and __sco_sock_close), > cancel_delayed_work_sync might cause them to sleep until an > sco_sock_timeout that has started finishes running. But > sco_sock_timeout would also sleep until it can grab the lock_sock. > > Using cancel_delayed_work is fine because sco_sock_timeout does not > change from run to run, hence there is no functional difference > between: > 1. waiting for a timeout to finish running before scheduling another > timeout > 2. scheduling another timeout while a timeout is running. 
> > Link: https://syzkaller.appspot.com/bug?id=9089d89de0502e120f234ca0fc8a703f7368b31e [1] > Reported-by: syzbot+2f6d7c28bb4bf7e82060@syzkaller.appspotmail.com > Tested-by: syzbot+2f6d7c28bb4bf7e82060@syzkaller.appspotmail.com > Signed-off-by: Desmond Cheong Zhi Xi <desmondcheongzx@gmail.com> > --- > net/bluetooth/sco.c | 35 +++++++++++++++++++++++++++++------ > 1 file changed, 29 insertions(+), 6 deletions(-) > > diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c > index ffa2a77a3e4c..62e638f971a9 100644 > --- a/net/bluetooth/sco.c > +++ b/net/bluetooth/sco.c > @@ -48,6 +48,8 @@ struct sco_conn { > spinlock_t lock; > struct sock *sk; > > + struct delayed_work timeout_work; > + > unsigned int mtu; > }; > > @@ -74,9 +76,20 @@ struct sco_pinfo { > #define SCO_CONN_TIMEOUT (HZ * 40) > #define SCO_DISCONN_TIMEOUT (HZ * 2) > > -static void sco_sock_timeout(struct timer_list *t) > +static void sco_sock_timeout(struct work_struct *work) > { > - struct sock *sk = from_timer(sk, t, sk_timer); > + struct sco_conn *conn = container_of(work, struct sco_conn, > + timeout_work.work); > + struct sock *sk; > + > + sco_conn_lock(conn); > + sk = conn->sk; > + if (sk) > + sock_hold(sk); syzbot complains here that sk refcount can be zero at this time. refcount_t: addition on 0; use-after-free. 
WARNING: CPU: 0 PID: 10451 at lib/refcount.c:25 refcount_warn_saturate+0x169/0x1e0 lib/refcount.c:25 Modules linked in: CPU: 0 PID: 10451 Comm: kworker/0:8 Not tainted 5.14.0-rc7-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: events sco_sock_timeout RIP: 0010:refcount_warn_saturate+0x169/0x1e0 lib/refcount.c:25 Code: 09 31 ff 89 de e8 d7 c9 9e fd 84 db 0f 85 36 ff ff ff e8 8a c3 9e fd 48 c7 c7 20 8f e3 89 c6 05 e8 7f 81 09 01 e8 f0 98 16 05 <0f> 0b e9 17 ff ff ff e8 6b c3 9e fd 0f b6 1d cd 7f 81 09 31 ff 89 RSP: 0018:ffffc9001766fce8 EFLAGS: 00010282 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: ffff88802cea3880 RSI: ffffffff815d87a5 RDI: fffff52002ecdf8f RBP: 0000000000000002 R08: 0000000000000000 R09: 0000000000000000 R10: ffffffff815d25de R11: 0000000000000000 R12: ffff88806d23ce08 R13: ffff8880712c8080 R14: ffff88802edf4500 R15: ffff8880b9c51240 FS: 0000000000000000(0000) GS:ffff8880b9c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f3748c20000 CR3: 0000000017644000 CR4: 00000000001506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __refcount_add include/linux/refcount.h:199 [inline] __refcount_inc include/linux/refcount.h:250 [inline] refcount_inc include/linux/refcount.h:267 [inline] sock_hold include/net/sock.h:702 [inline] sco_sock_timeout+0x216/0x290 net/bluetooth/sco.c:88 process_one_work+0x98d/0x1630 kernel/workqueue.c:2276 worker_thread+0x658/0x11f0 kernel/workqueue.c:2422 kthread+0x3e5/0x4d0 kernel/kthread.c:319 > + sco_conn_unlock(conn); > + > + if (!sk) > + return; > > BT_DBG("sock %p state %d", sk, sk->sk_state); > > @@ -91,14 +104,21 @@ static void sco_sock_timeout(struct timer_list *t) > > static void sco_sock_set_timer(struct sock *sk, long timeout) > { > + if (!sco_pi(sk)->conn) > + return; > + > BT_DBG("sock %p 
state %d timeout %ld", sk, sk->sk_state, timeout); > - sk_reset_timer(sk, &sk->sk_timer, jiffies + timeout); > + cancel_delayed_work(&sco_pi(sk)->conn->timeout_work); > + schedule_delayed_work(&sco_pi(sk)->conn->timeout_work, timeout); > } > > static void sco_sock_clear_timer(struct sock *sk) > { > + if (!sco_pi(sk)->conn) > + return; > + > BT_DBG("sock %p state %d", sk, sk->sk_state); > - sk_stop_timer(sk, &sk->sk_timer); > + cancel_delayed_work(&sco_pi(sk)->conn->timeout_work); > } > > /* ---- SCO connections ---- */ > @@ -179,6 +199,9 @@ static void sco_conn_del(struct hci_conn *hcon, int err) > bh_unlock_sock(sk); > sco_sock_kill(sk); > sock_put(sk); > + > + /* Ensure no more work items will run before freeing conn. */ Maybe you should have done this cancel_delayed_work_sync() before the prior sock_put(sk) ? > + cancel_delayed_work_sync(&conn->timeout_work); > } > > hcon->sco_data = NULL; > @@ -193,6 +216,8 @@ static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, > sco_pi(sk)->conn = conn; > conn->sk = sk; > > + INIT_DELAYED_WORK(&conn->timeout_work, sco_sock_timeout); > + > if (parent) > bt_accept_enqueue(parent, sk, true); > } > @@ -500,8 +525,6 @@ static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, > > sco_pi(sk)->setting = BT_VOICE_CVSD_16BIT; > > - timer_setup(&sk->sk_timer, sco_sock_timeout, 0); > - > bt_sock_link(&sco_sk_list, sk); > return sk; > } >
On 2/9/21 3:17 pm, Eric Dumazet wrote: > > > On 8/9/21 9:14 PM, Desmond Cheong Zhi Xi wrote: >> struct sock.sk_timer should be used as a sock cleanup timer. However, >> SCO uses it to implement sock timeouts. >> >> This causes issues because struct sock.sk_timer's callback is run in >> an IRQ context, and the timer callback function sco_sock_timeout takes >> a spin lock on the socket. However, other functions such as >> sco_conn_del and sco_conn_ready take the spin lock with interrupts >> enabled. >> >> This inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} lock usage could >> lead to deadlocks as reported by Syzbot [1]: >> CPU0 >> ---- >> lock(slock-AF_BLUETOOTH-BTPROTO_SCO); >> <Interrupt> >> lock(slock-AF_BLUETOOTH-BTPROTO_SCO); >> >> To fix this, we use delayed work to implement SCO sock timouts >> instead. This allows us to avoid taking the spin lock on the socket in >> an IRQ context, and corrects the misuse of struct sock.sk_timer. >> >> As a note, cancel_delayed_work is used instead of >> cancel_delayed_work_sync in sco_sock_set_timer and >> sco_sock_clear_timer to avoid a deadlock. In the future, the call to >> bh_lock_sock inside sco_sock_timeout should be changed to lock_sock to >> synchronize with other functions using lock_sock. However, since >> sco_sock_set_timer and sco_sock_clear_timer are sometimes called under >> the locked socket (in sco_connect and __sco_sock_close), >> cancel_delayed_work_sync might cause them to sleep until an >> sco_sock_timeout that has started finishes running. But >> sco_sock_timeout would also sleep until it can grab the lock_sock. >> >> Using cancel_delayed_work is fine because sco_sock_timeout does not >> change from run to run, hence there is no functional difference >> between: >> 1. waiting for a timeout to finish running before scheduling another >> timeout >> 2. scheduling another timeout while a timeout is running. 
>> >> Link: https://syzkaller.appspot.com/bug?id=9089d89de0502e120f234ca0fc8a703f7368b31e [1] >> Reported-by: syzbot+2f6d7c28bb4bf7e82060@syzkaller.appspotmail.com >> Tested-by: syzbot+2f6d7c28bb4bf7e82060@syzkaller.appspotmail.com >> Signed-off-by: Desmond Cheong Zhi Xi <desmondcheongzx@gmail.com> >> --- >> net/bluetooth/sco.c | 35 +++++++++++++++++++++++++++++------ >> 1 file changed, 29 insertions(+), 6 deletions(-) >> >> diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c >> index ffa2a77a3e4c..62e638f971a9 100644 >> --- a/net/bluetooth/sco.c >> +++ b/net/bluetooth/sco.c >> @@ -48,6 +48,8 @@ struct sco_conn { >> spinlock_t lock; >> struct sock *sk; >> >> + struct delayed_work timeout_work; >> + >> unsigned int mtu; >> }; >> >> @@ -74,9 +76,20 @@ struct sco_pinfo { >> #define SCO_CONN_TIMEOUT (HZ * 40) >> #define SCO_DISCONN_TIMEOUT (HZ * 2) >> >> -static void sco_sock_timeout(struct timer_list *t) >> +static void sco_sock_timeout(struct work_struct *work) >> { >> - struct sock *sk = from_timer(sk, t, sk_timer); >> + struct sco_conn *conn = container_of(work, struct sco_conn, >> + timeout_work.work); >> + struct sock *sk; >> + >> + sco_conn_lock(conn); >> + sk = conn->sk; >> + if (sk) >> + sock_hold(sk); > > syzbot complains here that sk refcount can be zero at this time. > > refcount_t: addition on 0; use-after-free. 
> WARNING: CPU: 0 PID: 10451 at lib/refcount.c:25 refcount_warn_saturate+0x169/0x1e0 lib/refcount.c:25 > Modules linked in: > CPU: 0 PID: 10451 Comm: kworker/0:8 Not tainted 5.14.0-rc7-syzkaller #0 > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 > Workqueue: events sco_sock_timeout > RIP: 0010:refcount_warn_saturate+0x169/0x1e0 lib/refcount.c:25 > Code: 09 31 ff 89 de e8 d7 c9 9e fd 84 db 0f 85 36 ff ff ff e8 8a c3 9e fd 48 c7 c7 20 8f e3 89 c6 05 e8 7f 81 09 01 e8 f0 98 16 05 <0f> 0b e9 17 ff ff ff e8 6b c3 9e fd 0f b6 1d cd 7f 81 09 31 ff 89 > RSP: 0018:ffffc9001766fce8 EFLAGS: 00010282 > RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 > RDX: ffff88802cea3880 RSI: ffffffff815d87a5 RDI: fffff52002ecdf8f > RBP: 0000000000000002 R08: 0000000000000000 R09: 0000000000000000 > R10: ffffffff815d25de R11: 0000000000000000 R12: ffff88806d23ce08 > R13: ffff8880712c8080 R14: ffff88802edf4500 R15: ffff8880b9c51240 > FS: 0000000000000000(0000) GS:ffff8880b9c00000(0000) knlGS:0000000000000000 > CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > CR2: 00007f3748c20000 CR3: 0000000017644000 CR4: 00000000001506f0 > DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 > Call Trace: > __refcount_add include/linux/refcount.h:199 [inline] > __refcount_inc include/linux/refcount.h:250 [inline] > refcount_inc include/linux/refcount.h:267 [inline] > sock_hold include/net/sock.h:702 [inline] > sco_sock_timeout+0x216/0x290 net/bluetooth/sco.c:88 > process_one_work+0x98d/0x1630 kernel/workqueue.c:2276 > worker_thread+0x658/0x11f0 kernel/workqueue.c:2422 > kthread+0x3e5/0x4d0 kernel/kthread.c:319 > > >> + sco_conn_unlock(conn); >> + >> + if (!sk) >> + return; >> >> BT_DBG("sock %p state %d", sk, sk->sk_state); >> >> @@ -91,14 +104,21 @@ static void sco_sock_timeout(struct timer_list *t) >> >> static void sco_sock_set_timer(struct sock *sk, long 
timeout) >> { >> + if (!sco_pi(sk)->conn) >> + return; >> + >> BT_DBG("sock %p state %d timeout %ld", sk, sk->sk_state, timeout); >> - sk_reset_timer(sk, &sk->sk_timer, jiffies + timeout); >> + cancel_delayed_work(&sco_pi(sk)->conn->timeout_work); >> + schedule_delayed_work(&sco_pi(sk)->conn->timeout_work, timeout); > >> } >> >> static void sco_sock_clear_timer(struct sock *sk) >> { >> + if (!sco_pi(sk)->conn) >> + return; >> + >> BT_DBG("sock %p state %d", sk, sk->sk_state); >> - sk_stop_timer(sk, &sk->sk_timer); >> + cancel_delayed_work(&sco_pi(sk)->conn->timeout_work); > > >> } >> >> /* ---- SCO connections ---- */ >> @@ -179,6 +199,9 @@ static void sco_conn_del(struct hci_conn *hcon, int err) >> bh_unlock_sock(sk); >> sco_sock_kill(sk); >> sock_put(sk); >> + >> + /* Ensure no more work items will run before freeing conn. */ > > Maybe you should have done this cancel_delayed_work_sync() before the prior sock_put(sk) ? > >> + cancel_delayed_work_sync(&conn->timeout_work); >> } >> >> hcon->sco_data = NULL; >> @@ -193,6 +216,8 @@ static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, >> sco_pi(sk)->conn = conn; >> conn->sk = sk; >> >> + INIT_DELAYED_WORK(&conn->timeout_work, sco_sock_timeout); >> + >> if (parent) >> bt_accept_enqueue(parent, sk, true); >> } >> @@ -500,8 +525,6 @@ static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, >> >> sco_pi(sk)->setting = BT_VOICE_CVSD_16BIT; >> >> - timer_setup(&sk->sk_timer, sco_sock_timeout, 0); >> - >> bt_sock_link(&sco_sk_list, sk); >> return sk; >> } >> Hi Eric, This actually seems to be a pre-existing error in sco_sock_connect that we now hit in sco_sock_timeout. Any thoughts on the following patch to address the problem? Link: https://lore.kernel.org/lkml/20210831065601.101185-1-desmondcheongzx@gmail.com/
On 9/2/21 12:32 PM, Desmond Cheong Zhi Xi wrote: > > Hi Eric, > > This actually seems to be a pre-existing error in sco_sock_connect that we now hit in sco_sock_timeout. > > Any thoughts on the following patch to address the problem? > > Link: https://lore.kernel.org/lkml/20210831065601.101185-1-desmondcheongzx@gmail.com/ syzbot is still working on finding a repro, this is obviously not trivial, because this is a race window. I think this can happen even with a single SCO connection. This might be triggered more easily forcing a delay in sco_sock_timeout() diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 98a88158651281c9f75c4e0371044251e976e7ef..71ebe0243fab106c676c308724fe3a3f92a62cbd 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -84,8 +84,14 @@ static void sco_sock_timeout(struct work_struct *work) sco_conn_lock(conn); sk = conn->sk; - if (sk) + if (sk) { + // lets pretend cpu has been busy (in interrupts) for 100ms + int i; + for (i=0;i<100000;i++) + udelay(1); + sock_hold(sk); + } sco_conn_unlock(conn); if (!sk) Stack trace tells us that sco_sock_timeout() is running after last reference on socket has been released. __refcount_add include/linux/refcount.h:199 [inline] __refcount_inc include/linux/refcount.h:250 [inline] refcount_inc include/linux/refcount.h:267 [inline] sock_hold include/net/sock.h:702 [inline] sco_sock_timeout+0x216/0x290 net/bluetooth/sco.c:88 process_one_work+0x98d/0x1630 kernel/workqueue.c:2276 worker_thread+0x658/0x11f0 kernel/workqueue.c:2422 kthread+0x3e5/0x4d0 kernel/kthread.c:319 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 This is why I suggested to delay sock_put() to make sure this can not happen. 
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 98a88158651281c9f75c4e0371044251e976e7ef..bd0222e3f05a6bcb40cffe8405c9dfff98d7afde 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -195,10 +195,11 @@ static void sco_conn_del(struct hci_conn *hcon, int err) sco_sock_clear_timer(sk); sco_chan_del(sk, err); release_sock(sk); - sock_put(sk); /* Ensure no more work items will run before freeing conn. */ cancel_delayed_work_sync(&conn->timeout_work); + + sock_put(sk); } hcon->sco_data = NULL;
On 2/9/21 5:41 pm, Eric Dumazet wrote: > > > On 9/2/21 12:32 PM, Desmond Cheong Zhi Xi wrote: >> >> Hi Eric, >> >> This actually seems to be a pre-existing error in sco_sock_connect that we now hit in sco_sock_timeout. >> >> Any thoughts on the following patch to address the problem? >> >> Link: https://lore.kernel.org/lkml/20210831065601.101185-1-desmondcheongzx@gmail.com/ > > > syzbot is still working on finding a repro, this is obviously not trivial, > because this is a race window. > > I think this can happen even with a single SCO connection. > > This might be triggered more easily forcing a delay in sco_sock_timeout() > > diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c > index 98a88158651281c9f75c4e0371044251e976e7ef..71ebe0243fab106c676c308724fe3a3f92a62cbd 100644 > --- a/net/bluetooth/sco.c > +++ b/net/bluetooth/sco.c > @@ -84,8 +84,14 @@ static void sco_sock_timeout(struct work_struct *work) > > sco_conn_lock(conn); > sk = conn->sk; > - if (sk) > + if (sk) { > + // lets pretend cpu has been busy (in interrupts) for 100ms > + int i; > + for (i=0;i<100000;i++) > + udelay(1); > + > sock_hold(sk); > + }> sco_conn_unlock(conn); > > if (!sk) > > > Stack trace tells us that sco_sock_timeout() is running after last reference > on socket has been released. > > __refcount_add include/linux/refcount.h:199 [inline] > __refcount_inc include/linux/refcount.h:250 [inline] > refcount_inc include/linux/refcount.h:267 [inline] > sock_hold include/net/sock.h:702 [inline] > sco_sock_timeout+0x216/0x290 net/bluetooth/sco.c:88 > process_one_work+0x98d/0x1630 kernel/workqueue.c:2276 > worker_thread+0x658/0x11f0 kernel/workqueue.c:2422 > kthread+0x3e5/0x4d0 kernel/kthread.c:319 > ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 > > This is why I suggested to delay sock_put() to make sure this can not happen. 
> > diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c > index 98a88158651281c9f75c4e0371044251e976e7ef..bd0222e3f05a6bcb40cffe8405c9dfff98d7afde 100644 > --- a/net/bluetooth/sco.c > +++ b/net/bluetooth/sco.c > @@ -195,10 +195,11 @@ static void sco_conn_del(struct hci_conn *hcon, int err) > sco_sock_clear_timer(sk); > sco_chan_del(sk, err); > release_sock(sk); > - sock_put(sk); > > /* Ensure no more work items will run before freeing conn. */ > cancel_delayed_work_sync(&conn->timeout_work); > + > + sock_put(sk); > } > > hcon->sco_data = NULL; > I see where you're going with this, but once sco_chan_del returns, any instance of sco_sock_timeout that hasn't yet called sock_hold will simply return, because conn->sk is NULL. Adding a delay to the sco_conn_lock critical section in sco_sock_timeout would not affect this because sco_chan_del clears conn->sk while holding onto the lock. The main reason that cancel_delayed_work_sync is run there is to make sure that we don't have a UAF on the SCO connection itself after we free conn. For a single SCO connection with well-formed channel, I think there can't be a race. Here's the reasoning: - For the timeout to be scheduled, a socket must have a channel with a connection. - When a channel between a socket and connection is established, the socket transitions from BT_OPEN to BT_CONNECTED, BT_CONNECT, or BT_CONNECT2. - For a socket to be released, it has to be zapped. For sockets that have a state of BT_CONNECTED, BT_CONNECT, or BT_CONNECT2, they are zapped only when the channel is deleted. - If the channel is deleted (which is protected by sco_conn_lock), then conn->sk is NULL, and sco_sock_timeout simply exits. If we had entered the critical section in sco_sock_timeout before the channel was deleted, then we increased the reference count on the socket, so it won't be freed until sco_sock_timeout is done. Hence, sco_sock_timeout doesn't race with the release of a socket that has a well-formed channel with a connection. 
But if multiple connections are allocated and overwritten in sco_sock_connect, then none of the above assumptions hold because the SCO connection can't be cleaned up (i.e. conn->sk cannot be set to NULL) when the associated socket is released. This scenario happens in the syzbot reproducer for the crash here: https://syzkaller.appspot.com/bug?id=bcc246d137428d00ed14b476c2068579515fe2bc That aside, upon taking a closer look, I think there is indeed a race lurking in sco_conn_del, but it's not the one that syzbot is hitting. Our sock_hold simply comes too late, and by the time it's called we might already have freed the socket. So probably something like this needs to happen: diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index fa25b07120c9..ea18e5b56343 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -187,10 +187,11 @@ static void sco_conn_del(struct hci_conn *hcon, int err) /* Kill socket */ sco_conn_lock(conn); sk = conn->sk; + if (sk) + sock_hold(sk); sco_conn_unlock(conn); if (sk) { - sock_hold(sk); lock_sock(sk); sco_sock_clear_timer(sk); sco_chan_del(sk, err);
On 2/9/21 6:53 pm, Desmond Cheong Zhi Xi wrote: > On 2/9/21 5:41 pm, Eric Dumazet wrote: >> >> >> On 9/2/21 12:32 PM, Desmond Cheong Zhi Xi wrote: >>> >>> Hi Eric, >>> >>> This actually seems to be a pre-existing error in sco_sock_connect >>> that we now hit in sco_sock_timeout. >>> >>> Any thoughts on the following patch to address the problem? >>> >>> Link: >>> https://lore.kernel.org/lkml/20210831065601.101185-1-desmondcheongzx@gmail.com/ >>> >> >> >> syzbot is still working on finding a repro, this is obviously not >> trivial, >> because this is a race window. >> >> I think this can happen even with a single SCO connection. >> >> This might be triggered more easily forcing a delay in sco_sock_timeout() >> >> diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c >> index >> 98a88158651281c9f75c4e0371044251e976e7ef..71ebe0243fab106c676c308724fe3a3f92a62cbd >> 100644 >> --- a/net/bluetooth/sco.c >> +++ b/net/bluetooth/sco.c >> @@ -84,8 +84,14 @@ static void sco_sock_timeout(struct work_struct *work) >> sco_conn_lock(conn); >> sk = conn->sk; >> - if (sk) >> + if (sk) { >> + // lets pretend cpu has been busy (in interrupts) for >> 100ms >> + int i; >> + for (i=0;i<100000;i++) >> + udelay(1); >> + >> sock_hold(sk); >> + }> sco_conn_unlock(conn); >> if (!sk) >> >> >> Stack trace tells us that sco_sock_timeout() is running after last >> reference >> on socket has been released. >> >> __refcount_add include/linux/refcount.h:199 [inline] >> __refcount_inc include/linux/refcount.h:250 [inline] >> refcount_inc include/linux/refcount.h:267 [inline] >> sock_hold include/net/sock.h:702 [inline] >> sco_sock_timeout+0x216/0x290 net/bluetooth/sco.c:88 >> process_one_work+0x98d/0x1630 kernel/workqueue.c:2276 >> worker_thread+0x658/0x11f0 kernel/workqueue.c:2422 >> kthread+0x3e5/0x4d0 kernel/kthread.c:319 >> ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 >> >> This is why I suggested to delay sock_put() to make sure this can not >> happen. 
>> >> diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c >> index >> 98a88158651281c9f75c4e0371044251e976e7ef..bd0222e3f05a6bcb40cffe8405c9dfff98d7afde >> 100644 >> --- a/net/bluetooth/sco.c >> +++ b/net/bluetooth/sco.c >> @@ -195,10 +195,11 @@ static void sco_conn_del(struct hci_conn *hcon, >> int err) >> sco_sock_clear_timer(sk); >> sco_chan_del(sk, err); >> release_sock(sk); >> - sock_put(sk); >> /* Ensure no more work items will run before freeing >> conn. */ >> cancel_delayed_work_sync(&conn->timeout_work); >> + >> + sock_put(sk); >> } >> hcon->sco_data = NULL; >> > > I see where you're going with this, but once sco_chan_del returns, any > instance of sco_sock_timeout that hasn't yet called sock_hold will > simply return, because conn->sk is NULL. Adding a delay to the > sco_conn_lock critical section in sco_sock_timeout would not affect this > because sco_chan_del clears conn->sk while holding onto the lock. > > The main reason that cancel_delayed_work_sync is run there is to make > sure that we don't have a UAF on the SCO connection itself after we free > conn. > Now that I think about this, the init and cleanup isn't quite right either. The delayed work should be initialized when the connection is allocated, and we should always cancel all work before freeing: diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index ea18e5b56343..bba5cdb4cb4a 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -133,6 +133,7 @@ static struct sco_conn *sco_conn_add(struct hci_conn *hcon) return NULL; spin_lock_init(&conn->lock); + INIT_DELAYED_WORK(&conn->timeout_work, sco_sock_timeout); hcon->sco_data = conn; conn->hcon = hcon; @@ -197,11 +198,11 @@ static void sco_conn_del(struct hci_conn *hcon, int err) sco_chan_del(sk, err); release_sock(sk); sock_put(sk); - - /* Ensure no more work items will run before freeing conn. */ - cancel_delayed_work_sync(&conn->timeout_work); } + /* Ensure no more work items will run before freeing conn. 
*/ + cancel_delayed_work_sync(&conn->timeout_work); + hcon->sco_data = NULL; kfree(conn); } @@ -214,8 +215,6 @@ static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, sco_pi(sk)->conn = conn; conn->sk = sk; - INIT_DELAYED_WORK(&conn->timeout_work, sco_sock_timeout); - if (parent) bt_accept_enqueue(parent, sk, true); } > For a single SCO connection with well-formed channel, I think there > can't be a race. Here's the reasoning: > > - For the timeout to be scheduled, a socket must have a channel with a > connection. > > - When a channel between a socket and connection is established, the > socket transitions from BT_OPEN to BT_CONNECTED, BT_CONNECT, or > BT_CONNECT2. > > - For a socket to be released, it has to be zapped. For sockets that > have a state of BT_CONNECTED, BT_CONNECT, or BT_CONNECT2, they are > zapped only when the channel is deleted. > > - If the channel is deleted (which is protected by sco_conn_lock), then > conn->sk is NULL, and sco_sock_timeout simply exits. If we had entered > the critical section in sco_sock_timeout before the channel was deleted, > then we increased the reference count on the socket, so it won't be > freed until sco_sock_timeout is done. > > Hence, sco_sock_timeout doesn't race with the release of a socket that > has a well-formed channel with a connection. > > But if multiple connections are allocated and overwritten in > sco_sock_connect, then none of the above assumptions hold because the > SCO connection can't be cleaned up (i.e. conn->sk cannot be set to NULL) > when the associated socket is released. This scenario happens in the > syzbot reproducer for the crash here: > https://syzkaller.appspot.com/bug?id=bcc246d137428d00ed14b476c2068579515fe2bc > > > That aside, upon taking a closer look, I think there is indeed a race > lurking in sco_conn_del, but it's not the one that syzbot is hitting. > Our sock_hold simply comes too late, and by the time it's called we > might have already have freed the socket. 
> > So probably something like this needs to happen: > > diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c > index fa25b07120c9..ea18e5b56343 100644 > --- a/net/bluetooth/sco.c > +++ b/net/bluetooth/sco.c > @@ -187,10 +187,11 @@ static void sco_conn_del(struct hci_conn *hcon, > int err) > /* Kill socket */ > sco_conn_lock(conn); > sk = conn->sk; > + if (sk) > + sock_hold(sk); > sco_conn_unlock(conn); > > if (sk) { > - sock_hold(sk); > lock_sock(sk); > sco_sock_clear_timer(sk); > sco_chan_del(sk, err);
Hi Desmond, On Thu, Sep 2, 2021 at 4:05 PM Desmond Cheong Zhi Xi <desmondcheongzx@gmail.com> wrote: > > On 2/9/21 6:53 pm, Desmond Cheong Zhi Xi wrote: > > On 2/9/21 5:41 pm, Eric Dumazet wrote: > >> > >> > >> On 9/2/21 12:32 PM, Desmond Cheong Zhi Xi wrote: > >>> > >>> Hi Eric, > >>> > >>> This actually seems to be a pre-existing error in sco_sock_connect > >>> that we now hit in sco_sock_timeout. > >>> > >>> Any thoughts on the following patch to address the problem? > >>> > >>> Link: > >>> https://lore.kernel.org/lkml/20210831065601.101185-1-desmondcheongzx@gmail.com/ > >>> > >> > >> > >> syzbot is still working on finding a repro, this is obviously not > >> trivial, > >> because this is a race window. > >> > >> I think this can happen even with a single SCO connection. > >> > >> This might be triggered more easily forcing a delay in sco_sock_timeout() > >> > >> diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c > >> index > >> 98a88158651281c9f75c4e0371044251e976e7ef..71ebe0243fab106c676c308724fe3a3f92a62cbd > >> 100644 > >> --- a/net/bluetooth/sco.c > >> +++ b/net/bluetooth/sco.c > >> @@ -84,8 +84,14 @@ static void sco_sock_timeout(struct work_struct *work) > >> sco_conn_lock(conn); > >> sk = conn->sk; > >> - if (sk) > >> + if (sk) { > >> + // lets pretend cpu has been busy (in interrupts) for > >> 100ms > >> + int i; > >> + for (i=0;i<100000;i++) > >> + udelay(1); > >> + > >> sock_hold(sk); > >> + }> sco_conn_unlock(conn); > >> if (!sk) > >> > >> > >> Stack trace tells us that sco_sock_timeout() is running after last > >> reference > >> on socket has been released. 
> >> > >> __refcount_add include/linux/refcount.h:199 [inline] > >> __refcount_inc include/linux/refcount.h:250 [inline] > >> refcount_inc include/linux/refcount.h:267 [inline] > >> sock_hold include/net/sock.h:702 [inline] > >> sco_sock_timeout+0x216/0x290 net/bluetooth/sco.c:88 > >> process_one_work+0x98d/0x1630 kernel/workqueue.c:2276 > >> worker_thread+0x658/0x11f0 kernel/workqueue.c:2422 > >> kthread+0x3e5/0x4d0 kernel/kthread.c:319 > >> ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 > >> > >> This is why I suggested to delay sock_put() to make sure this can not > >> happen. > >> > >> diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c > >> index > >> 98a88158651281c9f75c4e0371044251e976e7ef..bd0222e3f05a6bcb40cffe8405c9dfff98d7afde > >> 100644 > >> --- a/net/bluetooth/sco.c > >> +++ b/net/bluetooth/sco.c > >> @@ -195,10 +195,11 @@ static void sco_conn_del(struct hci_conn *hcon, > >> int err) > >> sco_sock_clear_timer(sk); > >> sco_chan_del(sk, err); > >> release_sock(sk); > >> - sock_put(sk); > >> /* Ensure no more work items will run before freeing > >> conn. */ > >> cancel_delayed_work_sync(&conn->timeout_work); > >> + > >> + sock_put(sk); > >> } > >> hcon->sco_data = NULL; > >> > > > > I see where you're going with this, but once sco_chan_del returns, any > > instance of sco_sock_timeout that hasn't yet called sock_hold will > > simply return, because conn->sk is NULL. Adding a delay to the > > sco_conn_lock critical section in sco_sock_timeout would not affect this > > because sco_chan_del clears conn->sk while holding onto the lock. > > > > The main reason that cancel_delayed_work_sync is run there is to make > > sure that we don't have a UAF on the SCO connection itself after we free > > conn. > > > > Now that I think about this, the init and cleanup isn't quite right > either. 
The delayed work should be initialized when the connection is > allocated, and we should always cancel all work before freeing: > > diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c > index ea18e5b56343..bba5cdb4cb4a 100644 > --- a/net/bluetooth/sco.c > +++ b/net/bluetooth/sco.c > @@ -133,6 +133,7 @@ static struct sco_conn *sco_conn_add(struct hci_conn *hcon) > return NULL; > > spin_lock_init(&conn->lock); > + INIT_DELAYED_WORK(&conn->timeout_work, sco_sock_timeout); > > hcon->sco_data = conn; > conn->hcon = hcon; > @@ -197,11 +198,11 @@ static void sco_conn_del(struct hci_conn *hcon, int err) > sco_chan_del(sk, err); > release_sock(sk); > sock_put(sk); > - > - /* Ensure no more work items will run before freeing conn. */ > - cancel_delayed_work_sync(&conn->timeout_work); > } > > + /* Ensure no more work items will run before freeing conn. */ > + cancel_delayed_work_sync(&conn->timeout_work); > + > hcon->sco_data = NULL; > kfree(conn); > } > @@ -214,8 +215,6 @@ static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, > sco_pi(sk)->conn = conn; > conn->sk = sk; > > - INIT_DELAYED_WORK(&conn->timeout_work, sco_sock_timeout); > - > if (parent) > bt_accept_enqueue(parent, sk, true); > } I have come to something similar, do you care to send a proper patch so we can get this merged. > > For a single SCO connection with well-formed channel, I think there > > can't be a race. Here's the reasoning: > > > > - For the timeout to be scheduled, a socket must have a channel with a > > connection. > > > > - When a channel between a socket and connection is established, the > > socket transitions from BT_OPEN to BT_CONNECTED, BT_CONNECT, or > > BT_CONNECT2. > > > > - For a socket to be released, it has to be zapped. For sockets that > > have a state of BT_CONNECTED, BT_CONNECT, or BT_CONNECT2, they are > > zapped only when the channel is deleted. 
> > > > - If the channel is deleted (which is protected by sco_conn_lock), then > > conn->sk is NULL, and sco_sock_timeout simply exits. If we had entered > > the critical section in sco_sock_timeout before the channel was deleted, > > then we increased the reference count on the socket, so it won't be > > freed until sco_sock_timeout is done. > > > > Hence, sco_sock_timeout doesn't race with the release of a socket that > > has a well-formed channel with a connection. > > > > But if multiple connections are allocated and overwritten in > > sco_sock_connect, then none of the above assumptions hold because the > > SCO connection can't be cleaned up (i.e. conn->sk cannot be set to NULL) > > when the associated socket is released. This scenario happens in the > > syzbot reproducer for the crash here: > > https://syzkaller.appspot.com/bug?id=bcc246d137428d00ed14b476c2068579515fe2bc > > > > > > That aside, upon taking a closer look, I think there is indeed a race > > lurking in sco_conn_del, but it's not the one that syzbot is hitting. > > Our sock_hold simply comes too late, and by the time it's called we > > might have already freed the socket. > > > > So probably something like this needs to happen: > > > > diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c > > index fa25b07120c9..ea18e5b56343 100644 > > --- a/net/bluetooth/sco.c > > +++ b/net/bluetooth/sco.c > > @@ -187,10 +187,11 @@ static void sco_conn_del(struct hci_conn *hcon, > > int err) > > /* Kill socket */ > > sco_conn_lock(conn); > > sk = conn->sk; > > + if (sk) > > + sock_hold(sk); > > sco_conn_unlock(conn); > > > > if (sk) { > > - sock_hold(sk); > > lock_sock(sk); > > sco_sock_clear_timer(sk); > > sco_chan_del(sk, err); >
Hi Luiz, On 2/9/21 7:42 pm, Luiz Augusto von Dentz wrote: > Hi Desmond, > > On Thu, Sep 2, 2021 at 4:05 PM Desmond Cheong Zhi Xi > <desmondcheongzx@gmail.com> wrote: >> >> On 2/9/21 6:53 pm, Desmond Cheong Zhi Xi wrote: >>> On 2/9/21 5:41 pm, Eric Dumazet wrote: >>>> >>>> >>>> On 9/2/21 12:32 PM, Desmond Cheong Zhi Xi wrote: >>>>> >>>>> Hi Eric, >>>>> >>>>> This actually seems to be a pre-existing error in sco_sock_connect >>>>> that we now hit in sco_sock_timeout. >>>>> >>>>> Any thoughts on the following patch to address the problem? >>>>> >>>>> Link: >>>>> https://lore.kernel.org/lkml/20210831065601.101185-1-desmondcheongzx@gmail.com/ >>>>> >>>> >>>> >>>> syzbot is still working on finding a repro, this is obviously not >>>> trivial, >>>> because this is a race window. >>>> >>>> I think this can happen even with a single SCO connection. >>>> >>>> This might be triggered more easily forcing a delay in sco_sock_timeout() >>>> >>>> diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c >>>> index >>>> 98a88158651281c9f75c4e0371044251e976e7ef..71ebe0243fab106c676c308724fe3a3f92a62cbd >>>> 100644 >>>> --- a/net/bluetooth/sco.c >>>> +++ b/net/bluetooth/sco.c >>>> @@ -84,8 +84,14 @@ static void sco_sock_timeout(struct work_struct *work) >>>> sco_conn_lock(conn); >>>> sk = conn->sk; >>>> - if (sk) >>>> + if (sk) { >>>> + // lets pretend cpu has been busy (in interrupts) for >>>> 100ms >>>> + int i; >>>> + for (i=0;i<100000;i++) >>>> + udelay(1); >>>> + >>>> sock_hold(sk); >>>> + }> sco_conn_unlock(conn); >>>> if (!sk) >>>> >>>> >>>> Stack trace tells us that sco_sock_timeout() is running after last >>>> reference >>>> on socket has been released. 
>>>> >>>> __refcount_add include/linux/refcount.h:199 [inline] >>>> __refcount_inc include/linux/refcount.h:250 [inline] >>>> refcount_inc include/linux/refcount.h:267 [inline] >>>> sock_hold include/net/sock.h:702 [inline] >>>> sco_sock_timeout+0x216/0x290 net/bluetooth/sco.c:88 >>>> process_one_work+0x98d/0x1630 kernel/workqueue.c:2276 >>>> worker_thread+0x658/0x11f0 kernel/workqueue.c:2422 >>>> kthread+0x3e5/0x4d0 kernel/kthread.c:319 >>>> ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 >>>> >>>> This is why I suggested to delay sock_put() to make sure this can not >>>> happen. >>>> >>>> diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c >>>> index >>>> 98a88158651281c9f75c4e0371044251e976e7ef..bd0222e3f05a6bcb40cffe8405c9dfff98d7afde >>>> 100644 >>>> --- a/net/bluetooth/sco.c >>>> +++ b/net/bluetooth/sco.c >>>> @@ -195,10 +195,11 @@ static void sco_conn_del(struct hci_conn *hcon, >>>> int err) >>>> sco_sock_clear_timer(sk); >>>> sco_chan_del(sk, err); >>>> release_sock(sk); >>>> - sock_put(sk); >>>> /* Ensure no more work items will run before freeing >>>> conn. */ >>>> cancel_delayed_work_sync(&conn->timeout_work); >>>> + >>>> + sock_put(sk); >>>> } >>>> hcon->sco_data = NULL; >>>> >>> >>> I see where you're going with this, but once sco_chan_del returns, any >>> instance of sco_sock_timeout that hasn't yet called sock_hold will >>> simply return, because conn->sk is NULL. Adding a delay to the >>> sco_conn_lock critical section in sco_sock_timeout would not affect this >>> because sco_chan_del clears conn->sk while holding onto the lock. >>> >>> The main reason that cancel_delayed_work_sync is run there is to make >>> sure that we don't have a UAF on the SCO connection itself after we free >>> conn. >>> >> >> Now that I think about this, the init and cleanup isn't quite right >> either. 
The delayed work should be initialized when the connection is >> allocated, and we should always cancel all work before freeing: >> >> diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c >> index ea18e5b56343..bba5cdb4cb4a 100644 >> --- a/net/bluetooth/sco.c >> +++ b/net/bluetooth/sco.c >> @@ -133,6 +133,7 @@ static struct sco_conn *sco_conn_add(struct hci_conn *hcon) >> return NULL; >> >> spin_lock_init(&conn->lock); >> + INIT_DELAYED_WORK(&conn->timeout_work, sco_sock_timeout); >> >> hcon->sco_data = conn; >> conn->hcon = hcon; >> @@ -197,11 +198,11 @@ static void sco_conn_del(struct hci_conn *hcon, int err) >> sco_chan_del(sk, err); >> release_sock(sk); >> sock_put(sk); >> - >> - /* Ensure no more work items will run before freeing conn. */ >> - cancel_delayed_work_sync(&conn->timeout_work); >> } >> >> + /* Ensure no more work items will run before freeing conn. */ >> + cancel_delayed_work_sync(&conn->timeout_work); >> + >> hcon->sco_data = NULL; >> kfree(conn); >> } >> @@ -214,8 +215,6 @@ static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, >> sco_pi(sk)->conn = conn; >> conn->sk = sk; >> >> - INIT_DELAYED_WORK(&conn->timeout_work, sco_sock_timeout); >> - >> if (parent) >> bt_accept_enqueue(parent, sk, true); >> } > > I have come to something similar, do you care to send a proper patch > so we can get this merged. > Sounds good. Just finished running some tests locally, I'll send out the patches now. >>> For a single SCO connection with well-formed channel, I think there >>> can't be a race. Here's the reasoning: >>> >>> - For the timeout to be scheduled, a socket must have a channel with a >>> connection. >>> >>> - When a channel between a socket and connection is established, the >>> socket transitions from BT_OPEN to BT_CONNECTED, BT_CONNECT, or >>> BT_CONNECT2. >>> >>> - For a socket to be released, it has to be zapped. 
For sockets that >>> have a state of BT_CONNECTED, BT_CONNECT, or BT_CONNECT2, they are >>> zapped only when the channel is deleted. >>> >>> - If the channel is deleted (which is protected by sco_conn_lock), then >>> conn->sk is NULL, and sco_sock_timeout simply exits. If we had entered >>> the critical section in sco_sock_timeout before the channel was deleted, >>> then we increased the reference count on the socket, so it won't be >>> freed until sco_sock_timeout is done. >>> >>> Hence, sco_sock_timeout doesn't race with the release of a socket that >>> has a well-formed channel with a connection. >>> >>> But if multiple connections are allocated and overwritten in >>> sco_sock_connect, then none of the above assumptions hold because the >>> SCO connection can't be cleaned up (i.e. conn->sk cannot be set to NULL) >>> when the associated socket is released. This scenario happens in the >>> syzbot reproducer for the crash here: >>> https://syzkaller.appspot.com/bug?id=bcc246d137428d00ed14b476c2068579515fe2bc >>> >>> >>> That aside, upon taking a closer look, I think there is indeed a race >>> lurking in sco_conn_del, but it's not the one that syzbot is hitting. >>> Our sock_hold simply comes too late, and by the time it's called we >>> might have already freed the socket. >>> >>> So probably something like this needs to happen: >>> >>> diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c >>> index fa25b07120c9..ea18e5b56343 100644 >>> --- a/net/bluetooth/sco.c >>> +++ b/net/bluetooth/sco.c >>> @@ -187,10 +187,11 @@ static void sco_conn_del(struct hci_conn *hcon, >>> int err) >>> /* Kill socket */ >>> sco_conn_lock(conn); >>> sk = conn->sk; >>> + if (sk) >>> + sock_hold(sk); >>> sco_conn_unlock(conn); >>> >>> if (sk) { >>> - sock_hold(sk); >>> lock_sock(sk); >>> sco_sock_clear_timer(sk); >>> sco_chan_del(sk, err); >> > >
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index ffa2a77a3e4c..62e638f971a9 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -48,6 +48,8 @@ struct sco_conn { spinlock_t lock; struct sock *sk; + struct delayed_work timeout_work; + unsigned int mtu; }; @@ -74,9 +76,20 @@ struct sco_pinfo { #define SCO_CONN_TIMEOUT (HZ * 40) #define SCO_DISCONN_TIMEOUT (HZ * 2) -static void sco_sock_timeout(struct timer_list *t) +static void sco_sock_timeout(struct work_struct *work) { - struct sock *sk = from_timer(sk, t, sk_timer); + struct sco_conn *conn = container_of(work, struct sco_conn, + timeout_work.work); + struct sock *sk; + + sco_conn_lock(conn); + sk = conn->sk; + if (sk) + sock_hold(sk); + sco_conn_unlock(conn); + + if (!sk) + return; BT_DBG("sock %p state %d", sk, sk->sk_state); @@ -91,14 +104,21 @@ static void sco_sock_timeout(struct timer_list *t) static void sco_sock_set_timer(struct sock *sk, long timeout) { + if (!sco_pi(sk)->conn) + return; + BT_DBG("sock %p state %d timeout %ld", sk, sk->sk_state, timeout); - sk_reset_timer(sk, &sk->sk_timer, jiffies + timeout); + cancel_delayed_work(&sco_pi(sk)->conn->timeout_work); + schedule_delayed_work(&sco_pi(sk)->conn->timeout_work, timeout); } static void sco_sock_clear_timer(struct sock *sk) { + if (!sco_pi(sk)->conn) + return; + BT_DBG("sock %p state %d", sk, sk->sk_state); - sk_stop_timer(sk, &sk->sk_timer); + cancel_delayed_work(&sco_pi(sk)->conn->timeout_work); } /* ---- SCO connections ---- */ @@ -179,6 +199,9 @@ static void sco_conn_del(struct hci_conn *hcon, int err) bh_unlock_sock(sk); sco_sock_kill(sk); sock_put(sk); + + /* Ensure no more work items will run before freeing conn. 
*/ + cancel_delayed_work_sync(&conn->timeout_work); } hcon->sco_data = NULL; @@ -193,6 +216,8 @@ static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, sco_pi(sk)->conn = conn; conn->sk = sk; + INIT_DELAYED_WORK(&conn->timeout_work, sco_sock_timeout); + if (parent) bt_accept_enqueue(parent, sk, true); } @@ -500,8 +525,6 @@ static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, sco_pi(sk)->setting = BT_VOICE_CVSD_16BIT; - timer_setup(&sk->sk_timer, sco_sock_timeout, 0); - bt_sock_link(&sco_sk_list, sk); return sk; }