
net: qrtr: ns: Protect radix_tree_deref_slot() using rcu read locks

Message ID 20200926165625.11660-1-manivannan.sadhasivam@linaro.org
State New
Series net: qrtr: ns: Protect radix_tree_deref_slot() using rcu read locks

Commit Message

Manivannan Sadhasivam Sept. 26, 2020, 4:56 p.m. UTC
The RCU read locks are needed to avoid a potential race condition while
dereferencing the radix tree from multiple threads. The issue was identified
by syzbot. Below is the crash report:

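The fix wraps each radix tree walk in net/qrtr/ns.c in an RCU read-side
critical section, since radix_tree_deref_slot() must be called with
rcu_read_lock() held. Reduced to a minimal sketch (walk_servers() is a
made-up helper; only the locking idiom itself comes from the patch):

#include <linux/radix-tree.h>
#include <linux/rcupdate.h>

static void walk_servers(struct radix_tree_root *servers)
{
	struct radix_tree_iter iter;
	struct qrtr_server *srv;
	void __rcu **slot;

	rcu_read_lock();
	radix_tree_for_each_slot(slot, servers, &iter, 0) {
		srv = radix_tree_deref_slot(slot);
		/* use srv here; sleeping is not allowed in this section */
	}
	rcu_read_unlock();
}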

Comments

David Miller Sept. 28, 2020, 10:56 p.m. UTC | #1
From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Date: Sat, 26 Sep 2020 22:26:25 +0530

> The RCU read locks are needed to avoid a potential race condition while
> dereferencing the radix tree from multiple threads. The issue was identified
> by syzbot. Below is the crash report:
 ...
> Fixes: 0c2204a4ad71 ("net: qrtr: Migrate nameservice to kernel from userspace")
> Reported-and-tested-by: syzbot+0f84f6eed90503da72fc@syzkaller.appspotmail.com
> Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>

Applied and queued up for -stable, thank you.
Doug Anderson Oct. 1, 2020, 10:53 p.m. UTC | #2
Hi,

On Mon, Sep 28, 2020 at 4:15 PM David Miller <davem@davemloft.net> wrote:
>
> From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
> Date: Sat, 26 Sep 2020 22:26:25 +0530
>
> > The RCU read locks are needed to avoid a potential race condition while
> > dereferencing the radix tree from multiple threads. The issue was identified
> > by syzbot. Below is the crash report:
> >  ...
> > Fixes: 0c2204a4ad71 ("net: qrtr: Migrate nameservice to kernel from userspace")
> > Reported-and-tested-by: syzbot+0f84f6eed90503da72fc@syzkaller.appspotmail.com
> > Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
>
> Applied and queued up for -stable, thank you.

The cure is worse than the disease.  I tested by cherry-picking this back
to a 5.4-based kernel and got the crash below. I expect the crash would
also be present on mainline:

 BUG: sleeping function called from invalid context at net/core/sock.c:3000
 in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 7, name: kworker/u16:0
 3 locks held by kworker/u16:0/7:
  #0: ffffff81b65a7b28 ((wq_completion)qrtr_ns_handler){+.+.}, at:
process_one_work+0x1bc/0x614
  #1: ffffff81b6edfd58 ((work_completion)(&qrtr_ns.work)){+.+.}, at:
process_one_work+0x1e4/0x614
  #2: ffffffd01144c328 (rcu_read_lock){....}, at: rcu_lock_acquire+0x8/0x38
 CPU: 6 PID: 7 Comm: kworker/u16:0 Not tainted 5.4.68 #33
 Hardware name: Google Lazor (rev0) with LTE (DT)
 Workqueue: qrtr_ns_handler qrtr_ns_worker
 Call trace:
  dump_backtrace+0x0/0x158
  show_stack+0x20/0x2c
  dump_stack+0xdc/0x180
  ___might_sleep+0x1c0/0x1d0
  __might_sleep+0x50/0x88
  lock_sock_nested+0x34/0x94
  qrtr_sendmsg+0x7c/0x260
  sock_sendmsg+0x44/0x5c
  kernel_sendmsg+0x50/0x64
  lookup_notify+0xa8/0x118
  qrtr_ns_worker+0x8d8/0x1050
  process_one_work+0x338/0x614
  worker_thread+0x29c/0x46c
  kthread+0x150/0x160
  ret_from_fork+0x10/0x18

I'll give the stack crawl from kgdb too since inlining makes things
less obvious with the above...

(gdb) bt
#0  arch_kgdb_breakpoint ()
    at .../arch/arm64/include/asm/kgdb.h:21
#1  kgdb_breakpoint ()
    at .../kernel/debug/debug_core.c:1183
#2  0xffffffd010131058 in ___might_sleep (
    file=file@entry=0xffffffd010efec42 "net/core/sock.c",
    line=line@entry=3000, preempt_offset=preempt_offset@entry=0)
    at .../kernel/sched/core.c:7994
#3  0xffffffd010130ee0 in __might_sleep (
    file=0xffffffd010efec42 "net/core/sock.c", line=3000,
    preempt_offset=0)
    at .../kernel/sched/core.c:7965
#4  0xffffffd01094d1c8 in lock_sock_nested (
    sk=sk@entry=0xffffff8147e457c0, subclass=0)
    at .../net/core/sock.c:3000
#5  0xffffffd010b26028 in lock_sock (sk=0xffffff8147e457c0)
    at .../include/net/sock.h:1536
#6  qrtr_sendmsg (sock=0xffffff8148c4b240, msg=0xffffff81422afab8,
    len=20)
    at .../net/qrtr/qrtr.c:891
#7  0xffffffd01093f8f4 in sock_sendmsg_nosec (
    sock=0xffffff8148c4b240, msg=0xffffff81422afab8)
    at .../net/socket.c:638
#8  sock_sendmsg (sock=sock@entry=0xffffff8148c4b240,
    msg=msg@entry=0xffffff81422afab8)
    at .../net/socket.c:658
#9  0xffffffd01093f95c in kernel_sendmsg (sock=0x1,
    msg=msg@entry=0xffffff81422afab8, vec=<optimized out>,
    vec@entry=0xffffff81422afaa8, num=<optimized out>, num@entry=1,
    size=<optimized out>, size@entry=20)
    at .../net/socket.c:678
#10 0xffffffd010b28be0 in service_announce_new (
    dest=dest@entry=0xffffff81422afc20,
    srv=srv@entry=0xffffff81370f6380)
    at .../net/qrtr/ns.c:127
#11 0xffffffd010b279f4 in announce_servers (sq=0xffffff81422afc20)
    at .../net/qrtr/ns.c:207
#12 ctrl_cmd_hello (sq=0xffffff81422afc20)
    at .../net/qrtr/ns.c:328
#13 qrtr_ns_worker (work=<optimized out>)
    at .../net/qrtr/ns.c:661
#14 0xffffffd010119a94 in process_one_work (
    worker=worker@entry=0xffffff8142267900,
    work=0xffffffd0128ddaf8 <qrtr_ns+48>)
    at .../kernel/workqueue.c:2272
#15 0xffffffd01011a16c in worker_thread (
    __worker=__worker@entry=0xffffff8142267900)
    at .../kernel/workqueue.c:2418
#16 0xffffffd01011fb78 in kthread (_create=0xffffff8142269200)
    at .../kernel/kthread.c:268
#17 0xffffffd01008645c in ret_from_fork ()
    at .../arch/arm64/kernel/entry.S:1169
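
Put differently, with this patch announce_servers() ends up doing the
equivalent of the following (heavily simplified), and ___might_sleep()
rightly complains because lock_sock() can sleep:

	rcu_read_lock();
	radix_tree_for_each_slot(slot, &node->servers, &iter, 0) {
		srv = radix_tree_deref_slot(slot);
		/*
		 * service_announce_new() -> kernel_sendmsg() ->
		 * qrtr_sendmsg() -> lock_sock(): may sleep, which is
		 * not allowed inside an RCU read-side critical section.
		 */
		service_announce_new(sq, srv);
	}
	rcu_read_unlock();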


-Doug
Manivannan Sadhasivam Oct. 2, 2020, 7:10 a.m. UTC | #3
Hi Doug,

On Thu, Oct 01, 2020 at 03:53:12PM -0700, Doug Anderson wrote:
> Hi,
>
> On Mon, Sep 28, 2020 at 4:15 PM David Miller <davem@davemloft.net> wrote:
> >
> > From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
> > Date: Sat, 26 Sep 2020 22:26:25 +0530
> >
> > > The RCU read locks are needed to avoid a potential race condition while
> > > dereferencing the radix tree from multiple threads. The issue was identified
> > > by syzbot. Below is the crash report:
> > >  ...
> > > Fixes: 0c2204a4ad71 ("net: qrtr: Migrate nameservice to kernel from userspace")
> > > Reported-and-tested-by: syzbot+0f84f6eed90503da72fc@syzkaller.appspotmail.com
> > > Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
> >
> > Applied and queued up for -stable, thank you.
>
> The cure is worse than the disease.  I tested by cherry-picking this back
> to a 5.4-based kernel and got the crash below. I expect the crash would
> also be present on mainline:
>

Thanks for the report! I intended to fix the issue reported by syzbot but
failed to notice the lock_sock() call in qrtr_sendmsg(). That path must not
be entered while holding the RCU read lock, as lock_sock() might sleep.

I'll submit a patch to fix this issue asap.
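
Roughly, the fix needs to pin the entry and leave the RCU read-side
critical section before calling into the send path. An untested sketch
for announce_servers() (server_get()/server_put() are hypothetical
refcounting helpers, not existing ns.c API, and the actual patch may end
up looking different):

	rcu_read_lock();
	radix_tree_for_each_slot(slot, &node->servers, &iter, 0) {
		srv = radix_tree_deref_slot(slot);
		if (!srv)
			continue;

		server_get(srv);		/* hypothetical refcount */
		slot = radix_tree_iter_resume(slot, &iter);
		rcu_read_unlock();

		/* kernel_sendmsg() may sleep; legal outside the lock */
		ret = service_announce_new(sq, srv);
		server_put(srv);		/* hypothetical refcount */
		if (ret < 0) {
			pr_err("failed to announce new service\n");
			return ret;
		}

		rcu_read_lock();
	}
	rcu_read_unlock();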

Thanks,
Mani
 

Patch

=============================
WARNING: suspicious RCU usage
5.7.0-syzkaller #0 Not tainted
-----------------------------
include/linux/radix-tree.h:176 suspicious rcu_dereference_check() usage!

other info that might help us debug this:

rcu_scheduler_active = 2, debug_locks = 1
2 locks held by kworker/u4:1/21:
 #0: ffff88821b097938 ((wq_completion)qrtr_ns_handler){+.+.}-{0:0}, at: spin_unlock_irq include/linux/spinlock.h:403 [inline]
 #0: ffff88821b097938 ((wq_completion)qrtr_ns_handler){+.+.}-{0:0}, at: process_one_work+0x6df/0xfd0 kernel/workqueue.c:2241
 #1: ffffc90000dd7d80 ((work_completion)(&qrtr_ns.work)){+.+.}-{0:0}, at: process_one_work+0x71e/0xfd0 kernel/workqueue.c:2243

stack backtrace:
CPU: 0 PID: 21 Comm: kworker/u4:1 Not tainted 5.7.0-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Workqueue: qrtr_ns_handler qrtr_ns_worker
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1e9/0x30e lib/dump_stack.c:118
 radix_tree_deref_slot include/linux/radix-tree.h:176 [inline]
 ctrl_cmd_new_lookup net/qrtr/ns.c:558 [inline]
 qrtr_ns_worker+0x2aff/0x4500 net/qrtr/ns.c:674
 process_one_work+0x76e/0xfd0 kernel/workqueue.c:2268
 worker_thread+0xa7f/0x1450 kernel/workqueue.c:2414
 kthread+0x353/0x380 kernel/kthread.c:268

Fixes: 0c2204a4ad71 ("net: qrtr: Migrate nameservice to kernel from userspace")
Reported-and-tested-by: syzbot+0f84f6eed90503da72fc@syzkaller.appspotmail.com
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 net/qrtr/ns.c | 34 +++++++++++++++++++++++++---------
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c
index d8252fdab851..934999b56d60 100644
--- a/net/qrtr/ns.c
+++ b/net/qrtr/ns.c
@@ -193,12 +193,13 @@  static int announce_servers(struct sockaddr_qrtr *sq)
 	struct qrtr_server *srv;
 	struct qrtr_node *node;
 	void __rcu **slot;
-	int ret;
+	int ret = 0;
 
 	node = node_get(qrtr_ns.local_node);
 	if (!node)
 		return 0;
 
+	rcu_read_lock();
 	/* Announce the list of servers registered in this node */
 	radix_tree_for_each_slot(slot, &node->servers, &iter, 0) {
 		srv = radix_tree_deref_slot(slot);
@@ -206,11 +207,14 @@  static int announce_servers(struct sockaddr_qrtr *sq)
 		ret = service_announce_new(sq, srv);
 		if (ret < 0) {
 			pr_err("failed to announce new service\n");
-			return ret;
+			goto err_out;
 		}
 	}
 
-	return 0;
+err_out:
+	rcu_read_unlock();
+
+	return ret;
 }
 
 static struct qrtr_server *server_add(unsigned int service,
@@ -335,7 +339,7 @@  static int ctrl_cmd_bye(struct sockaddr_qrtr *from)
 	struct qrtr_node *node;
 	void __rcu **slot;
 	struct kvec iv;
-	int ret;
+	int ret = 0;
 
 	iv.iov_base = &pkt;
 	iv.iov_len = sizeof(pkt);
@@ -344,11 +348,13 @@  static int ctrl_cmd_bye(struct sockaddr_qrtr *from)
 	if (!node)
 		return 0;
 
+	rcu_read_lock();
 	/* Advertise removal of this client to all servers of remote node */
 	radix_tree_for_each_slot(slot, &node->servers, &iter, 0) {
 		srv = radix_tree_deref_slot(slot);
 		server_del(node, srv->port);
 	}
+	rcu_read_unlock();
 
 	/* Advertise the removal of this client to all local servers */
 	local_node = node_get(qrtr_ns.local_node);
@@ -359,6 +365,7 @@  static int ctrl_cmd_bye(struct sockaddr_qrtr *from)
 	pkt.cmd = cpu_to_le32(QRTR_TYPE_BYE);
 	pkt.client.node = cpu_to_le32(from->sq_node);
 
+	rcu_read_lock();
 	radix_tree_for_each_slot(slot, &local_node->servers, &iter, 0) {
 		srv = radix_tree_deref_slot(slot);
 
@@ -372,11 +379,14 @@  static int ctrl_cmd_bye(struct sockaddr_qrtr *from)
 		ret = kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt));
 		if (ret < 0) {
 			pr_err("failed to send bye cmd\n");
-			return ret;
+			goto err_out;
 		}
 	}
 
-	return 0;
+err_out:
+	rcu_read_unlock();
+
+	return ret;
 }
 
 static int ctrl_cmd_del_client(struct sockaddr_qrtr *from,
@@ -394,7 +404,7 @@  static int ctrl_cmd_del_client(struct sockaddr_qrtr *from,
 	struct list_head *li;
 	void __rcu **slot;
 	struct kvec iv;
-	int ret;
+	int ret = 0;
 
 	iv.iov_base = &pkt;
 	iv.iov_len = sizeof(pkt);
@@ -434,6 +444,7 @@  static int ctrl_cmd_del_client(struct sockaddr_qrtr *from,
 	pkt.client.node = cpu_to_le32(node_id);
 	pkt.client.port = cpu_to_le32(port);
 
+	rcu_read_lock();
 	radix_tree_for_each_slot(slot, &local_node->servers, &iter, 0) {
 		srv = radix_tree_deref_slot(slot);
 
@@ -447,11 +458,14 @@  static int ctrl_cmd_del_client(struct sockaddr_qrtr *from,
 		ret = kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt));
 		if (ret < 0) {
 			pr_err("failed to send del client cmd\n");
-			return ret;
+			goto err_out;
 		}
 	}
 
-	return 0;
+err_out:
+	rcu_read_unlock();
+
+	return ret;
 }
 
 static int ctrl_cmd_new_server(struct sockaddr_qrtr *from,
@@ -554,6 +568,7 @@  static int ctrl_cmd_new_lookup(struct sockaddr_qrtr *from,
 	filter.service = service;
 	filter.instance = instance;
 
+	rcu_read_lock();
 	radix_tree_for_each_slot(node_slot, &nodes, &node_iter, 0) {
 		node = radix_tree_deref_slot(node_slot);
 
@@ -568,6 +583,7 @@  static int ctrl_cmd_new_lookup(struct sockaddr_qrtr *from,
 			lookup_notify(from, srv, true);
 		}
 	}
+	rcu_read_unlock();
 
 	/* Empty notification, to indicate end of listing */
 	lookup_notify(from, NULL, true);