diff mbox series

[bpf-next,v5,07/11] af_unix: implement unix_dgram_bpf_recvmsg()

Message ID 20210704190252.11866-8-xiyou.wangcong@gmail.com
State New
Headers show
Series sockmap: add sockmap support for unix datagram socket | expand

Commit Message

Cong Wang July 4, 2021, 7:02 p.m. UTC
From: Cong Wang <cong.wang@bytedance.com>

We have to implement unix_dgram_bpf_recvmsg() to replace the
original ->recvmsg() to retrieve skmsg from ingress_msg.

AF_UNIX is again special here because of the lack of
sk_prot->recvmsg(). I simply add a special case inside
unix_dgram_recvmsg() to call sk->sk_prot->recvmsg() directly.

Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jakub Sitnicki <jakub@cloudflare.com>
Cc: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
---
 include/net/af_unix.h |  2 ++
 net/unix/af_unix.c    | 19 +++++++++--
 net/unix/unix_bpf.c   | 75 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 93 insertions(+), 3 deletions(-)

Comments

Eric Dumazet July 18, 2021, 5:49 p.m. UTC | #1
On 7/4/21 9:02 PM, Cong Wang wrote:
> From: Cong Wang <cong.wang@bytedance.com>

> 

> We have to implement unix_dgram_bpf_recvmsg() to replace the

> original ->recvmsg() to retrieve skmsg from ingress_msg.

> 

> AF_UNIX is again special here because the lack of

> sk_prot->recvmsg(). I simply add a special case inside

> unix_dgram_recvmsg() to call sk->sk_prot->recvmsg() directly.

> 

> Cc: John Fastabend <john.fastabend@gmail.com>

> Cc: Daniel Borkmann <daniel@iogearbox.net>

> Cc: Jakub Sitnicki <jakub@cloudflare.com>

> Cc: Lorenz Bauer <lmb@cloudflare.com>

> Signed-off-by: Cong Wang <cong.wang@bytedance.com>

> ---

>  include/net/af_unix.h |  2 ++

>  net/unix/af_unix.c    | 19 +++++++++--

>  net/unix/unix_bpf.c   | 75 +++++++++++++++++++++++++++++++++++++++++++

>  3 files changed, 93 insertions(+), 3 deletions(-)

> 

> diff --git a/include/net/af_unix.h b/include/net/af_unix.h

> index cca645846af1..435a2c3d5a6f 100644

> --- a/include/net/af_unix.h

> +++ b/include/net/af_unix.h

> @@ -82,6 +82,8 @@ static inline struct unix_sock *unix_sk(const struct sock *sk)

>  long unix_inq_len(struct sock *sk);

>  long unix_outq_len(struct sock *sk);

>  

> +int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,

> +			 int flags);

>  #ifdef CONFIG_SYSCTL

>  int unix_sysctl_register(struct net *net);

>  void unix_sysctl_unregister(struct net *net);

> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c

> index 573253c5b5c2..89927678c0dc 100644

> --- a/net/unix/af_unix.c

> +++ b/net/unix/af_unix.c

> @@ -2098,11 +2098,11 @@ static void unix_copy_addr(struct msghdr *msg, struct sock *sk)

>  	}

>  }

>  

> -static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,

> -			      size_t size, int flags)

> +int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,

> +			 int flags)

>  {

>  	struct scm_cookie scm;

> -	struct sock *sk = sock->sk;

> +	struct socket *sock = sk->sk_socket;

>  	struct unix_sock *u = unix_sk(sk);

>  	struct sk_buff *skb, *last;

>  	long timeo;

> @@ -2205,6 +2205,19 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,

>  	return err;

>  }

>  

> +static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,

> +			      int flags)

> +{

> +	struct sock *sk = sock->sk;

> +

> +#ifdef CONFIG_BPF_SYSCALL

> +	if (sk->sk_prot != &unix_proto)

> +		return sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,

> +					    flags & ~MSG_DONTWAIT, NULL);

> +#endif

> +	return __unix_dgram_recvmsg(sk, msg, size, flags);

> +}

> +

>  static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,

>  			  sk_read_actor_t recv_actor)

>  {

> diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c

> index b1582a659427..db0cda29fb2f 100644

> --- a/net/unix/unix_bpf.c

> +++ b/net/unix/unix_bpf.c

> @@ -6,6 +6,80 @@

>  #include <net/sock.h>

>  #include <net/af_unix.h>

>  

> +#define unix_sk_has_data(__sk, __psock)					\

> +		({	!skb_queue_empty(&__sk->sk_receive_queue) ||	\

> +			!skb_queue_empty(&__psock->ingress_skb) ||	\

> +			!list_empty(&__psock->ingress_msg);		\

> +		})

> +

> +static int unix_msg_wait_data(struct sock *sk, struct sk_psock *psock,

> +			      long timeo)

> +{

> +	DEFINE_WAIT_FUNC(wait, woken_wake_function);

> +	struct unix_sock *u = unix_sk(sk);

> +	int ret = 0;

> +

> +	if (sk->sk_shutdown & RCV_SHUTDOWN)

> +		return 1;

> +

> +	if (!timeo)

> +		return ret;

> +

> +	add_wait_queue(sk_sleep(sk), &wait);

> +	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);

> +	if (!unix_sk_has_data(sk, psock)) {

> +		mutex_unlock(&u->iolock);

> +		wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);

> +		mutex_lock(&u->iolock);

> +		ret = unix_sk_has_data(sk, psock);

> +	}

> +	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);

> +	remove_wait_queue(sk_sleep(sk), &wait);

> +	return ret;

> +}

> +

> +static int unix_dgram_bpf_recvmsg(struct sock *sk, struct msghdr *msg,

> +				  size_t len, int nonblock, int flags,

> +				  int *addr_len)

> +{

> +	struct unix_sock *u = unix_sk(sk);

> +	struct sk_psock *psock;

> +	int copied, ret;

> +

> +	psock = sk_psock_get(sk);

> +	if (unlikely(!psock))

> +		return __unix_dgram_recvmsg(sk, msg, len, flags);

> +

> +	mutex_lock(&u->iolock);


u->iolock mutex is owned here.

> +	if (!skb_queue_empty(&sk->sk_receive_queue) &&

> +	    sk_psock_queue_empty(psock)) {

> +		ret = __unix_dgram_recvmsg(sk, msg, len, flags);


But __unix_dgram_recvmsg() will also try to grab this mutex?

> +		goto out;

> +	}

> +

> +msg_bytes_ready:

> +	copied = sk_msg_recvmsg(sk, psock, msg, len, flags);

> +	if (!copied) {

> +		long timeo;

> +		int data;

> +

> +		timeo = sock_rcvtimeo(sk, nonblock);

> +		data = unix_msg_wait_data(sk, psock, timeo);

> +		if (data) {

> +			if (!sk_psock_queue_empty(psock))

> +				goto msg_bytes_ready;

> +			ret = __unix_dgram_recvmsg(sk, msg, len, flags);

> +			goto out;

> +		}

> +		copied = -EAGAIN;

> +	}

> +	ret = copied;

> +out:

> +	mutex_unlock(&u->iolock);

> +	sk_psock_put(sk, psock);

> +	return ret;

> +}

> +

>  static struct proto *unix_prot_saved __read_mostly;

>  static DEFINE_SPINLOCK(unix_prot_lock);

>  static struct proto unix_bpf_prot;

> @@ -14,6 +88,7 @@ static void unix_bpf_rebuild_protos(struct proto *prot, const struct proto *base

>  {

>  	*prot        = *base;

>  	prot->close  = sock_map_close;

> +	prot->recvmsg = unix_dgram_bpf_recvmsg;

>  }

>  

>  static void unix_bpf_check_needs_rebuild(struct proto *ops)

>
Cong Wang July 20, 2021, 12:03 a.m. UTC | #2
On Sun, Jul 18, 2021 at 10:49 AM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>

>

>

> On 7/4/21 9:02 PM, Cong Wang wrote:

> > From: Cong Wang <cong.wang@bytedance.com>

> > +     mutex_lock(&u->iolock);

>

> u->iolock mutex is owned here.

>

> > +     if (!skb_queue_empty(&sk->sk_receive_queue) &&

> > +         sk_psock_queue_empty(psock)) {

> > +             ret = __unix_dgram_recvmsg(sk, msg, len, flags);

>

> But __unix_dgram_recvmsg() will also try to grab this mutex ?


Good catch. I should release the lock before calling it. I will send
a patch.

Thanks.
diff mbox series

Patch

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index cca645846af1..435a2c3d5a6f 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -82,6 +82,8 @@  static inline struct unix_sock *unix_sk(const struct sock *sk)
 long unix_inq_len(struct sock *sk);
 long unix_outq_len(struct sock *sk);
 
+int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
+			 int flags);
 #ifdef CONFIG_SYSCTL
 int unix_sysctl_register(struct net *net);
 void unix_sysctl_unregister(struct net *net);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 573253c5b5c2..89927678c0dc 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2098,11 +2098,11 @@  static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
 	}
 }
 
-static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
-			      size_t size, int flags)
+int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
+			 int flags)
 {
 	struct scm_cookie scm;
-	struct sock *sk = sock->sk;
+	struct socket *sock = sk->sk_socket;
 	struct unix_sock *u = unix_sk(sk);
 	struct sk_buff *skb, *last;
 	long timeo;
@@ -2205,6 +2205,19 @@  static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
 	return err;
 }
 
+static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+			      int flags)
+{
+	struct sock *sk = sock->sk;
+
+#ifdef CONFIG_BPF_SYSCALL
+	if (sk->sk_prot != &unix_proto)
+		return sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
+					    flags & ~MSG_DONTWAIT, NULL);
+#endif
+	return __unix_dgram_recvmsg(sk, msg, size, flags);
+}
+
 static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
 			  sk_read_actor_t recv_actor)
 {
diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c
index b1582a659427..db0cda29fb2f 100644
--- a/net/unix/unix_bpf.c
+++ b/net/unix/unix_bpf.c
@@ -6,6 +6,80 @@ 
 #include <net/sock.h>
 #include <net/af_unix.h>
 
+#define unix_sk_has_data(__sk, __psock)					\
+		({	!skb_queue_empty(&__sk->sk_receive_queue) ||	\
+			!skb_queue_empty(&__psock->ingress_skb) ||	\
+			!list_empty(&__psock->ingress_msg);		\
+		})
+
+static int unix_msg_wait_data(struct sock *sk, struct sk_psock *psock,
+			      long timeo)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	struct unix_sock *u = unix_sk(sk);
+	int ret = 0;
+
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		return 1;
+
+	if (!timeo)
+		return ret;
+
+	add_wait_queue(sk_sleep(sk), &wait);
+	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	if (!unix_sk_has_data(sk, psock)) {
+		mutex_unlock(&u->iolock);
+		wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
+		mutex_lock(&u->iolock);
+		ret = unix_sk_has_data(sk, psock);
+	}
+	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	remove_wait_queue(sk_sleep(sk), &wait);
+	return ret;
+}
+
+static int unix_dgram_bpf_recvmsg(struct sock *sk, struct msghdr *msg,
+				  size_t len, int nonblock, int flags,
+				  int *addr_len)
+{
+	struct unix_sock *u = unix_sk(sk);
+	struct sk_psock *psock;
+	int copied, ret;
+
+	psock = sk_psock_get(sk);
+	if (unlikely(!psock))
+		return __unix_dgram_recvmsg(sk, msg, len, flags);
+
+	mutex_lock(&u->iolock);
+	if (!skb_queue_empty(&sk->sk_receive_queue) &&
+	    sk_psock_queue_empty(psock)) {
+		ret = __unix_dgram_recvmsg(sk, msg, len, flags);
+		goto out;
+	}
+
+msg_bytes_ready:
+	copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
+	if (!copied) {
+		long timeo;
+		int data;
+
+		timeo = sock_rcvtimeo(sk, nonblock);
+		data = unix_msg_wait_data(sk, psock, timeo);
+		if (data) {
+			if (!sk_psock_queue_empty(psock))
+				goto msg_bytes_ready;
+			ret = __unix_dgram_recvmsg(sk, msg, len, flags);
+			goto out;
+		}
+		copied = -EAGAIN;
+	}
+	ret = copied;
+out:
+	mutex_unlock(&u->iolock);
+	sk_psock_put(sk, psock);
+	return ret;
+}
+
 static struct proto *unix_prot_saved __read_mostly;
 static DEFINE_SPINLOCK(unix_prot_lock);
 static struct proto unix_bpf_prot;
@@ -14,6 +88,7 @@  static void unix_bpf_rebuild_protos(struct proto *prot, const struct proto *base
 {
 	*prot        = *base;
 	prot->close  = sock_map_close;
+	prot->recvmsg = unix_dgram_bpf_recvmsg;
 }
 
 static void unix_bpf_check_needs_rebuild(struct proto *ops)