diff mbox series

[bpf-next,v4,04/11] skmsg: avoid lock_sock() in sk_psock_backlog()

Message ID 20210310053222.41371-5-xiyou.wangcong@gmail.com
State New
Headers show
Series None | expand

Commit Message

Cong Wang March 10, 2021, 5:32 a.m. UTC
From: Cong Wang <cong.wang@bytedance.com>

We do not have to lock the sock to avoid losing sk_socket,
instead we can purge all the ingress queues when we close
the socket. Sending or receiving packets after orphaning
socket makes no sense.

We do purge these queues when psock refcnt reaches 0 but
here we want to purge them explicitly in sock_map_close().

Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jakub Sitnicki <jakub@cloudflare.com>
Cc: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
---
 include/linux/skmsg.h |  1 +
 net/core/skmsg.c      | 22 ++++++++++++++--------
 net/core/sock_map.c   |  1 +
 3 files changed, 16 insertions(+), 8 deletions(-)

Comments

Jakub Sitnicki March 12, 2021, 12:02 p.m. UTC | #1
On Wed, Mar 10, 2021 at 06:32 AM CET, Cong Wang wrote:
> From: Cong Wang <cong.wang@bytedance.com>

>

> We do not have to lock the sock to avoid losing sk_socket,

> instead we can purge all the ingress queues when we close

> the socket. Sending or receiving packets after orphaning

> socket makes no sense.

>

> We do purge these queues when psock refcnt reaches 0 but

> here we want to purge them explicitly in sock_map_close().

>

> Cc: John Fastabend <john.fastabend@gmail.com>

> Cc: Daniel Borkmann <daniel@iogearbox.net>

> Cc: Jakub Sitnicki <jakub@cloudflare.com>

> Cc: Lorenz Bauer <lmb@cloudflare.com>

> Signed-off-by: Cong Wang <cong.wang@bytedance.com>

> ---

>  include/linux/skmsg.h |  1 +

>  net/core/skmsg.c      | 22 ++++++++++++++--------

>  net/core/sock_map.c   |  1 +

>  3 files changed, 16 insertions(+), 8 deletions(-)

>

> diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h

> index 7333bf881b81..91b357817bb8 100644

> --- a/include/linux/skmsg.h

> +++ b/include/linux/skmsg.h

> @@ -347,6 +347,7 @@ static inline void sk_psock_report_error(struct sk_psock *psock, int err)

>  }

>

>  struct sk_psock *sk_psock_init(struct sock *sk, int node);

> +void sk_psock_purge(struct sk_psock *psock);

>

>  #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)

>  int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock);

> diff --git a/net/core/skmsg.c b/net/core/skmsg.c

> index 41a5f82c53e6..bf0f874780c1 100644

> --- a/net/core/skmsg.c

> +++ b/net/core/skmsg.c

> @@ -497,7 +497,7 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,

>  	if (!ingress) {

>  		if (!sock_writeable(psock->sk))

>  			return -EAGAIN;

> -		return skb_send_sock_locked(psock->sk, skb, off, len);

> +		return skb_send_sock(psock->sk, skb, off, len);

>  	}

>  	return sk_psock_skb_ingress(psock, skb);

>  }

> @@ -511,8 +511,6 @@ static void sk_psock_backlog(struct work_struct *work)

>  	u32 len, off;

>  	int ret;

>

> -	/* Lock sock to avoid losing sk_socket during loop. */

> -	lock_sock(psock->sk);

>  	if (state->skb) {

>  		skb = state->skb;

>  		len = state->len;

> @@ -529,7 +527,7 @@ static void sk_psock_backlog(struct work_struct *work)

>  		skb_bpf_redirect_clear(skb);

>  		do {

>  			ret = -EIO;

> -			if (likely(psock->sk->sk_socket))

> +			if (!sock_flag(psock->sk, SOCK_DEAD))

>  				ret = sk_psock_handle_skb(psock, skb, off,

>  							  len, ingress);

>  			if (ret <= 0) {

> @@ -537,13 +535,13 @@ static void sk_psock_backlog(struct work_struct *work)

>  					state->skb = skb;

>  					state->len = len;

>  					state->off = off;

> -					goto end;

> +					return;

>  				}

>  				/* Hard errors break pipe and stop xmit. */

>  				sk_psock_report_error(psock, ret ? -ret : EPIPE);

>  				sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);

>  				kfree_skb(skb);

> -				goto end;

> +				return;

>  			}

>  			off += ret;

>  			len -= ret;

> @@ -552,8 +550,6 @@ static void sk_psock_backlog(struct work_struct *work)

>  		if (!ingress)

>  			kfree_skb(skb);

>  	}

> -end:

> -	release_sock(psock->sk);

>  }

>

>  struct sk_psock *sk_psock_init(struct sock *sk, int node)

> @@ -654,6 +650,16 @@ static void sk_psock_link_destroy(struct sk_psock *psock)

>  	}

>  }

>

> +void sk_psock_purge(struct sk_psock *psock)

> +{

> +	sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);

> +

> +	cancel_work_sync(&psock->work);

> +

> +	sk_psock_cork_free(psock);

> +	sk_psock_zap_ingress(psock);

> +}

> +

>  static void sk_psock_done_strp(struct sk_psock *psock);

>

>  static void sk_psock_destroy_deferred(struct work_struct *gc)

> diff --git a/net/core/sock_map.c b/net/core/sock_map.c

> index dd53a7771d7e..26ba47b099f1 100644

> --- a/net/core/sock_map.c

> +++ b/net/core/sock_map.c

> @@ -1540,6 +1540,7 @@ void sock_map_close(struct sock *sk, long timeout)

>  	saved_close = psock->saved_close;

>  	sock_map_remove_links(sk, psock);

>  	rcu_read_unlock();

> +	sk_psock_purge(psock);

>  	release_sock(sk);

>  	saved_close(sk, timeout);

>  }


Nothing stops sk_psock_backlog from running after sk_psock_purge:


CPU 1							CPU 2

sk_psock_skb_redirect()
  sk_psock(sk_other)
  sock_flag(sk_other, SOCK_DEAD)
  sk_psock_test_state(psock_other,
                      SK_PSOCK_TX_ENABLED)
							sk_psock_purge()
  skb_queue_tail(&psock_other->ingress_skb, skb)
  schedule_work(&psock_other->work)


And sock_orphan can run while we're in sendmsg/sendpage_unlocked:


CPU 1                                                   CPU 2

sk_psock_backlog
  ...
  sendmsg_unlocked
    sock = sk->sk_socket
                                                        tcp_close
                                                          __tcp_close
                                                            sock_orphan
    kernel_sendmsg(sock, msg, vec, num, size)


So, after this change, without lock_sock in sk_psock_backlog, we will
not block tcp_close from running.

This makes me think that the process socket can get released from under
us, before kernel_sendmsg/sendpage runs.

What did I miss?
Cong Wang March 13, 2021, 5:32 p.m. UTC | #2
On Fri, Mar 12, 2021 at 4:02 AM Jakub Sitnicki <jakub@cloudflare.com> wrote:
>

> On Wed, Mar 10, 2021 at 06:32 AM CET, Cong Wang wrote:

> > diff --git a/net/core/sock_map.c b/net/core/sock_map.c

> > index dd53a7771d7e..26ba47b099f1 100644

> > --- a/net/core/sock_map.c

> > +++ b/net/core/sock_map.c

> > @@ -1540,6 +1540,7 @@ void sock_map_close(struct sock *sk, long timeout)

> >       saved_close = psock->saved_close;

> >       sock_map_remove_links(sk, psock);

> >       rcu_read_unlock();

> > +     sk_psock_purge(psock);

> >       release_sock(sk);

> >       saved_close(sk, timeout);

> >  }

>

> Nothing stops sk_psock_backlog from running after sk_psock_purge:

>

>

> CPU 1                                                   CPU 2

>

> sk_psock_skb_redirect()

>   sk_psock(sk_other)

>   sock_flag(sk_other, SOCK_DEAD)

>   sk_psock_test_state(psock_other,

>                       SK_PSOCK_TX_ENABLED)

>                                                         sk_psock_purge()

>   skb_queue_tail(&psock_other->ingress_skb, skb)

>   schedule_work(&psock_other->work)

>

>

> And sock_orphan can run while we're in sendmsg/sendpage_unlocked:

>

>

> CPU 1                                                   CPU 2

>

> sk_psock_backlog

>   ...

>   sendmsg_unlocked

>     sock = sk->sk_socket

>                                                         tcp_close

>                                                           __tcp_close

>                                                             sock_orphan

>     kernel_sendmsg(sock, msg, vec, num, size)

>

>

> So, after this change, without lock_sock in sk_psock_backlog, we will

> not block tcp_close from running.

>

> This makes me think that the process socket can get released from under

> us, before kernel_sendmsg/sendpage runs.


I think you are right. I thought the socket was orphaned in inet_release();
clearly I was wrong. But I'd argue that in the above scenario the packet
should not even be queued in the first place, as SK_PSOCK_TX_ENABLED is going
to be cleared, so I think the right fix is probably to do both the clearing
of the psock state and the queuing of the packet under a spinlock.

Thanks.
Jakub Sitnicki March 15, 2021, 8:55 p.m. UTC | #3
On Sat, Mar 13, 2021 at 06:32 PM CET, Cong Wang wrote:
> On Fri, Mar 12, 2021 at 4:02 AM Jakub Sitnicki <jakub@cloudflare.com> wrote:

>>

>> On Wed, Mar 10, 2021 at 06:32 AM CET, Cong Wang wrote:

>> > diff --git a/net/core/sock_map.c b/net/core/sock_map.c

>> > index dd53a7771d7e..26ba47b099f1 100644

>> > --- a/net/core/sock_map.c

>> > +++ b/net/core/sock_map.c

>> > @@ -1540,6 +1540,7 @@ void sock_map_close(struct sock *sk, long timeout)

>> >       saved_close = psock->saved_close;

>> >       sock_map_remove_links(sk, psock);

>> >       rcu_read_unlock();

>> > +     sk_psock_purge(psock);

>> >       release_sock(sk);

>> >       saved_close(sk, timeout);

>> >  }

>>

>> Nothing stops sk_psock_backlog from running after sk_psock_purge:

>>

>>

>> CPU 1                                                   CPU 2

>>

>> sk_psock_skb_redirect()

>>   sk_psock(sk_other)

>>   sock_flag(sk_other, SOCK_DEAD)

>>   sk_psock_test_state(psock_other,

>>                       SK_PSOCK_TX_ENABLED)

>>                                                         sk_psock_purge()

>>   skb_queue_tail(&psock_other->ingress_skb, skb)

>>   schedule_work(&psock_other->work)

>>

>>

>> And sock_orphan can run while we're in sendmsg/sendpage_unlocked:

>>

>>

>> CPU 1                                                   CPU 2

>>

>> sk_psock_backlog

>>   ...

>>   sendmsg_unlocked

>>     sock = sk->sk_socket

>>                                                         tcp_close

>>                                                           __tcp_close

>>                                                             sock_orphan

>>     kernel_sendmsg(sock, msg, vec, num, size)

>>

>>

>> So, after this change, without lock_sock in sk_psock_backlog, we will

>> not block tcp_close from running.

>>

>> This makes me think that the process socket can get released from under

>> us, before kernel_sendmsg/sendpage runs.

>

> I think you are right, I thought socket is orphaned in inet_release(), clearly

> I was wrong. But, I'd argue in the above scenario, the packet should not

> be even queued in the first place, as SK_PSOCK_TX_ENABLED is going

> to be cleared, so I think the right fix is probably to make clearing psock

> state and queuing the packet under a spinlock.


Sounds like a good idea. The goal, as I understand it, is to guarantee that
the psock holds a ref count on the process socket for the duration of the
sk_psock_backlog() run.

That would not only let us get rid of lock_sock(), with finer grained
queue locks, but also the sock_flag(psock->sk, SOCK_DEAD) check.
diff mbox series

Patch

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 7333bf881b81..91b357817bb8 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -347,6 +347,7 @@  static inline void sk_psock_report_error(struct sk_psock *psock, int err)
 }
 
 struct sk_psock *sk_psock_init(struct sock *sk, int node);
+void sk_psock_purge(struct sk_psock *psock);
 
 #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
 int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock);
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 41a5f82c53e6..bf0f874780c1 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -497,7 +497,7 @@  static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
 	if (!ingress) {
 		if (!sock_writeable(psock->sk))
 			return -EAGAIN;
-		return skb_send_sock_locked(psock->sk, skb, off, len);
+		return skb_send_sock(psock->sk, skb, off, len);
 	}
 	return sk_psock_skb_ingress(psock, skb);
 }
@@ -511,8 +511,6 @@  static void sk_psock_backlog(struct work_struct *work)
 	u32 len, off;
 	int ret;
 
-	/* Lock sock to avoid losing sk_socket during loop. */
-	lock_sock(psock->sk);
 	if (state->skb) {
 		skb = state->skb;
 		len = state->len;
@@ -529,7 +527,7 @@  static void sk_psock_backlog(struct work_struct *work)
 		skb_bpf_redirect_clear(skb);
 		do {
 			ret = -EIO;
-			if (likely(psock->sk->sk_socket))
+			if (!sock_flag(psock->sk, SOCK_DEAD))
 				ret = sk_psock_handle_skb(psock, skb, off,
 							  len, ingress);
 			if (ret <= 0) {
@@ -537,13 +535,13 @@  static void sk_psock_backlog(struct work_struct *work)
 					state->skb = skb;
 					state->len = len;
 					state->off = off;
-					goto end;
+					return;
 				}
 				/* Hard errors break pipe and stop xmit. */
 				sk_psock_report_error(psock, ret ? -ret : EPIPE);
 				sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
 				kfree_skb(skb);
-				goto end;
+				return;
 			}
 			off += ret;
 			len -= ret;
@@ -552,8 +550,6 @@  static void sk_psock_backlog(struct work_struct *work)
 		if (!ingress)
 			kfree_skb(skb);
 	}
-end:
-	release_sock(psock->sk);
 }
 
 struct sk_psock *sk_psock_init(struct sock *sk, int node)
@@ -654,6 +650,16 @@  static void sk_psock_link_destroy(struct sk_psock *psock)
 	}
 }
 
+void sk_psock_purge(struct sk_psock *psock)
+{
+	sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
+
+	cancel_work_sync(&psock->work);
+
+	sk_psock_cork_free(psock);
+	sk_psock_zap_ingress(psock);
+}
+
 static void sk_psock_done_strp(struct sk_psock *psock);
 
 static void sk_psock_destroy_deferred(struct work_struct *gc)
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index dd53a7771d7e..26ba47b099f1 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -1540,6 +1540,7 @@  void sock_map_close(struct sock *sk, long timeout)
 	saved_close = psock->saved_close;
 	sock_map_remove_links(sk, psock);
 	rcu_read_unlock();
+	sk_psock_purge(psock);
 	release_sock(sk);
 	saved_close(sk, timeout);
 }