diff mbox series

[bpf-next,v3,4/6] bpf, sockmap: remove dropped data on errors in redirect case

Message ID 160226863689.5692.13861422742592309285.stgit@john-Precision-5820-Tower
State Superseded
Headers show
Series sockmap/sk_skb program memory acct fixes | expand

Commit Message

John Fastabend Oct. 9, 2020, 6:37 p.m. UTC
In the sk_skb redirect case we didn't handle the case where we overrun
the sk_rmem_alloc entry on ingress redirect or sk_wmem_alloc on egress.
Because we didn't have anything implemented we simply dropped the skb.
This meant data could be dropped if socket memory accounting was in
place.

This fixes the above dropped data case by moving the memory checks
later in the code where we actually do the send or recv. This pushes
those checks into the workqueue and allows us to return an EAGAIN error
which in turn allows us to try again later from the workqueue.

Fixes: 51199405f9672 ("bpf: skb_verdict, support SK_PASS on RX BPF path")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 net/core/skmsg.c |   28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

Comments

Jakub Sitnicki Oct. 12, 2020, 12:04 p.m. UTC | #1
On Fri, Oct 09, 2020 at 08:37 PM CEST, John Fastabend wrote:
> In the sk_skb redirect case we didn't handle the case where we overrun

> the sk_rmem_alloc entry on ingress redirect or sk_wmem_alloc on egress.

> Because we didn't have anything implemented we simply dropped the skb.

> This meant data could be dropped if socket memory accounting was in

> place.

>

> This fixes the above dropped data case by moving the memory checks

> later in the code where we actually do the send or recv. This pushes

> those checks into the workqueue and allows us to return an EAGAIN error

> which in turn allows us to try again later from the workqueue.

>

> Fixes: 51199405f9672 ("bpf: skb_verdict, support SK_PASS on RX BPF path")

> Signed-off-by: John Fastabend <john.fastabend@gmail.com>

> ---

>  net/core/skmsg.c |   28 ++++++++++++++--------------

>  1 file changed, 14 insertions(+), 14 deletions(-)

>


[...]

> @@ -709,30 +711,28 @@ static void sk_psock_skb_redirect(struct sk_buff *skb)

>  {

>  	struct sk_psock *psock_other;

>  	struct sock *sk_other;

> -	bool ingress;

>

>  	sk_other = tcp_skb_bpf_redirect_fetch(skb);

> +	/* This error is a buggy BPF program, it returned a redirect

> +	 * return code, but then didn't set a redirect interface.

> +	 */

>  	if (unlikely(!sk_other)) {

>  		kfree_skb(skb);

>  		return;

>  	}

>  	psock_other = sk_psock(sk_other);

> +	/* This error indicates the socket is being torn down or had another

> +	 * error that caused the pipe to break. We can't send a packet on

> +	 * a socket that is in this state so we drop the skb.

> +	 */

>  	if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||

>  	    !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {

>  		kfree_skb(skb);

>  		return;

>  	}

>

> -	ingress = tcp_skb_bpf_ingress(skb);

> -	if ((!ingress && sock_writeable(sk_other)) ||

> -	    (ingress &&

> -	     atomic_read(&sk_other->sk_rmem_alloc) <=

> -	     sk_other->sk_rcvbuf)) {


I'm wondering why the check for going over socket's rcvbuf was removed?

I see that we now rely exclusively on
sk_psock_skb_ingress→sk_rmem_schedule for sk_rmem_alloc checks, which I
don't think applies the rcvbuf limit.

> -		skb_queue_tail(&psock_other->ingress_skb, skb);

> -		schedule_work(&psock_other->work);

> -	} else {

> -		kfree_skb(skb);

> -	}

> +	skb_queue_tail(&psock_other->ingress_skb, skb);

> +	schedule_work(&psock_other->work);

>  }

>

>  static void sk_psock_tls_verdict_apply(struct sk_buff *skb, int verdict)
John Fastabend Oct. 12, 2020, 5:19 p.m. UTC | #2
Jakub Sitnicki wrote:
> On Fri, Oct 09, 2020 at 08:37 PM CEST, John Fastabend wrote:
> > In the sk_skb redirect case we didn't handle the case where we overrun
> > the sk_rmem_alloc entry on ingress redirect or sk_wmem_alloc on egress.
> > Because we didn't have anything implemented we simply dropped the skb.
> > This meant data could be dropped if socket memory accounting was in
> > place.
> >
> > This fixes the above dropped data case by moving the memory checks
> > later in the code where we actually do the send or recv. This pushes
> > those checks into the workqueue and allows us to return an EAGAIN error
> > which in turn allows us to try again later from the workqueue.
> >
> > Fixes: 51199405f9672 ("bpf: skb_verdict, support SK_PASS on RX BPF path")
> > Signed-off-by: John Fastabend <john.fastabend@gmail.com>
> > ---
> >  net/core/skmsg.c |   28 ++++++++++++++--------------
> >  1 file changed, 14 insertions(+), 14 deletions(-)
> >
> 
> [...]
> 
> > @@ -709,30 +711,28 @@ static void sk_psock_skb_redirect(struct sk_buff *skb)
> >  {
> >  	struct sk_psock *psock_other;
> >  	struct sock *sk_other;
> > -	bool ingress;
> >
> >  	sk_other = tcp_skb_bpf_redirect_fetch(skb);
> > +	/* This error is a buggy BPF program, it returned a redirect
> > +	 * return code, but then didn't set a redirect interface.
> > +	 */
> >  	if (unlikely(!sk_other)) {
> >  		kfree_skb(skb);
> >  		return;
> >  	}
> >  	psock_other = sk_psock(sk_other);
> > +	/* This error indicates the socket is being torn down or had another
> > +	 * error that caused the pipe to break. We can't send a packet on
> > +	 * a socket that is in this state so we drop the skb.
> > +	 */
> >  	if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
> >  	    !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
> >  		kfree_skb(skb);
> >  		return;
> >  	}
> >
> > -	ingress = tcp_skb_bpf_ingress(skb);
> > -	if ((!ingress && sock_writeable(sk_other)) ||
> > -	    (ingress &&
> > -	     atomic_read(&sk_other->sk_rmem_alloc) <=
> > -	     sk_other->sk_rcvbuf)) {
> 
> I'm wondering why the check for going over socket's rcvbuf was removed?

Couple reasons, I never checked it from skmsg side so after this patch
accounting for both skmsg and sk_skb types are the same. I think this
should be the case going forward with anything we do around memory
accounting. The other, and more immediate, reason is we don't want the
error case here with the kfree_skb().

> 
> I see that we now rely exclusively on
> sk_psock_skb_ingress→sk_rmem_schedule for sk_rmem_alloc checks, which I
> don't think applies the rcvbuf limit.

Right. Also notice even though we checked it here we never charged the
skm_rmem_alloc for skmsg on the ingress queue. So we were effectively
getting that memory for free. Still doing some review and drafting a
patch to see if this works, but my proposal is:

  For ingress sk_skb case we check sk_rmem_alloc before enqueuing
  the new sk_msg into ingress queue and then also charge the memory
  the same as skb_set_owner_r except do it with a new helper
  skmsg_set_owner_r(skmsg, sk) and only do the atomic add against
  sk_rmem_alloc and the sk_mem_charge() part.

  Then for skmsg programs convert the sk_mem_charge() calls to
  use the new skmsg_set_owner_r() to get the same memory accounting.
  Finally, on copy to user buffer we unwind this. Then we will have
  the memory in the queue accounted for against the socket.

I'll give it a try. Thanks for asking. wdyt? Any other ideas.

> 
> > -		skb_queue_tail(&psock_other->ingress_skb, skb);
> > -		schedule_work(&psock_other->work);
> > -	} else {
> > -		kfree_skb(skb);
> > -	}
> > +	skb_queue_tail(&psock_other->ingress_skb, skb);
> > +	schedule_work(&psock_other->work);
> >  }
> >
> >  static void sk_psock_tls_verdict_apply(struct sk_buff *skb, int verdict)
Jakub Sitnicki Oct. 13, 2020, 10:27 a.m. UTC | #3
On Mon, Oct 12, 2020 at 07:19 PM CEST, John Fastabend wrote:
> Jakub Sitnicki wrote:
>> On Fri, Oct 09, 2020 at 08:37 PM CEST, John Fastabend wrote:
>> > In the sk_skb redirect case we didn't handle the case where we overrun
>> > the sk_rmem_alloc entry on ingress redirect or sk_wmem_alloc on egress.
>> > Because we didn't have anything implemented we simply dropped the skb.
>> > This meant data could be dropped if socket memory accounting was in
>> > place.
>> >
>> > This fixes the above dropped data case by moving the memory checks
>> > later in the code where we actually do the send or recv. This pushes
>> > those checks into the workqueue and allows us to return an EAGAIN error
>> > which in turn allows us to try again later from the workqueue.
>> >
>> > Fixes: 51199405f9672 ("bpf: skb_verdict, support SK_PASS on RX BPF path")
>> > Signed-off-by: John Fastabend <john.fastabend@gmail.com>
>> > ---
>> >  net/core/skmsg.c |   28 ++++++++++++++--------------
>> >  1 file changed, 14 insertions(+), 14 deletions(-)
>> >
>>
>> [...]
>>
>> > @@ -709,30 +711,28 @@ static void sk_psock_skb_redirect(struct sk_buff *skb)
>> >  {
>> >  	struct sk_psock *psock_other;
>> >  	struct sock *sk_other;
>> > -	bool ingress;
>> >
>> >  	sk_other = tcp_skb_bpf_redirect_fetch(skb);
>> > +	/* This error is a buggy BPF program, it returned a redirect
>> > +	 * return code, but then didn't set a redirect interface.
>> > +	 */
>> >  	if (unlikely(!sk_other)) {
>> >  		kfree_skb(skb);
>> >  		return;
>> >  	}
>> >  	psock_other = sk_psock(sk_other);
>> > +	/* This error indicates the socket is being torn down or had another
>> > +	 * error that caused the pipe to break. We can't send a packet on
>> > +	 * a socket that is in this state so we drop the skb.
>> > +	 */
>> >  	if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
>> >  	    !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
>> >  		kfree_skb(skb);
>> >  		return;
>> >  	}
>> >
>> > -	ingress = tcp_skb_bpf_ingress(skb);
>> > -	if ((!ingress && sock_writeable(sk_other)) ||
>> > -	    (ingress &&
>> > -	     atomic_read(&sk_other->sk_rmem_alloc) <=
>> > -	     sk_other->sk_rcvbuf)) {
>>
>> I'm wondering why the check for going over socket's rcvbuf was removed?
>
> Couple reasons, I never checked it from skmsg side so after this patch
> accounting for both skmsg and sk_skb types are the same. I think this
> should be the case going forward with anything we do around memory
> accounting. The other, and more immediate, reason is we don't want the
> error case here with the kfree_skb().

Right, we definitely don't want to drop the skb.

What crossed my mind is that sk_psock_handle_skb() could check for
sk_rmem_alloc <= sk_rcvbuf and error out with -EAGAIN. Similarly to how
we check for sock_writable() with this change.

This would let the process owning the destination socket, we are
redirecting to, to push back by tuning down SO_RCVBUF.

>
>>
>> I see that we now rely exclusively on
>> sk_psock_skb_ingress→sk_rmem_schedule for sk_rmem_alloc checks, which I
>> don't think applies the rcvbuf limit.
>
> Right. Also notice even though we checked it here we never charged the
> skm_rmem_alloc for skmsg on the ingress queue. So we were effectively
> getting that memory for free. Still doing some review and drafting a
> patch to see if this works, but my proposal is:
>
>   For ingress sk_skb case we check sk_rmem_alloc before enqueuing
>   the new sk_msg into ingress queue and then also charge the memory
>   the same as skb_set_owner_r except do it with a new helper
>   skmsg_set_owner_r(skmsg, sk) and only do the atomic add against
>   sk_rmem_alloc and the sk_mem_charge() part.
>
>   Then for skmsg programs convert the sk_mem_charge() calls to
>   use the new skmsg_set_owner_r() to get the same memory accounting.
>   Finally, on copy to user buffer we unwind this. Then we will have
>   the memory in the queue accounted for against the socket.
>
> I'll give it a try. Thanks for asking. wdyt? Any other ideas.

SGTM, nothing to add except for honoring SO_RCVBUF I mentioned above.

[...]
diff mbox series

Patch

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 7389d5d7e7f8..880b84baab5e 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -433,10 +433,12 @@  static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
 static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
 			       u32 off, u32 len, bool ingress)
 {
-	if (ingress)
-		return sk_psock_skb_ingress(psock, skb);
-	else
+	if (!ingress) {
+		if (!sock_writeable(psock->sk))
+			return -EAGAIN;
 		return skb_send_sock_locked(psock->sk, skb, off, len);
+	}
+	return sk_psock_skb_ingress(psock, skb);
 }
 
 static void sk_psock_backlog(struct work_struct *work)
@@ -709,30 +711,28 @@  static void sk_psock_skb_redirect(struct sk_buff *skb)
 {
 	struct sk_psock *psock_other;
 	struct sock *sk_other;
-	bool ingress;
 
 	sk_other = tcp_skb_bpf_redirect_fetch(skb);
+	/* This error is a buggy BPF program, it returned a redirect
+	 * return code, but then didn't set a redirect interface.
+	 */
 	if (unlikely(!sk_other)) {
 		kfree_skb(skb);
 		return;
 	}
 	psock_other = sk_psock(sk_other);
+	/* This error indicates the socket is being torn down or had another
+	 * error that caused the pipe to break. We can't send a packet on
+	 * a socket that is in this state so we drop the skb.
+	 */
 	if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
 	    !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
 		kfree_skb(skb);
 		return;
 	}
 
-	ingress = tcp_skb_bpf_ingress(skb);
-	if ((!ingress && sock_writeable(sk_other)) ||
-	    (ingress &&
-	     atomic_read(&sk_other->sk_rmem_alloc) <=
-	     sk_other->sk_rcvbuf)) {
-		skb_queue_tail(&psock_other->ingress_skb, skb);
-		schedule_work(&psock_other->work);
-	} else {
-		kfree_skb(skb);
-	}
+	skb_queue_tail(&psock_other->ingress_skb, skb);
+	schedule_work(&psock_other->work);
 }
 
 static void sk_psock_tls_verdict_apply(struct sk_buff *skb, int verdict)