diff mbox series

[bpf-next,v3,4/5] skmsg: use skb ext instead of TCP_SKB_CB

Message ID 20210213214421.226357-5-xiyou.wangcong@gmail.com
State New
Headers show
Series sock_map: clean up and refactor code for BPF_SK_SKB_VERDICT | expand

Commit Message

Cong Wang Feb. 13, 2021, 9:44 p.m. UTC
From: Cong Wang <cong.wang@bytedance.com>

Currently TCP_SKB_CB() is hard-coded in skmsg code, it certainly
does not work for any other non-TCP protocols. We can move them to
skb ext instead of playing with skb cb, which is harder to make
correct.

Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jakub Sitnicki <jakub@cloudflare.com>
Reviewed-by: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
---
 include/linux/skbuff.h |  3 +++
 include/linux/skmsg.h  | 40 ++++++++++++++++++++++++++++++++++++++++
 include/net/tcp.h      | 19 -------------------
 net/Kconfig            |  1 +
 net/core/skbuff.c      |  7 +++++++
 net/core/skmsg.c       | 35 +++++++++++++++++++++++------------
 net/core/sock_map.c    | 12 ++++++------
 7 files changed, 80 insertions(+), 37 deletions(-)

Comments

John Fastabend Feb. 15, 2021, 7:20 p.m. UTC | #1
Cong Wang wrote:
> From: Cong Wang <cong.wang@bytedance.com>

> 

> Currently TCP_SKB_CB() is hard-coded in skmsg code, it certainly

> does not work for any other non-TCP protocols. We can move them to

> skb ext instead of playing with skb cb, which is harder to make

> correct.

> 

> Cc: John Fastabend <john.fastabend@gmail.com>

> Cc: Daniel Borkmann <daniel@iogearbox.net>

> Cc: Jakub Sitnicki <jakub@cloudflare.com>

> Reviewed-by: Lorenz Bauer <lmb@cloudflare.com>

> Signed-off-by: Cong Wang <cong.wang@bytedance.com>

> ---


I'm not seeing the advantage of doing this at the moment. We can
continue to use cb[] here, which is simpler IMO and use the ext
if needed for the other use cases. This is adding a per packet
alloc cost that we don't have at the moment as I understand it.

[...]

> diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h

> index e3bb712af257..d5c711ef6d4b 100644

> --- a/include/linux/skmsg.h

> +++ b/include/linux/skmsg.h

> @@ -459,4 +459,44 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)

>  		return false;

>  	return !!psock->saved_data_ready;

>  }

> +

> +struct skb_bpf_ext {

> +	__u32 flags;

> +	struct sock *sk_redir;

> +};

> +

> +#if IS_ENABLED(CONFIG_NET_SOCK_MSG)

> +static inline

> +bool skb_bpf_ext_ingress(const struct sk_buff *skb)

> +{

> +	struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF);

> +

> +	return ext->flags & BPF_F_INGRESS;

> +}

> +

> +static inline

> +void skb_bpf_ext_set_ingress(const struct sk_buff *skb)

> +{

> +	struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF);

> +

> +	ext->flags |= BPF_F_INGRESS;

> +}

> +

> +static inline

> +struct sock *skb_bpf_ext_redirect_fetch(struct sk_buff *skb)

> +{

> +	struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF);

> +

> +	return ext->sk_redir;

> +}

> +

> +static inline

> +void skb_bpf_ext_redirect_clear(struct sk_buff *skb)

> +{

> +	struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF);

> +

> +	ext->flags = 0;

> +	ext->sk_redir = NULL;

> +}

> +#endif /* CONFIG_NET_SOCK_MSG */


So we will have some slight duplication for cb[] variant and ext
variant above. I'm OK with that to avoid an allocation.

[...]

> @@ -1003,11 +1008,17 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,

>  		goto out;

>  	}

>  	skb_set_owner_r(skb, sk);

> +	if (!skb_ext_add(skb, SKB_EXT_BPF)) {

> +		len = 0;

> +		kfree_skb(skb);

> +		goto out;

> +	}

> +


per packet cost here. Perhaps you can argue small alloc will usually not be 
noticable in such a large stack, but once we convert over it will be very
hard to go back. And I'm looking at optimizing this path now.

>  	prog = READ_ONCE(psock->progs.skb_verdict);

>  	if (likely(prog)) {

> -		tcp_skb_bpf_redirect_clear(skb);

> +		skb_bpf_ext_redirect_clear(skb);

>  		ret = sk_psock_bpf_run(psock, prog, skb);

> -		ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));

> +		ret = sk_psock_map_verd(ret, skb_bpf_ext_redirect_fetch(skb));

>  	}

>  	sk_psock_verdict_apply(psock, skb, ret);


Thanks for the series Cong. Drop this patch and resubmit carry ACKs forward
and then lets revisit this later.

Thanks,
John
Cong Wang Feb. 15, 2021, 10:24 p.m. UTC | #2
On Mon, Feb 15, 2021 at 11:20 AM John Fastabend
<john.fastabend@gmail.com> wrote:
>

> Cong Wang wrote:

> > From: Cong Wang <cong.wang@bytedance.com>

> >

> > Currently TCP_SKB_CB() is hard-coded in skmsg code, it certainly

> > does not work for any other non-TCP protocols. We can move them to

> > skb ext instead of playing with skb cb, which is harder to make

> > correct.

> >

> > Cc: John Fastabend <john.fastabend@gmail.com>

> > Cc: Daniel Borkmann <daniel@iogearbox.net>

> > Cc: Jakub Sitnicki <jakub@cloudflare.com>

> > Reviewed-by: Lorenz Bauer <lmb@cloudflare.com>

> > Signed-off-by: Cong Wang <cong.wang@bytedance.com>

> > ---

>

> I'm not seeing the advantage of doing this at the moment. We can

> continue to use cb[] here, which is simpler IMO and use the ext

> if needed for the other use cases. This is adding a per packet

> alloc cost that we don't have at the moment as I understand it.


Hmm? How can we continue using TCP_SKB_CB() for UDP or
AF_UNIX?

I am not sure I get your "at the moment" correctly, do you mean
I should move this patch to a later patchset, maybe the UDP
patchset? At least this patch is needed, no matter by which patchset,
so it should not be dropped.


>

> [...]

>

> > diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h

> > index e3bb712af257..d5c711ef6d4b 100644

> > --- a/include/linux/skmsg.h

> > +++ b/include/linux/skmsg.h

> > @@ -459,4 +459,44 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)

> >               return false;

> >       return !!psock->saved_data_ready;

> >  }

> > +

> > +struct skb_bpf_ext {

> > +     __u32 flags;

> > +     struct sock *sk_redir;

> > +};

> > +

> > +#if IS_ENABLED(CONFIG_NET_SOCK_MSG)

> > +static inline

> > +bool skb_bpf_ext_ingress(const struct sk_buff *skb)

> > +{

> > +     struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF);

> > +

> > +     return ext->flags & BPF_F_INGRESS;

> > +}

> > +

> > +static inline

> > +void skb_bpf_ext_set_ingress(const struct sk_buff *skb)

> > +{

> > +     struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF);

> > +

> > +     ext->flags |= BPF_F_INGRESS;

> > +}

> > +

> > +static inline

> > +struct sock *skb_bpf_ext_redirect_fetch(struct sk_buff *skb)

> > +{

> > +     struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF);

> > +

> > +     return ext->sk_redir;

> > +}

> > +

> > +static inline

> > +void skb_bpf_ext_redirect_clear(struct sk_buff *skb)

> > +{

> > +     struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF);

> > +

> > +     ext->flags = 0;

> > +     ext->sk_redir = NULL;

> > +}

> > +#endif /* CONFIG_NET_SOCK_MSG */

>

> So we will have some slight duplication for cb[] variant and ext

> variant above. I'm OK with that to avoid an allocation.


Not sure what you mean by "duplication", these are removed from
TCP_SKB_CB(), so there is clearly no duplication.

>

> [...]

>

> > @@ -1003,11 +1008,17 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,

> >               goto out;

> >       }

> >       skb_set_owner_r(skb, sk);

> > +     if (!skb_ext_add(skb, SKB_EXT_BPF)) {

> > +             len = 0;

> > +             kfree_skb(skb);

> > +             goto out;

> > +     }

> > +

>

> per packet cost here. Perhaps you can argue small alloc will usually not be

> noticable in such a large stack, but once we convert over it will be very

> hard to go back. And I'm looking at optimizing this path now.


This is a price we need to pay to avoid CB, and skb_ext_add() has been
used on other fast paths too, for example, tcf_classify_ingress() and
mptcp_incoming_options(). So, it is definitely acceptable.

>

> >       prog = READ_ONCE(psock->progs.skb_verdict);

> >       if (likely(prog)) {

> > -             tcp_skb_bpf_redirect_clear(skb);

> > +             skb_bpf_ext_redirect_clear(skb);

> >               ret = sk_psock_bpf_run(psock, prog, skb);

> > -             ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));

> > +             ret = sk_psock_map_verd(ret, skb_bpf_ext_redirect_fetch(skb));

> >       }

> >       sk_psock_verdict_apply(psock, skb, ret);

>

> Thanks for the series Cong. Drop this patch and resubmit carry ACKs forward

> and then lets revisit this later.


I still believe it is best to stay in this patchset, as it does not change
any functionality and is a preparation too. And the next patchset will be
UDP/AF_UNIX changes as you suggested, it is very awkward to put this
patch into either UDP or AF_UNIX changes.

So, let's keep it in this patchset, and I am happy to address any concerns
and open to other ideas than using skb ext.

Thanks.
John Fastabend Feb. 15, 2021, 11:57 p.m. UTC | #3
Cong Wang wrote:
> On Mon, Feb 15, 2021 at 11:20 AM John Fastabend

> <john.fastabend@gmail.com> wrote:

> >

> > Cong Wang wrote:

> > > From: Cong Wang <cong.wang@bytedance.com>

> > >

> > > Currently TCP_SKB_CB() is hard-coded in skmsg code, it certainly

> > > does not work for any other non-TCP protocols. We can move them to

> > > skb ext instead of playing with skb cb, which is harder to make

> > > correct.

> > >

> > > Cc: John Fastabend <john.fastabend@gmail.com>

> > > Cc: Daniel Borkmann <daniel@iogearbox.net>

> > > Cc: Jakub Sitnicki <jakub@cloudflare.com>

> > > Reviewed-by: Lorenz Bauer <lmb@cloudflare.com>

> > > Signed-off-by: Cong Wang <cong.wang@bytedance.com>

> > > ---

> >

> > I'm not seeing the advantage of doing this at the moment. We can

> > continue to use cb[] here, which is simpler IMO and use the ext

> > if needed for the other use cases. This is adding a per packet

> > alloc cost that we don't have at the moment as I understand it.

> 

> Hmm? How can we continue using TCP_SKB_CB() for UDP or

> AF_UNIX?

> 

> I am not sure I get your "at the moment" correctly, do you mean

> I should move this patch to a later patchset, maybe the UDP

> patchset? At least this patch is needed, no matter by which patchset,

> so it should not be dropped.


Agree, the skb_bpf_ext{} pieces are needed for UDP and AF_UNIX. Its
not required for TCP side though. What I'm suggesting is leave the
TCP side as-is, using the cb[] fields. Then use the skb_bpf_ext fields
from UDP and AF_UNIX.

> 

> 

> >

> > [...]

> >

> > > diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h

> > > index e3bb712af257..d5c711ef6d4b 100644

> > > --- a/include/linux/skmsg.h

> > > +++ b/include/linux/skmsg.h

> > > @@ -459,4 +459,44 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)

> > >               return false;

> > >       return !!psock->saved_data_ready;

> > >  }

> > > +

> > > +struct skb_bpf_ext {

> > > +     __u32 flags;

> > > +     struct sock *sk_redir;

> > > +};

> > > +

> > > +#if IS_ENABLED(CONFIG_NET_SOCK_MSG)

> > > +static inline

> > > +bool skb_bpf_ext_ingress(const struct sk_buff *skb)

> > > +{

> > > +     struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF);

> > > +

> > > +     return ext->flags & BPF_F_INGRESS;

> > > +}

> > > +

> > > +static inline

> > > +void skb_bpf_ext_set_ingress(const struct sk_buff *skb)

> > > +{

> > > +     struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF);

> > > +

> > > +     ext->flags |= BPF_F_INGRESS;

> > > +}

> > > +

> > > +static inline

> > > +struct sock *skb_bpf_ext_redirect_fetch(struct sk_buff *skb)

> > > +{

> > > +     struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF);

> > > +

> > > +     return ext->sk_redir;

> > > +}

> > > +

> > > +static inline

> > > +void skb_bpf_ext_redirect_clear(struct sk_buff *skb)

> > > +{

> > > +     struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF);

> > > +

 > > +     ext->flags = 0;

> > > +     ext->sk_redir = NULL;

> > > +}

> > > +#endif /* CONFIG_NET_SOCK_MSG */

> >

> > So we will have some slight duplication for cb[] variant and ext

> > variant above. I'm OK with that to avoid an allocation.

> 

> Not sure what you mean by "duplication", these are removed from

> TCP_SKB_CB(), so there is clearly no duplication.


In this patch yes, no duplication.  But, I want to leave TCP alone
and have it continue to use cb[] to avoid alloc per packet.

> 

> >

> > [...]

> >

> > > @@ -1003,11 +1008,17 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,

> > >               goto out;

> > >       }

> > >       skb_set_owner_r(skb, sk);

> > > +     if (!skb_ext_add(skb, SKB_EXT_BPF)) {

> > > +             len = 0;

> > > +             kfree_skb(skb);

> > > +             goto out;

> > > +     }

> > > +

> >

> > per packet cost here. Perhaps you can argue small alloc will usually not be

> > noticable in such a large stack, but once we convert over it will be very

> > hard to go back. And I'm looking at optimizing this path now.

> 

> This is a price we need to pay to avoid CB, and skb_ext_add() has been

> used on other fast paths too, for example, tcf_classify_ingress() and

> mptcp_incoming_options(). So, it is definitely acceptable.


For TCP case we can continue to use CB and not pay the price. For UDP
and AF_UNIX we can do the extra alloc.

The use in tcf_classify_ingress is a miss case so not the common path. If
it is/was in the common path I would suggest we rip it out.

> 

> >

> > >       prog = READ_ONCE(psock->progs.skb_verdict);

> > >       if (likely(prog)) {

> > > -             tcp_skb_bpf_redirect_clear(skb);

> > > +             skb_bpf_ext_redirect_clear(skb);

> > >               ret = sk_psock_bpf_run(psock, prog, skb);

> > > -             ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));

> > > +             ret = sk_psock_map_verd(ret, skb_bpf_ext_redirect_fetch(skb));

> > >       }

> > >       sk_psock_verdict_apply(psock, skb, ret);

> >

> > Thanks for the series Cong. Drop this patch and resubmit carry ACKs forward

> > and then lets revisit this later.

> 

> I still believe it is best to stay in this patchset, as it does not change

> any functionality and is a preparation too. And the next patchset will be

> UDP/AF_UNIX changes as you suggested, it is very awkward to put this

> patch into either UDP or AF_UNIX changes.


Disagree. It adds extra code to the TCP side that I think is not needed. Any
reason the TCP implementation can't continue to use cb[]?

> 

> So, let's keep it in this patchset, and I am happy to address any concerns

> and open to other ideas than using skb ext.


The idea here is to just use cb[] in TCP case per above. I only scanned your
other patches, but presumably this can be patch 1 with just the functions

 skb_bpf_ext_ingress()
 skb_bpf_ext_set_ingress()
 skb_bpf_ext_redirect_fetch()
 skb_bpf_ext_redirect_clear()

And none of the removals from TCP side.

.John
Cong Wang Feb. 16, 2021, 12:28 a.m. UTC | #4
On Mon, Feb 15, 2021 at 3:57 PM John Fastabend <john.fastabend@gmail.com> wrote:
>

> For TCP case we can continue to use CB and not pay the price. For UDP

> and AF_UNIX we can do the extra alloc.


I see your point, but specializing TCP case does not give much benefit
here, the skmsg code would have to check skb->protocol etc. to decide
whether to use TCP_SKB_CB() or skb_ext:

if (skb->protocol == ...)
  TCP_SKB_CB(skb) = ...;
else
  ext = skb_ext_find(skb);

which looks ugly to me. And I doubt skb->protocol alone is sufficient to
distinguish TCP, so we may end up having more checks above.

So do you really want to trade code readability with an extra alloc?

>

> The use in tcf_classify_ingress is a miss case so not the common path. If

> it is/was in the common path I would suggest we rip it out.

>


Excellent point, what about nf_bridge_unshare()? It is a common path
for bridge netfilter, which is also probably why skb ext was introduced
(IIRC). secpath_set() seems on a common path for XFRM too.

Are you suggesting to remove them all? ;)

Thanks.
John Fastabend Feb. 16, 2021, 12:54 a.m. UTC | #5
Cong Wang wrote:
> On Mon, Feb 15, 2021 at 3:57 PM John Fastabend <john.fastabend@gmail.com> wrote:

> >

> > For TCP case we can continue to use CB and not pay the price. For UDP

> > and AF_UNIX we can do the extra alloc.

> 

> I see your point, but specializing TCP case does not give much benefit

> here, the skmsg code would have to check skb->protocol etc. to decide

> whether to use TCP_SKB_CB() or skb_ext:

> 

> if (skb->protocol == ...)

>   TCP_SKB_CB(skb) = ...;

> else

>   ext = skb_ext_find(skb);

> 

> which looks ugly to me. And I doubt skb->protocol alone is sufficient to

> distinguish TCP, so we may end up having more checks above.

> 

> So do you really want to trade code readability with an extra alloc?


Above is ugly. So I look at where the patch replaces things,

sk_psock_tls_strp_read(), this is TLS specific read hook so can't really
work in generic case anyways.

sk_psock_strp_read(), will you have UDP, AF_UNIX stream parsers? Do these
even work outside TCP cases.

For these ones: sk_psock_verdict_apply(), sk_psock_verdict_recv(),
sk_psock_backlog(), can't we just do some refactoring around their
hook points so we know the context. For example sk_psock_tls_verdict_apply
is calling sk_psock_skb_redirect(). Why not have a sk_psock_unix_redirect()
and a sk_psock_udp_redirect(). There are likely some optimizations we can
deploy this way. We've already don this for tls and sk_msg types for example.

Then the helpers will know their types by program type, just use the right
variants.

So not suggestiong if/else the checks so much as having per type hooks.

> 

> >

> > The use in tcf_classify_ingress is a miss case so not the common path. If

> > it is/was in the common path I would suggest we rip it out.

> >

> 

> Excellent point, what about nf_bridge_unshare()? It is a common path

> for bridge netfilter, which is also probably why skb ext was introduced

> (IIRC). secpath_set() seems on a common path for XFRM too.


Yeah not nice, but we don't use nf_bridge so doesn't bother me.

> 

> Are you suggesting to remove them all? ;)


From the hotpath where I care about perfromance yes. 

> 

> Thanks.
Cong Wang Feb. 16, 2021, 1:04 a.m. UTC | #6
On Mon, Feb 15, 2021 at 4:54 PM John Fastabend <john.fastabend@gmail.com> wrote:
>

> Cong Wang wrote:

> > On Mon, Feb 15, 2021 at 3:57 PM John Fastabend <john.fastabend@gmail.com> wrote:

> > >

> > > For TCP case we can continue to use CB and not pay the price. For UDP

> > > and AF_UNIX we can do the extra alloc.

> >

> > I see your point, but specializing TCP case does not give much benefit

> > here, the skmsg code would have to check skb->protocol etc. to decide

> > whether to use TCP_SKB_CB() or skb_ext:

> >

> > if (skb->protocol == ...)

> >   TCP_SKB_CB(skb) = ...;

> > else

> >   ext = skb_ext_find(skb);

> >

> > which looks ugly to me. And I doubt skb->protocol alone is sufficient to

> > distinguish TCP, so we may end up having more checks above.

> >

> > So do you really want to trade code readability with an extra alloc?

>

> Above is ugly. So I look at where the patch replaces things,

>

> sk_psock_tls_strp_read(), this is TLS specific read hook so can't really

> work in generic case anyways.

>

> sk_psock_strp_read(), will you have UDP, AF_UNIX stream parsers? Do these

> even work outside TCP cases.

>

> For these ones: sk_psock_verdict_apply(), sk_psock_verdict_recv(),

> sk_psock_backlog(), can't we just do some refactoring around their

> hook points so we know the context. For example sk_psock_tls_verdict_apply

> is calling sk_psock_skb_redirect(). Why not have a sk_psock_unix_redirect()

> and a sk_psock_udp_redirect(). There are likely some optimizations we can

> deploy this way. We've already don this for tls and sk_msg types for example.

>

> Then the helpers will know their types by program type, just use the right

> variants.

>

> So not suggestiong if/else the checks so much as having per type hooks.

>


Hmm, but sk_psock_backlog() is still the only one that handles all three
above cases, right? It uses TCP_SKB_CB() too and more importantly it
is also why we can't use a per-cpu struct here (see bpf_redirect_info).

Thanks.
John Fastabend Feb. 16, 2021, 1:50 a.m. UTC | #7
Cong Wang wrote:
> On Mon, Feb 15, 2021 at 4:54 PM John Fastabend <john.fastabend@gmail.com> wrote:

> >

> > Cong Wang wrote:

> > > On Mon, Feb 15, 2021 at 3:57 PM John Fastabend <john.fastabend@gmail.com> wrote:

> > > >

> > > > For TCP case we can continue to use CB and not pay the price. For UDP

> > > > and AF_UNIX we can do the extra alloc.

> > >

> > > I see your point, but specializing TCP case does not give much benefit

> > > here, the skmsg code would have to check skb->protocol etc. to decide

> > > whether to use TCP_SKB_CB() or skb_ext:

> > >

> > > if (skb->protocol == ...)

> > >   TCP_SKB_CB(skb) = ...;

> > > else

> > >   ext = skb_ext_find(skb);

> > >

> > > which looks ugly to me. And I doubt skb->protocol alone is sufficient to

> > > distinguish TCP, so we may end up having more checks above.

> > >

> > > So do you really want to trade code readability with an extra alloc?

> >

> > Above is ugly. So I look at where the patch replaces things,

> >

> > sk_psock_tls_strp_read(), this is TLS specific read hook so can't really

> > work in generic case anyways.

> >

> > sk_psock_strp_read(), will you have UDP, AF_UNIX stream parsers? Do these

> > even work outside TCP cases.

> >

> > For these ones: sk_psock_verdict_apply(), sk_psock_verdict_recv(),

> > sk_psock_backlog(), can't we just do some refactoring around their

> > hook points so we know the context. For example sk_psock_tls_verdict_apply

> > is calling sk_psock_skb_redirect(). Why not have a sk_psock_unix_redirect()

> > and a sk_psock_udp_redirect(). There are likely some optimizations we can

> > deploy this way. We've already don this for tls and sk_msg types for example.

> >

> > Then the helpers will know their types by program type, just use the right

> > variants.

> >

> > So not suggestiong if/else the checks so much as having per type hooks.

> >

> 

> Hmm, but sk_psock_backlog() is still the only one that handles all three

> above cases, right? It uses TCP_SKB_CB() too and more importantly it

> is also why we can't use a per-cpu struct here (see bpf_redirect_info).


Right, but the workqueue is created at init time where we will know the 
socket type based on the program/map types so can build the redirect
backlog queue there based on the type needed. I also have a patch in
mind that would do more specific TCP things in that code anyways. I
can flush it out this week if anyone cares. The idea is we are wasting
lots of cycles using skb_send_sock_locked when we can just inject
the packet directlyy into the tcp stack.

Also on the original patch here, we can't just kfree_skb() on alloc
errors because that will look like a data drop. Errors need to be
handled gracefully without dropping data. At least in the TCP case,
but probably also in UDP and AF_UNIX cases as well. Original code
was pretty loose in this regard, but it caused users to write bug
reports and then I've been fixing most of them. If you see more
cases let me know.

> 

> Thanks.
Cong Wang Feb. 16, 2021, 4:06 a.m. UTC | #8
On Mon, Feb 15, 2021 at 5:50 PM John Fastabend <john.fastabend@gmail.com> wrote:
>

> Cong Wang wrote:

> > On Mon, Feb 15, 2021 at 4:54 PM John Fastabend <john.fastabend@gmail.com> wrote:

> > >

> > > Cong Wang wrote:

> > > > On Mon, Feb 15, 2021 at 3:57 PM John Fastabend <john.fastabend@gmail.com> wrote:

> > > > >

> > > > > For TCP case we can continue to use CB and not pay the price. For UDP

> > > > > and AF_UNIX we can do the extra alloc.

> > > >

> > > > I see your point, but specializing TCP case does not give much benefit

> > > > here, the skmsg code would have to check skb->protocol etc. to decide

> > > > whether to use TCP_SKB_CB() or skb_ext:

> > > >

> > > > if (skb->protocol == ...)

> > > >   TCP_SKB_CB(skb) = ...;

> > > > else

> > > >   ext = skb_ext_find(skb);

> > > >

> > > > which looks ugly to me. And I doubt skb->protocol alone is sufficient to

> > > > distinguish TCP, so we may end up having more checks above.

> > > >

> > > > So do you really want to trade code readability with an extra alloc?

> > >

> > > Above is ugly. So I look at where the patch replaces things,

> > >

> > > sk_psock_tls_strp_read(), this is TLS specific read hook so can't really

> > > work in generic case anyways.

> > >

> > > sk_psock_strp_read(), will you have UDP, AF_UNIX stream parsers? Do these

> > > even work outside TCP cases.

> > >

> > > For these ones: sk_psock_verdict_apply(), sk_psock_verdict_recv(),

> > > sk_psock_backlog(), can't we just do some refactoring around their

> > > hook points so we know the context. For example sk_psock_tls_verdict_apply

> > > is calling sk_psock_skb_redirect(). Why not have a sk_psock_unix_redirect()

> > > and a sk_psock_udp_redirect(). There are likely some optimizations we can

> > > deploy this way. We've already don this for tls and sk_msg types for example.

> > >

> > > Then the helpers will know their types by program type, just use the right

> > > variants.

> > >

> > > So not suggestiong if/else the checks so much as having per type hooks.

> > >

> >

> > Hmm, but sk_psock_backlog() is still the only one that handles all three

> > above cases, right? It uses TCP_SKB_CB() too and more importantly it

> > is also why we can't use a per-cpu struct here (see bpf_redirect_info).

>

> Right, but the workqueue is created at init time where we will know the

> socket type based on the program/map types so can build the redirect

> backlog queue there based on the type needed. I also have a patch in


Hmm? How could a socket type match the skb type when we redirect
across-protocol?

In my use case, I want to redirect an AF_UNIX skb to a UDP socket,
clearly checking the UDP socket workqueue can't find out it is an
AF_UNIX skb. It has to be a per-skb check.

> mind that would do more specific TCP things in that code anyways. I

> can flush it out this week if anyone cares. The idea is we are wasting

> lots of cycles using skb_send_sock_locked when we can just inject

> the packet directlyy into the tcp stack.


Actually I did try the same, it clearly doesn't work for cross-protocol.

Anyway, please let me know what your suggestion for skb ext here?

It looks like we either have some ugly packet type checks, or just
use the skb ext. I don't see any other way yet, I also explored the
struct sk_buff again and still can not find anything we can reuse.
(_skb_refdst can only be reused very briefly with
tcp_skb_tsorted_save().)

Therefore, I believe using skb ext is still the best solution here.

>

> Also on the original patch here, we can't just kfree_skb() on alloc

> errors because that will look like a data drop. Errors need to be

> handled gracefully without dropping data. At least in the TCP case,

> but probably also in UDP and AF_UNIX cases as well. Original code

> was pretty loose in this regard, but it caused users to write bug

> reports and then I've been fixing most of them. If you see more

> cases let me know.


What's your suggestion here? Return -EAGAIN? But it requires
the caller put it in a loop to be graceful, but we can't do it in, for
example, sk_psock_tls_strp_read().

Thanks.
Lorenz Bauer Feb. 16, 2021, 8:56 a.m. UTC | #9
On Mon, 15 Feb 2021 at 19:20, John Fastabend <john.fastabend@gmail.com> wrote:
>

> Cong Wang wrote:

> > From: Cong Wang <cong.wang@bytedance.com>

> >

> > Currently TCP_SKB_CB() is hard-coded in skmsg code, it certainly

> > does not work for any other non-TCP protocols. We can move them to

> > skb ext instead of playing with skb cb, which is harder to make

> > correct.

> >

> > Cc: John Fastabend <john.fastabend@gmail.com>

> > Cc: Daniel Borkmann <daniel@iogearbox.net>

> > Cc: Jakub Sitnicki <jakub@cloudflare.com>

> > Reviewed-by: Lorenz Bauer <lmb@cloudflare.com>

> > Signed-off-by: Cong Wang <cong.wang@bytedance.com>

> > ---

>

> I'm not seeing the advantage of doing this at the moment. We can

> continue to use cb[] here, which is simpler IMO and use the ext

> if needed for the other use cases. This is adding a per packet

> alloc cost that we don't have at the moment as I understand it.


John, do you have a benchmark we can look at? Right now we're arguing
in the abstract.
John Fastabend Feb. 17, 2021, 7:19 p.m. UTC | #10
Lorenz Bauer wrote:
> On Mon, 15 Feb 2021 at 19:20, John Fastabend <john.fastabend@gmail.com> wrote:

> >

> > Cong Wang wrote:

> > > From: Cong Wang <cong.wang@bytedance.com>

> > >

> > > Currently TCP_SKB_CB() is hard-coded in skmsg code, it certainly

> > > does not work for any other non-TCP protocols. We can move them to

> > > skb ext instead of playing with skb cb, which is harder to make

> > > correct.

> > >

> > > Cc: John Fastabend <john.fastabend@gmail.com>

> > > Cc: Daniel Borkmann <daniel@iogearbox.net>

> > > Cc: Jakub Sitnicki <jakub@cloudflare.com>

> > > Reviewed-by: Lorenz Bauer <lmb@cloudflare.com>

> > > Signed-off-by: Cong Wang <cong.wang@bytedance.com>

> > > ---

> >

> > I'm not seeing the advantage of doing this at the moment. We can

> > continue to use cb[] here, which is simpler IMO and use the ext

> > if needed for the other use cases. This is adding a per packet

> > alloc cost that we don't have at the moment as I understand it.

> 

> John, do you have a benchmark we can look at? Right now we're arguing

> in the abstract.


Sure, but looks like Cong found some spare fields in sk_buff so
that looks much nicer.

I'll mess aound a bit with our benchmarks and see where we can
publish them. It would be good to have some repeatable tests
here folks can use.

Thanks,
John
diff mbox series

Patch

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 46f901adf1a8..2d4ffe77ef47 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4166,6 +4166,9 @@  enum skb_ext_id {
 #endif
 #if IS_ENABLED(CONFIG_MPTCP)
 	SKB_EXT_MPTCP,
+#endif
+#if IS_ENABLED(CONFIG_NET_SOCK_MSG)
+	SKB_EXT_BPF,
 #endif
 	SKB_EXT_NUM, /* must be last */
 };
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index e3bb712af257..d5c711ef6d4b 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -459,4 +459,44 @@  static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
 		return false;
 	return !!psock->saved_data_ready;
 }
+
+struct skb_bpf_ext {
+	__u32 flags;
+	struct sock *sk_redir;
+};
+
+#if IS_ENABLED(CONFIG_NET_SOCK_MSG)
+static inline
+bool skb_bpf_ext_ingress(const struct sk_buff *skb)
+{
+	struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF);
+
+	return ext->flags & BPF_F_INGRESS;
+}
+
+static inline
+void skb_bpf_ext_set_ingress(const struct sk_buff *skb)
+{
+	struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF);
+
+	ext->flags |= BPF_F_INGRESS;
+}
+
+static inline
+struct sock *skb_bpf_ext_redirect_fetch(struct sk_buff *skb)
+{
+	struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF);
+
+	return ext->sk_redir;
+}
+
+static inline
+void skb_bpf_ext_redirect_clear(struct sk_buff *skb)
+{
+	struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF);
+
+	ext->flags = 0;
+	ext->sk_redir = NULL;
+}
+#endif /* CONFIG_NET_SOCK_MSG */
 #endif /* _LINUX_SKMSG_H */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 697712178eff..e35881f837b2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -882,30 +882,11 @@  struct tcp_skb_cb {
 			struct inet6_skb_parm	h6;
 #endif
 		} header;	/* For incoming skbs */
-		struct {
-			__u32 flags;
-			struct sock *sk_redir;
-		} bpf;
 	};
 };
 
 #define TCP_SKB_CB(__skb)	((struct tcp_skb_cb *)&((__skb)->cb[0]))
 
-static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb)
-{
-	return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS;
-}
-
-static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb)
-{
-	return TCP_SKB_CB(skb)->bpf.sk_redir;
-}
-
-static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb)
-{
-	TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
-}
-
 extern const struct inet_connection_sock_af_ops ipv4_specific;
 
 #if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/Kconfig b/net/Kconfig
index a4f60d0c630f..9b4dd1ad2188 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -419,6 +419,7 @@  config SOCK_VALIDATE_XMIT
 
 config NET_SOCK_MSG
 	bool
+	select SKB_EXTENSIONS
 	default n
 	help
 	  The NET_SOCK_MSG provides a framework for plain sockets (e.g. TCP) or
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 145503d3f06b..7695a2b65832 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -60,6 +60,7 @@ 
 #include <linux/prefetch.h>
 #include <linux/if_vlan.h>
 #include <linux/mpls.h>
+#include <linux/skmsg.h>
 
 #include <net/protocol.h>
 #include <net/dst.h>
@@ -4259,6 +4260,9 @@  static const u8 skb_ext_type_len[] = {
 #if IS_ENABLED(CONFIG_MPTCP)
 	[SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext),
 #endif
+#if IS_ENABLED(CONFIG_NET_SOCK_MSG)
+	[SKB_EXT_BPF] = SKB_EXT_CHUNKSIZEOF(struct skb_bpf_ext),
+#endif
 };
 
 static __always_inline unsigned int skb_ext_total_length(void)
@@ -4275,6 +4279,9 @@  static __always_inline unsigned int skb_ext_total_length(void)
 #endif
 #if IS_ENABLED(CONFIG_MPTCP)
 		skb_ext_type_len[SKB_EXT_MPTCP] +
+#endif
+#if IS_ENABLED(CONFIG_NET_SOCK_MSG)
+		skb_ext_type_len[SKB_EXT_BPF] +
 #endif
 		0;
 }
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 2d8bbb3fd87c..9404dbf5d57b 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -525,7 +525,8 @@  static void sk_psock_backlog(struct work_struct *work)
 		len = skb->len;
 		off = 0;
 start:
-		ingress = tcp_skb_bpf_ingress(skb);
+		ingress = skb_bpf_ext_ingress(skb);
+		skb_ext_del(skb, SKB_EXT_BPF);
 		do {
 			ret = -EIO;
 			if (likely(psock->sk->sk_socket))
@@ -752,7 +753,7 @@  static void sk_psock_skb_redirect(struct sk_buff *skb)
 	struct sk_psock *psock_other;
 	struct sock *sk_other;
 
-	sk_other = tcp_skb_bpf_redirect_fetch(skb);
+	sk_other = skb_bpf_ext_redirect_fetch(skb);
 	/* This error is a buggy BPF program, it returned a redirect
 	 * return code, but then didn't set a redirect interface.
 	 */
@@ -794,6 +795,9 @@  int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
 	struct bpf_prog *prog;
 	int ret = __SK_PASS;
 
+	if (!skb_ext_add(skb, SKB_EXT_BPF))
+		return __SK_DROP;
+
 	rcu_read_lock();
 	prog = READ_ONCE(psock->progs.skb_verdict);
 	if (likely(prog)) {
@@ -802,9 +806,9 @@  int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
 		 * TLS context.
 		 */
 		skb->sk = psock->sk;
-		tcp_skb_bpf_redirect_clear(skb);
+		skb_bpf_ext_redirect_clear(skb);
 		ret = sk_psock_bpf_run(psock, prog, skb);
-		ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+		ret = sk_psock_map_verd(ret, skb_bpf_ext_redirect_fetch(skb));
 		skb->sk = NULL;
 	}
 	sk_psock_tls_verdict_apply(skb, psock->sk, ret);
@@ -816,7 +820,6 @@  EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read);
 static void sk_psock_verdict_apply(struct sk_psock *psock,
 				   struct sk_buff *skb, int verdict)
 {
-	struct tcp_skb_cb *tcp;
 	struct sock *sk_other;
 	int err = -EIO;
 
@@ -828,9 +831,7 @@  static void sk_psock_verdict_apply(struct sk_psock *psock,
 			goto out_free;
 		}
 
-		tcp = TCP_SKB_CB(skb);
-		tcp->bpf.flags |= BPF_F_INGRESS;
-
+		skb_bpf_ext_set_ingress(skb);
 		/* If the queue is empty then we can submit directly
 		 * into the msg queue. If its not empty we have to
 		 * queue work otherwise we may get OOO data. Otherwise,
@@ -888,11 +889,15 @@  static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
 		goto out;
 	}
 	skb_set_owner_r(skb, sk);
+	if (!skb_ext_add(skb, SKB_EXT_BPF)) {
+		kfree_skb(skb);
+		goto out;
+	}
 	prog = READ_ONCE(psock->progs.skb_verdict);
 	if (likely(prog)) {
-		tcp_skb_bpf_redirect_clear(skb);
+		skb_bpf_ext_redirect_clear(skb);
 		ret = sk_psock_bpf_run(psock, prog, skb);
-		ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+		ret = sk_psock_map_verd(ret, skb_bpf_ext_redirect_fetch(skb));
 	}
 	sk_psock_verdict_apply(psock, skb, ret);
 out:
@@ -1003,11 +1008,17 @@  static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
 		goto out;
 	}
 	skb_set_owner_r(skb, sk);
+	if (!skb_ext_add(skb, SKB_EXT_BPF)) {
+		len = 0;
+		kfree_skb(skb);
+		goto out;
+	}
+
 	prog = READ_ONCE(psock->progs.skb_verdict);
 	if (likely(prog)) {
-		tcp_skb_bpf_redirect_clear(skb);
+		skb_bpf_ext_redirect_clear(skb);
 		ret = sk_psock_bpf_run(psock, prog, skb);
-		ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+		ret = sk_psock_map_verd(ret, skb_bpf_ext_redirect_fetch(skb));
 	}
 	sk_psock_verdict_apply(psock, skb, ret);
 out:
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 1a28a5c2c61e..e9f2a17fb665 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -657,7 +657,7 @@  const struct bpf_func_proto bpf_sock_map_update_proto = {
 BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
 	   struct bpf_map *, map, u32, key, u64, flags)
 {
-	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+	struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF);
 	struct sock *sk;
 
 	if (unlikely(flags & ~(BPF_F_INGRESS)))
@@ -667,8 +667,8 @@  BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
 	if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
 		return SK_DROP;
 
-	tcb->bpf.flags = flags;
-	tcb->bpf.sk_redir = sk;
+	ext->flags = flags;
+	ext->sk_redir = sk;
 	return SK_PASS;
 }
 
@@ -1250,7 +1250,7 @@  const struct bpf_func_proto bpf_sock_hash_update_proto = {
 BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
 	   struct bpf_map *, map, void *, key, u64, flags)
 {
-	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+	struct skb_bpf_ext *ext = skb_ext_find(skb, SKB_EXT_BPF);
 	struct sock *sk;
 
 	if (unlikely(flags & ~(BPF_F_INGRESS)))
@@ -1260,8 +1260,8 @@  BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
 	if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
 		return SK_DROP;
 
-	tcb->bpf.flags = flags;
-	tcb->bpf.sk_redir = sk;
+	ext->flags = flags;
+	ext->sk_redir = sk;
 	return SK_PASS;
 }