diff mbox series

[bpf-next,v4,3/3] bpf: remove extra lock_sock for TCP_ZEROCOPY_RECEIVE

Message ID 20210107184305.444635-4-sdf@google.com
State New
Headers show
Series None | expand

Commit Message

Stanislav Fomichev Jan. 7, 2021, 6:43 p.m. UTC
Add custom implementation of getsockopt hook for TCP_ZEROCOPY_RECEIVE.
We skip generic hooks for TCP_ZEROCOPY_RECEIVE and have a custom
call in do_tcp_getsockopt using the on-stack data. This removes
2% overhead for locking/unlocking the socket.

Also:
- Removed BUILD_BUG_ON (zerocopy doesn't depend on the buf size anymore)
- Separated on-stack buffer into bpf_sockopt_buf and downsized to 32 bytes
  (let's keep it to help with the other options)

(I can probably split this patch into two: add new features and rework
 bpf_sockopt_buf; can follow up if the approach in general sounds
 good).

Without this patch:
     1.87%     0.06%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt

With the patch applied:
     0.52%     0.12%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt_kern

Signed-off-by: Stanislav Fomichev <sdf@google.com>
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
---
 include/linux/bpf-cgroup.h                    | 25 ++++-
 include/linux/filter.h                        |  6 +-
 include/net/sock.h                            |  2 +
 include/net/tcp.h                             |  1 +
 kernel/bpf/cgroup.c                           | 93 +++++++++++++------
 net/ipv4/tcp.c                                | 14 +++
 net/ipv4/tcp_ipv4.c                           |  1 +
 net/ipv6/tcp_ipv6.c                           |  1 +
 .../selftests/bpf/prog_tests/sockopt_sk.c     | 22 +++++
 .../testing/selftests/bpf/progs/sockopt_sk.c  | 15 +++
 10 files changed, 147 insertions(+), 33 deletions(-)

Comments

Martin KaFai Lau Jan. 8, 2021, 1:08 a.m. UTC | #1
On Thu, Jan 07, 2021 at 10:43:05AM -0800, Stanislav Fomichev wrote:
> Add custom implementation of getsockopt hook for TCP_ZEROCOPY_RECEIVE.

> We skip generic hooks for TCP_ZEROCOPY_RECEIVE and have a custom

> call in do_tcp_getsockopt using the on-stack data. This removes

> 2% overhead for locking/unlocking the socket.

> 

> Also:

> - Removed BUILD_BUG_ON (zerocopy doesn't depend on the buf size anymore)

> - Separated on-stack buffer into bpf_sockopt_buf and downsized to 32 bytes

>   (let's keep it to help with the other options)

> 

> (I can probably split this patch into two: add new features and rework

>  bpf_sockopt_buf; can follow up if the approach in general sounds

>  good).

> 

> Without this patch:

>      1.87%     0.06%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt

> 

> With the patch applied:

>      0.52%     0.12%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt_kern

> 

> Signed-off-by: Stanislav Fomichev <sdf@google.com>

> Cc: Martin KaFai Lau <kafai@fb.com>

> Cc: Song Liu <songliubraving@fb.com>

> Cc: Eric Dumazet <edumazet@google.com>

> ---

>  include/linux/bpf-cgroup.h                    | 25 ++++-

>  include/linux/filter.h                        |  6 +-

>  include/net/sock.h                            |  2 +

>  include/net/tcp.h                             |  1 +

>  kernel/bpf/cgroup.c                           | 93 +++++++++++++------

>  net/ipv4/tcp.c                                | 14 +++

>  net/ipv4/tcp_ipv4.c                           |  1 +

>  net/ipv6/tcp_ipv6.c                           |  1 +

>  .../selftests/bpf/prog_tests/sockopt_sk.c     | 22 +++++

>  .../testing/selftests/bpf/progs/sockopt_sk.c  | 15 +++

>  10 files changed, 147 insertions(+), 33 deletions(-)

>


[ ... ]

> @@ -454,6 +469,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,

>  #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; })

>  #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \

>  				       optlen, max_optlen, retval) ({ retval; })

> +#define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \

> +					    optlen, retval) ({ retval; })

>  #define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \

>  				       kernel_optval) ({ 0; })

>  

> diff --git a/include/linux/filter.h b/include/linux/filter.h

> index 54a4225f36d8..8739f1d4cac4 100644

> --- a/include/linux/filter.h

> +++ b/include/linux/filter.h

> @@ -1281,7 +1281,10 @@ struct bpf_sysctl_kern {

>  	u64 tmp_reg;

>  };

>  

> -#define BPF_SOCKOPT_KERN_BUF_SIZE	64

> +#define BPF_SOCKOPT_KERN_BUF_SIZE	32

It is reduced from patch 1 because there is no
need to use the buf (and copy from/to buf) in TCP_ZEROCOPY_RECEIVE?

Patch 1 is still desired (and kept in this set) because it may still
benefit other optnames?

> +struct bpf_sockopt_buf {

> +	u8		data[BPF_SOCKOPT_KERN_BUF_SIZE];

> +};

>  

>  struct bpf_sockopt_kern {

>  	struct sock	*sk;

> @@ -1291,7 +1294,6 @@ struct bpf_sockopt_kern {

>  	s32		optname;

>  	s32		optlen;

>  	s32		retval;

> -	u8		buf[BPF_SOCKOPT_KERN_BUF_SIZE];

It is better to pick one way to do things to avoid code
churn like this within the same series.

>  };

>  

>  int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len);

> diff --git a/include/net/sock.h b/include/net/sock.h

> index bdc4323ce53c..ebf44d724845 100644

> --- a/include/net/sock.h

> +++ b/include/net/sock.h

> @@ -1174,6 +1174,8 @@ struct proto {

>  

>  	int			(*backlog_rcv) (struct sock *sk,

>  						struct sk_buff *skb);

> +	bool			(*bpf_bypass_getsockopt)(int level,

> +							 int optname);

>  

>  	void		(*release_cb)(struct sock *sk);

>  

> diff --git a/include/net/tcp.h b/include/net/tcp.h

> index 78d13c88720f..4bb42fb19711 100644

> --- a/include/net/tcp.h

> +++ b/include/net/tcp.h

> @@ -403,6 +403,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock,

>  		      struct poll_table_struct *wait);

>  int tcp_getsockopt(struct sock *sk, int level, int optname,

>  		   char __user *optval, int __user *optlen);

> +bool tcp_bpf_bypass_getsockopt(int level, int optname);

>  int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,

>  		   unsigned int optlen);

>  void tcp_set_keepalive(struct sock *sk, int val);

> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c

> index adbecdcaa370..e82df63aedc7 100644

> --- a/kernel/bpf/cgroup.c

> +++ b/kernel/bpf/cgroup.c

> @@ -16,7 +16,6 @@

>  #include <linux/bpf-cgroup.h>

>  #include <net/sock.h>

>  #include <net/bpf_sk_storage.h>

> -#include <uapi/linux/tcp.h> /* sizeof(struct tcp_zerocopy_receive) */

Can the patches be re-ordered a little to avoid code churn like this
in the same series?

It feels like this patch 3 should be the first patch instead.
The current patch 1 should be the second patch
but it can still use the tcp_mmap to show potential
benefit for other optnames.

>  

>  #include "../cgroup/cgroup-internal.h"

>  

> @@ -1299,7 +1298,8 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,

>  	return empty;

>  }

>  

> -static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)

> +static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,

> +			     struct bpf_sockopt_buf *buf)

>  {

>  	if (unlikely(max_optlen < 0))

>  		return -EINVAL;

> @@ -1311,18 +1311,11 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)

>  		max_optlen = PAGE_SIZE;

>  	}

>  

> -	if (max_optlen <= sizeof(ctx->buf)) {

> +	if (max_optlen <= sizeof(buf->data)) {

>  		/* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE

>  		 * bytes avoid the cost of kzalloc.

> -		 *

> -		 * In order to remove extra allocations from the TCP

> -		 * fast zero-copy path ensure that buffer covers

> -		 * the size of struct tcp_zerocopy_receive.

>  		 */

> -		BUILD_BUG_ON(sizeof(struct tcp_zerocopy_receive) >

> -			     BPF_SOCKOPT_KERN_BUF_SIZE);

> -

> -		ctx->optval = ctx->buf;

> +		ctx->optval = buf->data;

>  		ctx->optval_end = ctx->optval + max_optlen;

>  		return max_optlen;

>  	}

> @@ -1336,16 +1329,18 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)

>  	return max_optlen;

>  }

>  

> -static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)

> +static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,

> +			     struct bpf_sockopt_buf *buf)

>  {

> -	if (ctx->optval == ctx->buf)

> +	if (ctx->optval == buf->data)

>  		return;

>  	kfree(ctx->optval);

>  }

>  

> -static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx)

> +static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,

> +				  struct bpf_sockopt_buf *buf)

>  {

> -	return ctx->optval != ctx->buf;

> +	return ctx->optval != buf->data;

>  }

>  

>  int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,

> @@ -1353,6 +1348,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,

>  				       int *optlen, char **kernel_optval)

>  {

>  	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);

> +	struct bpf_sockopt_buf buf = {};

>  	struct bpf_sockopt_kern ctx = {

>  		.sk = sk,

>  		.level = *level,

> @@ -1373,7 +1369,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,

>  	 */

>  	max_optlen = max_t(int, 16, *optlen);

>  

> -	max_optlen = sockopt_alloc_buf(&ctx, max_optlen);

> +	max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);

>  	if (max_optlen < 0)

>  		return max_optlen;

>  

> @@ -1419,7 +1415,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,

>  			 * No way to export on-stack buf, have to allocate a

>  			 * new buffer.

>  			 */

> -			if (!sockopt_buf_allocated(&ctx)) {

> +			if (!sockopt_buf_allocated(&ctx, &buf)) {

>  				void *p = kzalloc(ctx.optlen, GFP_USER);

>  

>  				if (!p) {

> @@ -1436,7 +1432,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,

>  

>  out:

>  	if (ret)

> -		sockopt_free_buf(&ctx);

> +		sockopt_free_buf(&ctx, &buf);

>  	return ret;

>  }

>  

> @@ -1445,15 +1441,20 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,

>  				       int __user *optlen, int max_optlen,

>  				       int retval)

>  {

> -	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);

> -	struct bpf_sockopt_kern ctx = {

> -		.sk = sk,

> -		.level = level,

> -		.optname = optname,

> -		.retval = retval,

> -	};

This change looks unnecessary?

> +	struct bpf_sockopt_kern ctx;

> +	struct bpf_sockopt_buf buf;

> +	struct cgroup *cgrp;

>  	int ret;

>  

> +	memset(&buf, 0, sizeof(buf));

> +	memset(&ctx, 0, sizeof(ctx));

> +

> +	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);

> +	ctx.sk = sk;

> +	ctx.level = level;

> +	ctx.optname = optname;

> +	ctx.retval = retval;

> +
Stanislav Fomichev Jan. 8, 2021, 1:25 a.m. UTC | #2
On Thu, Jan 7, 2021 at 5:09 PM Martin KaFai Lau <kafai@fb.com> wrote:
>

> On Thu, Jan 07, 2021 at 10:43:05AM -0800, Stanislav Fomichev wrote:

> > Add custom implementation of getsockopt hook for TCP_ZEROCOPY_RECEIVE.

> > We skip generic hooks for TCP_ZEROCOPY_RECEIVE and have a custom

> > call in do_tcp_getsockopt using the on-stack data. This removes

> > 2% overhead for locking/unlocking the socket.

> >

> > Also:

> > - Removed BUILD_BUG_ON (zerocopy doesn't depend on the buf size anymore)

> > - Separated on-stack buffer into bpf_sockopt_buf and downsized to 32 bytes

> >   (let's keep it to help with the other options)

> >

> > (I can probably split this patch into two: add new features and rework

> >  bpf_sockopt_buf; can follow up if the approach in general sounds

> >  good).

> >

> > Without this patch:

> >      1.87%     0.06%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt

> >

> > With the patch applied:

> >      0.52%     0.12%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt_kern

> >

> > Signed-off-by: Stanislav Fomichev <sdf@google.com>

> > Cc: Martin KaFai Lau <kafai@fb.com>

> > Cc: Song Liu <songliubraving@fb.com>

> > Cc: Eric Dumazet <edumazet@google.com>

> > ---

> >  include/linux/bpf-cgroup.h                    | 25 ++++-

> >  include/linux/filter.h                        |  6 +-

> >  include/net/sock.h                            |  2 +

> >  include/net/tcp.h                             |  1 +

> >  kernel/bpf/cgroup.c                           | 93 +++++++++++++------

> >  net/ipv4/tcp.c                                | 14 +++

> >  net/ipv4/tcp_ipv4.c                           |  1 +

> >  net/ipv6/tcp_ipv6.c                           |  1 +

> >  .../selftests/bpf/prog_tests/sockopt_sk.c     | 22 +++++

> >  .../testing/selftests/bpf/progs/sockopt_sk.c  | 15 +++

> >  10 files changed, 147 insertions(+), 33 deletions(-)

> >

>

> [ ... ]

>

> > @@ -454,6 +469,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,

> >  #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; })

> >  #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \

> >                                      optlen, max_optlen, retval) ({ retval; })

> > +#define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \

> > +                                         optlen, retval) ({ retval; })

> >  #define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \

> >                                      kernel_optval) ({ 0; })

> >

> > diff --git a/include/linux/filter.h b/include/linux/filter.h

> > index 54a4225f36d8..8739f1d4cac4 100644

> > --- a/include/linux/filter.h

> > +++ b/include/linux/filter.h

> > @@ -1281,7 +1281,10 @@ struct bpf_sysctl_kern {

> >       u64 tmp_reg;

> >  };

> >

> > -#define BPF_SOCKOPT_KERN_BUF_SIZE    64

> > +#define BPF_SOCKOPT_KERN_BUF_SIZE    32

> It is reduced from patch 1 because there is no

> need to use the buf (and copy from/to buf) in TCP_ZEROCOPY_RECEIVE?

>

> Patch 1 is still desired (and kept in this set) because it may still

> benefit other optname?

Right, it seems like a good idea to keep it to help with the (majority of?)
small socket options.

> > +struct bpf_sockopt_buf {

> > +     u8              data[BPF_SOCKOPT_KERN_BUF_SIZE];

> > +};

> >

> >  struct bpf_sockopt_kern {

> >       struct sock     *sk;

> > @@ -1291,7 +1294,6 @@ struct bpf_sockopt_kern {

> >       s32             optname;

> >       s32             optlen;

> >       s32             retval;

> > -     u8              buf[BPF_SOCKOPT_KERN_BUF_SIZE];

> It is better to pick one way to do thing to avoid code

> churn like this within the same series.

Agreed. I pointed out in the commit description that it might be a
good idea to separate those changes.
I wasn't sure about the fate of this patch when I first sent it out
and didn't spend too much time on this sort of stuff.
Let me simplify/reorder as you suggested below and resend.

> >  };

> >

> >  int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len);

> > diff --git a/include/net/sock.h b/include/net/sock.h

> > index bdc4323ce53c..ebf44d724845 100644

> > --- a/include/net/sock.h

> > +++ b/include/net/sock.h

> > @@ -1174,6 +1174,8 @@ struct proto {

> >

> >       int                     (*backlog_rcv) (struct sock *sk,

> >                                               struct sk_buff *skb);

> > +     bool                    (*bpf_bypass_getsockopt)(int level,

> > +                                                      int optname);

> >

> >       void            (*release_cb)(struct sock *sk);

> >

> > diff --git a/include/net/tcp.h b/include/net/tcp.h

> > index 78d13c88720f..4bb42fb19711 100644

> > --- a/include/net/tcp.h

> > +++ b/include/net/tcp.h

> > @@ -403,6 +403,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock,

> >                     struct poll_table_struct *wait);

> >  int tcp_getsockopt(struct sock *sk, int level, int optname,

> >                  char __user *optval, int __user *optlen);

> > +bool tcp_bpf_bypass_getsockopt(int level, int optname);

> >  int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,

> >                  unsigned int optlen);

> >  void tcp_set_keepalive(struct sock *sk, int val);

> > diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c

> > index adbecdcaa370..e82df63aedc7 100644

> > --- a/kernel/bpf/cgroup.c

> > +++ b/kernel/bpf/cgroup.c

> > @@ -16,7 +16,6 @@

> >  #include <linux/bpf-cgroup.h>

> >  #include <net/sock.h>

> >  #include <net/bpf_sk_storage.h>

> > -#include <uapi/linux/tcp.h> /* sizeof(struct tcp_zerocopy_receive) */

> Can the patches be re-ordered a little to avoid code churn like this

> in the same series?

>

> It feels like this patch 3 should be the first patch instead.

> The current patch 1 should be the second patch

> but it can still use the tcp_mmap to show potential

> benefit for other optnames.

>

> >

> >  #include "../cgroup/cgroup-internal.h"

> >

> > @@ -1299,7 +1298,8 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,

> >       return empty;

> >  }

> >

> > -static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)

> > +static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,

> > +                          struct bpf_sockopt_buf *buf)

> >  {

> >       if (unlikely(max_optlen < 0))

> >               return -EINVAL;

> > @@ -1311,18 +1311,11 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)

> >               max_optlen = PAGE_SIZE;

> >       }

> >

> > -     if (max_optlen <= sizeof(ctx->buf)) {

> > +     if (max_optlen <= sizeof(buf->data)) {

> >               /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE

> >                * bytes avoid the cost of kzalloc.

> > -              *

> > -              * In order to remove extra allocations from the TCP

> > -              * fast zero-copy path ensure that buffer covers

> > -              * the size of struct tcp_zerocopy_receive.

> >                */

> > -             BUILD_BUG_ON(sizeof(struct tcp_zerocopy_receive) >

> > -                          BPF_SOCKOPT_KERN_BUF_SIZE);

> > -

> > -             ctx->optval = ctx->buf;

> > +             ctx->optval = buf->data;

> >               ctx->optval_end = ctx->optval + max_optlen;

> >               return max_optlen;

> >       }

> > @@ -1336,16 +1329,18 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)

> >       return max_optlen;

> >  }

> >

> > -static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)

> > +static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,

> > +                          struct bpf_sockopt_buf *buf)

> >  {

> > -     if (ctx->optval == ctx->buf)

> > +     if (ctx->optval == buf->data)

> >               return;

> >       kfree(ctx->optval);

> >  }

> >

> > -static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx)

> > +static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,

> > +                               struct bpf_sockopt_buf *buf)

> >  {

> > -     return ctx->optval != ctx->buf;

> > +     return ctx->optval != buf->data;

> >  }

> >

> >  int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,

> > @@ -1353,6 +1348,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,

> >                                      int *optlen, char **kernel_optval)

> >  {

> >       struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);

> > +     struct bpf_sockopt_buf buf = {};

> >       struct bpf_sockopt_kern ctx = {

> >               .sk = sk,

> >               .level = *level,

> > @@ -1373,7 +1369,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,

> >        */

> >       max_optlen = max_t(int, 16, *optlen);

> >

> > -     max_optlen = sockopt_alloc_buf(&ctx, max_optlen);

> > +     max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);

> >       if (max_optlen < 0)

> >               return max_optlen;

> >

> > @@ -1419,7 +1415,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,

> >                        * No way to export on-stack buf, have to allocate a

> >                        * new buffer.

> >                        */

> > -                     if (!sockopt_buf_allocated(&ctx)) {

> > +                     if (!sockopt_buf_allocated(&ctx, &buf)) {

> >                               void *p = kzalloc(ctx.optlen, GFP_USER);

> >

> >                               if (!p) {

> > @@ -1436,7 +1432,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,

> >

> >  out:

> >       if (ret)

> > -             sockopt_free_buf(&ctx);

> > +             sockopt_free_buf(&ctx, &buf);

> >       return ret;

> >  }

> >

> > @@ -1445,15 +1441,20 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,

> >                                      int __user *optlen, int max_optlen,

> >                                      int retval)

> >  {

> > -     struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);

> > -     struct bpf_sockopt_kern ctx = {

> > -             .sk = sk,

> > -             .level = level,

> > -             .optname = optname,

> > -             .retval = retval,

> > -     };

> This change looks unnecessary?

>

> > +     struct bpf_sockopt_kern ctx;

> > +     struct bpf_sockopt_buf buf;

> > +     struct cgroup *cgrp;

> >       int ret;

> >

> > +     memset(&buf, 0, sizeof(buf));

> > +     memset(&ctx, 0, sizeof(ctx));

> > +

> > +     cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);

> > +     ctx.sk = sk;

> > +     ctx.level = level;

> > +     ctx.optname = optname;

> > +     ctx.retval = retval;

> > +
diff mbox series

Patch

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index dd4b8e300746..cbba9c9ab073 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -147,6 +147,10 @@  int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 				       int __user *optlen, int max_optlen,
 				       int retval);
 
+int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
+					    int optname, void *optval,
+					    int *optlen, int retval);
+
 static inline enum bpf_cgroup_storage_type cgroup_storage_type(
 	struct bpf_map *map)
 {
@@ -366,10 +370,21 @@  int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 ({									       \
 	int __ret = retval;						       \
 	if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT))			       \
-		__ret = __cgroup_bpf_run_filter_getsockopt(sock, level,	       \
-							   optname, optval,    \
-							   optlen, max_optlen, \
-							   retval);	       \
+		if (!(sock)->sk_prot->bpf_bypass_getsockopt ||		       \
+		    !(sock)->sk_prot->bpf_bypass_getsockopt(level, optname))   \
+			__ret = __cgroup_bpf_run_filter_getsockopt(	       \
+				sock, level, optname, optval, optlen,	       \
+				max_optlen, retval);			       \
+	__ret;								       \
+})
+
+#define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval,      \
+					    optlen, retval)		       \
+({									       \
+	int __ret = retval;						       \
+	if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT))			       \
+		__ret = __cgroup_bpf_run_filter_getsockopt_kern(	       \
+			sock, level, optname, optval, optlen, retval);	       \
 	__ret;								       \
 })
 
@@ -454,6 +469,8 @@  static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \
 				       optlen, max_optlen, retval) ({ retval; })
+#define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \
+					    optlen, retval) ({ retval; })
 #define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \
 				       kernel_optval) ({ 0; })
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 54a4225f36d8..8739f1d4cac4 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1281,7 +1281,10 @@  struct bpf_sysctl_kern {
 	u64 tmp_reg;
 };
 
-#define BPF_SOCKOPT_KERN_BUF_SIZE	64
+#define BPF_SOCKOPT_KERN_BUF_SIZE	32
+struct bpf_sockopt_buf {
+	u8		data[BPF_SOCKOPT_KERN_BUF_SIZE];
+};
 
 struct bpf_sockopt_kern {
 	struct sock	*sk;
@@ -1291,7 +1294,6 @@  struct bpf_sockopt_kern {
 	s32		optname;
 	s32		optlen;
 	s32		retval;
-	u8		buf[BPF_SOCKOPT_KERN_BUF_SIZE];
 };
 
 int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len);
diff --git a/include/net/sock.h b/include/net/sock.h
index bdc4323ce53c..ebf44d724845 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1174,6 +1174,8 @@  struct proto {
 
 	int			(*backlog_rcv) (struct sock *sk,
 						struct sk_buff *skb);
+	bool			(*bpf_bypass_getsockopt)(int level,
+							 int optname);
 
 	void		(*release_cb)(struct sock *sk);
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 78d13c88720f..4bb42fb19711 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -403,6 +403,7 @@  __poll_t tcp_poll(struct file *file, struct socket *sock,
 		      struct poll_table_struct *wait);
 int tcp_getsockopt(struct sock *sk, int level, int optname,
 		   char __user *optval, int __user *optlen);
+bool tcp_bpf_bypass_getsockopt(int level, int optname);
 int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
 		   unsigned int optlen);
 void tcp_set_keepalive(struct sock *sk, int val);
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index adbecdcaa370..e82df63aedc7 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -16,7 +16,6 @@ 
 #include <linux/bpf-cgroup.h>
 #include <net/sock.h>
 #include <net/bpf_sk_storage.h>
-#include <uapi/linux/tcp.h> /* sizeof(struct tcp_zerocopy_receive) */
 
 #include "../cgroup/cgroup-internal.h"
 
@@ -1299,7 +1298,8 @@  static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
 	return empty;
 }
 
-static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
+static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
+			     struct bpf_sockopt_buf *buf)
 {
 	if (unlikely(max_optlen < 0))
 		return -EINVAL;
@@ -1311,18 +1311,11 @@  static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
 		max_optlen = PAGE_SIZE;
 	}
 
-	if (max_optlen <= sizeof(ctx->buf)) {
+	if (max_optlen <= sizeof(buf->data)) {
 		/* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
 		 * bytes avoid the cost of kzalloc.
-		 *
-		 * In order to remove extra allocations from the TCP
-		 * fast zero-copy path ensure that buffer covers
-		 * the size of struct tcp_zerocopy_receive.
 		 */
-		BUILD_BUG_ON(sizeof(struct tcp_zerocopy_receive) >
-			     BPF_SOCKOPT_KERN_BUF_SIZE);
-
-		ctx->optval = ctx->buf;
+		ctx->optval = buf->data;
 		ctx->optval_end = ctx->optval + max_optlen;
 		return max_optlen;
 	}
@@ -1336,16 +1329,18 @@  static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
 	return max_optlen;
 }
 
-static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
+static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
+			     struct bpf_sockopt_buf *buf)
 {
-	if (ctx->optval == ctx->buf)
+	if (ctx->optval == buf->data)
 		return;
 	kfree(ctx->optval);
 }
 
-static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx)
+static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
+				  struct bpf_sockopt_buf *buf)
 {
-	return ctx->optval != ctx->buf;
+	return ctx->optval != buf->data;
 }
 
 int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
@@ -1353,6 +1348,7 @@  int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 				       int *optlen, char **kernel_optval)
 {
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	struct bpf_sockopt_buf buf = {};
 	struct bpf_sockopt_kern ctx = {
 		.sk = sk,
 		.level = *level,
@@ -1373,7 +1369,7 @@  int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 	 */
 	max_optlen = max_t(int, 16, *optlen);
 
-	max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
+	max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
 	if (max_optlen < 0)
 		return max_optlen;
 
@@ -1419,7 +1415,7 @@  int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 			 * No way to export on-stack buf, have to allocate a
 			 * new buffer.
 			 */
-			if (!sockopt_buf_allocated(&ctx)) {
+			if (!sockopt_buf_allocated(&ctx, &buf)) {
 				void *p = kzalloc(ctx.optlen, GFP_USER);
 
 				if (!p) {
@@ -1436,7 +1432,7 @@  int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 
 out:
 	if (ret)
-		sockopt_free_buf(&ctx);
+		sockopt_free_buf(&ctx, &buf);
 	return ret;
 }
 
@@ -1445,15 +1441,20 @@  int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 				       int __user *optlen, int max_optlen,
 				       int retval)
 {
-	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-	struct bpf_sockopt_kern ctx = {
-		.sk = sk,
-		.level = level,
-		.optname = optname,
-		.retval = retval,
-	};
+	struct bpf_sockopt_kern ctx;
+	struct bpf_sockopt_buf buf;
+	struct cgroup *cgrp;
 	int ret;
 
+	memset(&buf, 0, sizeof(buf));
+	memset(&ctx, 0, sizeof(ctx));
+
+	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	ctx.sk = sk;
+	ctx.level = level;
+	ctx.optname = optname;
+	ctx.retval = retval;
+
 	/* Opportunistic check to see whether we have any BPF program
 	 * attached to the hook so we don't waste time allocating
 	 * memory and locking the socket.
@@ -1463,7 +1464,7 @@  int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 
 	ctx.optlen = max_optlen;
 
-	max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
+	max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
 	if (max_optlen < 0)
 		return max_optlen;
 
@@ -1521,9 +1522,47 @@  int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 	ret = ctx.retval;
 
 out:
-	sockopt_free_buf(&ctx);
+	sockopt_free_buf(&ctx, &buf);
 	return ret;
 }
+
+int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
+					    int optname, void *optval,
+					    int *optlen, int retval)
+{
+	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	struct bpf_sockopt_kern ctx = {
+		.sk = sk,
+		.level = level,
+		.optname = optname,
+		.retval = retval,
+		.optlen = *optlen,
+		.optval = optval,
+		.optval_end = optval + *optlen,
+	};
+	int ret;
+
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
+				 &ctx, BPF_PROG_RUN);
+	if (!ret)
+		return -EPERM;
+
+	if (ctx.optlen > *optlen)
+		return -EFAULT;
+
+	/* BPF programs only allowed to set retval to 0, not some
+	 * arbitrary value.
+	 */
+	if (ctx.retval != 0 && ctx.retval != retval)
+		return -EFAULT;
+
+	/* BPF programs can shrink the buffer, export the modifications.
+	 */
+	if (ctx.optlen != 0)
+		*optlen = ctx.optlen;
+
+	return ctx.retval;
+}
 #endif
 
 static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ed42d2193c5c..ef3c895b66c1 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4098,6 +4098,8 @@  static int do_tcp_getsockopt(struct sock *sk, int level,
 			return -EFAULT;
 		lock_sock(sk);
 		err = tcp_zerocopy_receive(sk, &zc);
+		err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
+							  &zc, &len, err);
 		release_sock(sk);
 		if (len >= offsetofend(struct tcp_zerocopy_receive, err))
 			goto zerocopy_rcv_sk_err;
@@ -4132,6 +4134,18 @@  static int do_tcp_getsockopt(struct sock *sk, int level,
 	return 0;
 }
 
+bool tcp_bpf_bypass_getsockopt(int level, int optname)
+{
+	/* TCP do_tcp_getsockopt has optimized getsockopt implementation
+	 * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE.
+	 */
+	if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE)
+		return true;
+
+	return false;
+}
+EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt);
+
 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		   int __user *optlen)
 {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 58207c7769d0..8b4906980fce 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2792,6 +2792,7 @@  struct proto tcp_prot = {
 	.shutdown		= tcp_shutdown,
 	.setsockopt		= tcp_setsockopt,
 	.getsockopt		= tcp_getsockopt,
+	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
 	.keepalive		= tcp_set_keepalive,
 	.recvmsg		= tcp_recvmsg,
 	.sendmsg		= tcp_sendmsg,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index e254569a3005..6624eccff85b 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2121,6 +2121,7 @@  struct proto tcpv6_prot = {
 	.shutdown		= tcp_shutdown,
 	.setsockopt		= tcp_setsockopt,
 	.getsockopt		= tcp_getsockopt,
+	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
 	.keepalive		= tcp_set_keepalive,
 	.recvmsg		= tcp_recvmsg,
 	.sendmsg		= tcp_sendmsg,
diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c
index b25c9c45c148..6bb18b1d8578 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c
@@ -11,6 +11,7 @@  static int getsetsockopt(void)
 		char u8[4];
 		__u32 u32;
 		char cc[16]; /* TCP_CA_NAME_MAX */
+		struct tcp_zerocopy_receive zc;
 	} buf = {};
 	socklen_t optlen;
 	char *big_buf = NULL;
@@ -154,6 +155,27 @@  static int getsetsockopt(void)
 		goto err;
 	}
 
+	/* TCP_ZEROCOPY_RECEIVE triggers */
+	memset(&buf, 0, sizeof(buf));
+	optlen = sizeof(buf.zc);
+	err = getsockopt(fd, SOL_TCP, TCP_ZEROCOPY_RECEIVE, &buf, &optlen);
+	if (err) {
+		log_err("Unexpected getsockopt(TCP_ZEROCOPY_RECEIVE) err=%d errno=%d",
+			err, errno);
+		goto err;
+	}
+
+	memset(&buf, 0, sizeof(buf));
+	buf.zc.address = 12345; /* rejected by BPF */
+	optlen = sizeof(buf.zc);
+	errno = 0;
+	err = getsockopt(fd, SOL_TCP, TCP_ZEROCOPY_RECEIVE, &buf, &optlen);
+	if (errno != EPERM) {
+		log_err("Unexpected getsockopt(TCP_ZEROCOPY_RECEIVE) err=%d errno=%d",
+			err, errno);
+		goto err;
+	}
+
 	free(big_buf);
 	close(fd);
 	return 0;
diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c b/tools/testing/selftests/bpf/progs/sockopt_sk.c
index 712df7b49cb1..c726f0763a13 100644
--- a/tools/testing/selftests/bpf/progs/sockopt_sk.c
+++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c
@@ -57,6 +57,21 @@  int _getsockopt(struct bpf_sockopt *ctx)
 		return 1;
 	}
 
+	if (ctx->level == SOL_TCP && ctx->optname == TCP_ZEROCOPY_RECEIVE) {
+		/* Verify that TCP_ZEROCOPY_RECEIVE triggers.
+		 * It has a custom implementation for performance
+		 * reasons.
+		 */
+
+		if (optval + sizeof(struct tcp_zerocopy_receive) > optval_end)
+			return 0; /* EPERM, bounds check */
+
+		if (((struct tcp_zerocopy_receive *)optval)->address != 0)
+			return 0; /* EPERM, unexpected data */
+
+		return 1;
+	}
+
 	if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) {
 		if (optval + 1 > optval_end)
 			return 0; /* EPERM, bounds check */