
[bpf-next,v2,1/2] bpf: try to avoid kzalloc in cgroup/{s,g}etsockopt

Message ID 20210104221454.2204239-2-sdf@google.com
State Superseded
Series [bpf-next,v2,1/2] bpf: try to avoid kzalloc in cgroup/{s,g}etsockopt

Commit Message

Stanislav Fomichev Jan. 4, 2021, 10:14 p.m. UTC
When we attach a bpf program to cgroup/getsockopt, every other getsockopt()
syscall starts incurring kzalloc/kfree cost. While, in general, that's
not an issue, sometimes it is, as in the case of TCP_ZEROCOPY_RECEIVE.
TCP_ZEROCOPY_RECEIVE (ab)uses the getsockopt system call to implement a
fastpath for incoming TCP, and we don't want extra allocations in that
path.
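
For reference, the fastpath in question is driven from userspace roughly as
in the sketch below (a hedged illustration, not code taken from tcp_mmap;
error handling omitted). With a cgroup/getsockopt program attached, every
one of these getsockopt() calls goes through sockopt_alloc_buf(), which is
where the kzalloc this patch avoids happens:

#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>	/* IPPROTO_TCP */
#include <linux/tcp.h>	/* TCP_ZEROCOPY_RECEIVE, struct tcp_zerocopy_receive */

/* Map up to 'len' bytes of the receive queue at 'addr' instead of
 * copying them out. */
static ssize_t zerocopy_receive(int fd, void *addr, unsigned int len)
{
	struct tcp_zerocopy_receive zc;
	socklen_t zc_len = sizeof(zc);

	memset(&zc, 0, sizeof(zc));
	zc.address = (__u64)(unsigned long)addr;
	zc.length = len;

	if (getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &zc_len))
		return -1;

	return zc.length;	/* number of bytes actually mapped */
}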

Let's add a small buffer on the stack and use it for small (the majority
of) {s,g}etsockopt values. I've started with 128 bytes to cover
the options we care about (TCP_ZEROCOPY_RECEIVE, which is 32 bytes
currently, with a planned extension to 64).

It seems natural to do the same for setsockopt, but it's a bit more
involved when the BPF program modifies the data (in which case we still
have to kmalloc a buffer to hand to the kernel). The assumption is that
the majority of setsockopt calls (which handle pure BPF options or just
apply policy) will see some benefit as well.
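
To make the policy case concrete, a cgroup/setsockopt program of the kind
this assumption is about could look like the following (a hypothetical
example in the spirit of the sockopt selftests, not part of this series;
SOL_CUSTOM is a made-up option level):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define SOL_IP		0
#define IP_FREEBIND	15
#define SOL_CUSTOM	0xdeadbeef	/* handled purely in BPF, never reaches the kernel */

SEC("cgroup/setsockopt")
int sockopt_policy(struct bpf_sockopt *ctx)
{
	/* "Pure BPF" option: consume it here and tell the kernel there is
	 * nothing left to handle. */
	if (ctx->level == SOL_CUSTOM) {
		ctx->optlen = 0;
		return 1;
	}

	/* Policy: refuse one particular option, let everything else
	 * through unmodified. */
	if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND)
		return 0;	/* caller gets -EPERM */

	return 1;
}

char _license[] SEC("license") = "GPL";

For the consumed and rejected options nothing has to be handed back to the
kernel, so the on-stack buffer is enough; for options that are passed
through to the kernel handler, the data still has to be copied into a
kmalloc'ed buffer (see sockopt_buf_allocated() below).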

Collected some performance numbers (on a 65k MTU localhost in a VM) using:
$ perf record -g -- ./tcp_mmap -s -z
$ ./tcp_mmap -H ::1 -z
$ ...
$ perf report --symbol-filter=__cgroup_bpf_run_filter_getsockopt

Without this patch:
     4.81%     0.07%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_>
            |
             --4.74%--__cgroup_bpf_run_filter_getsockopt
                       |
                       |--1.06%--__kmalloc
                       |
                       |--0.71%--lock_sock_nested
                       |
                       |--0.62%--__might_fault
                       |
                        --0.52%--release_sock

With the patch applied:
     3.29%     0.07%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt
            |
             --3.22%--__cgroup_bpf_run_filter_getsockopt
                       |
                       |--0.66%--lock_sock_nested
                       |
                       |--0.57%--__might_fault
                       |
                        --0.56%--release_sock

So it saves about 1% of the system call time. Unfortunately, we still get
2-3% of overhead due to another socket lock/unlock :-(

Signed-off-by: Stanislav Fomichev <sdf@google.com>
Cc: Martin KaFai Lau <kafai@fb.com>
---
 include/linux/filter.h |  3 +++
 kernel/bpf/cgroup.c    | 40 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 42 insertions(+), 1 deletion(-)

Comments

Stanislav Fomichev Jan. 5, 2021, 12:19 a.m. UTC | #1
On Mon, Jan 4, 2021 at 4:03 PM Martin KaFai Lau <kafai@fb.com> wrote:
>
> On Mon, Jan 04, 2021 at 02:14:53PM -0800, Stanislav Fomichev wrote:
> > When we attach a bpf program to cgroup/getsockopt any other getsockopt()
> > syscall starts incurring kzalloc/kfree cost. While, in general, it's
> > not an issue, sometimes it is, like in the case of TCP_ZEROCOPY_RECEIVE.
> > TCP_ZEROCOPY_RECEIVE (ab)uses getsockopt system call to implement
> > fastpath for incoming TCP, we don't want to have extra allocations in
> > there.
> >
> > Let add a small buffer on the stack and use it for small (majority)
> > {s,g}etsockopt values. I've started with 128 bytes to cover
> > the options we care about (TCP_ZEROCOPY_RECEIVE which is 32 bytes
> > currently, with some planned extension to 64).
> >
> > It seems natural to do the same for setsockopt, but it's a bit more
> > involved when the BPF program modifies the data (where we have to
> > kmalloc). The assumption is that for the majority of setsockopt
> > calls (which are doing pure BPF options or apply policy) this
> > will bring some benefit as well.
> >
> > Collected some performance numbers using (on a 65k MTU localhost in a VM):
> > $ perf record -g -- ./tcp_mmap -s -z
> > $ ./tcp_mmap -H ::1 -z
> > $ ...
> > $ perf report --symbol-filter=__cgroup_bpf_run_filter_getsockopt
> >
> > Without this patch:
> >      4.81%     0.07%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_>
> >             |
> >              --4.74%--__cgroup_bpf_run_filter_getsockopt
> >                        |
> >                        |--1.06%--__kmalloc
> >                        |
> >                        |--0.71%--lock_sock_nested
> >                        |
> >                        |--0.62%--__might_fault
> >                        |
> >                         --0.52%--release_sock
> >
> > With the patch applied:
> >      3.29%     0.07%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt
> >             |
> >              --3.22%--__cgroup_bpf_run_filter_getsockopt
> >                        |
> >                        |--0.66%--lock_sock_nested
> >                        |
> >                        |--0.57%--__might_fault
> >                        |
> >                         --0.56%--release_sock
> >
> > So it saves about 1% of the system call. Unfortunately, we still get
> > 2-3% of overhead due to another socket lock/unlock :-(
> That could be a future exercise to optimize the fast path sockopts. ;)
Yeah, I couldn't think of anything simple so far. The only idea I have
is to allow a custom implementation for tcp/udp (where we do lock_sock)
and then keep the existing BPF_CGROUP_RUN_PROG_{S,G}ETSOCKOPT
in net/socket.c as a fallback. Need to experiment more with it.

> > --- a/kernel/bpf/cgroup.c
> > +++ b/kernel/bpf/cgroup.c
> > @@ -16,6 +16,7 @@
> >  #include <linux/bpf-cgroup.h>
> >  #include <net/sock.h>
> >  #include <net/bpf_sk_storage.h>
> > +#include <net/tcp.h> /* sizeof(struct tcp_zerocopy_receive) */
> To be more specific, it should be <uapi/linux/tcp.h>.
Sure, let's do that. I went with net/tcp.h because
most of the code under net/* doesn't include uapi directly.

> >
> >  #include "../cgroup/cgroup-internal.h"
> >
> > @@ -1298,6 +1299,7 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
> >       return empty;
> >  }
> >
> > +
> Extra newline.
Oops, thanks, will fix.

> >  static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
> >  {
> >       if (unlikely(max_optlen < 0))
> > @@ -1310,6 +1312,18 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
> >               max_optlen = PAGE_SIZE;
> >       }
> >
> > +     if (max_optlen <= sizeof(ctx->buf)) {
> > +             /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
> > +              * bytes avoid the cost of kzalloc.
> > +              */
> If it needs to respin, it will be good to have a few words here on why
> it only BUILD_BUG checks for "struct tcp_zerocopy_receive".
Sounds good, will add. I'll wait a day to let others comment and will respin.

> > +             BUILD_BUG_ON(sizeof(struct tcp_zerocopy_receive) >
> > +                          BPF_SOCKOPT_KERN_BUF_SIZE);
> > +
> > +             ctx->optval = ctx->buf;
> > +             ctx->optval_end = ctx->optval + max_optlen;
> > +             return max_optlen;
> > +     }
> > +

Patch

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 29c27656165b..54a4225f36d8 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1281,6 +1281,8 @@  struct bpf_sysctl_kern {
 	u64 tmp_reg;
 };
 
+#define BPF_SOCKOPT_KERN_BUF_SIZE	64
+
 struct bpf_sockopt_kern {
 	struct sock	*sk;
 	u8		*optval;
@@ -1289,6 +1291,7 @@  struct bpf_sockopt_kern {
 	s32		optname;
 	s32		optlen;
 	s32		retval;
+	u8		buf[BPF_SOCKOPT_KERN_BUF_SIZE];
 };
 
 int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len);
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 6ec088a96302..e6a5c7aec1ec 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -16,6 +16,7 @@ 
 #include <linux/bpf-cgroup.h>
 #include <net/sock.h>
 #include <net/bpf_sk_storage.h>
+#include <net/tcp.h> /* sizeof(struct tcp_zerocopy_receive) */
 
 #include "../cgroup/cgroup-internal.h"
 
@@ -1298,6 +1299,7 @@  static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
 	return empty;
 }
 
+
 static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
 {
 	if (unlikely(max_optlen < 0))
@@ -1310,6 +1312,18 @@  static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
 		max_optlen = PAGE_SIZE;
 	}
 
+	if (max_optlen <= sizeof(ctx->buf)) {
+		/* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
+		 * bytes avoid the cost of kzalloc.
+		 */
+		BUILD_BUG_ON(sizeof(struct tcp_zerocopy_receive) >
+			     BPF_SOCKOPT_KERN_BUF_SIZE);
+
+		ctx->optval = ctx->buf;
+		ctx->optval_end = ctx->optval + max_optlen;
+		return max_optlen;
+	}
+
 	ctx->optval = kzalloc(max_optlen, GFP_USER);
 	if (!ctx->optval)
 		return -ENOMEM;
@@ -1321,9 +1335,16 @@  static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
 
 static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
 {
+	if (ctx->optval == ctx->buf)
+		return;
 	kfree(ctx->optval);
 }
 
+static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx)
+{
+	return ctx->optval != ctx->buf;
+}
+
 int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 				       int *optname, char __user *optval,
 				       int *optlen, char **kernel_optval)
@@ -1390,7 +1411,24 @@  int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 		 */
 		if (ctx.optlen != 0) {
 			*optlen = ctx.optlen;
-			*kernel_optval = ctx.optval;
+			/* We've used bpf_sockopt_kern->buf as an intermediary
+			 * storage, but the BPF program indicates that we need
+			 * to pass this data to the kernel setsockopt handler.
+			 * No way to export on-stack buf, have to allocate a
+			 * new buffer.
+			 */
+			if (!sockopt_buf_allocated(&ctx)) {
+				void *p = kzalloc(ctx.optlen, GFP_USER);
+
+				if (!p) {
+					ret = -ENOMEM;
+					goto out;
+				}
+				memcpy(p, ctx.optval, ctx.optlen);
+				*kernel_optval = p;
+			} else {
+				*kernel_optval = ctx.optval;
+			}
 		}
 	}