Message ID | 20201118001742.85005-1-sdf@google.com |
---|---|
Headers | show |
Series | bpf: expose bpf_{s,g}etsockopt helpers to bind{4,6} hooks | expand |
On Tue, Nov 17, 2020 at 4:17 PM Stanislav Fomichev <sdf@google.com> wrote: > > I have to now lock/unlock socket for the bind hook execution. > That shouldn't cause any overhead because the socket is unbound > and shouldn't receive any traffic. > > Signed-off-by: Stanislav Fomichev <sdf@google.com> > --- > include/linux/bpf-cgroup.h | 12 ++++++------ > net/core/filter.c | 4 ++++ > net/ipv4/af_inet.c | 2 +- > net/ipv6/af_inet6.c | 2 +- > 4 files changed, 12 insertions(+), 8 deletions(-) > > diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h > index ed71bd1a0825..72e69a0e1e8c 100644 > --- a/include/linux/bpf-cgroup.h > +++ b/include/linux/bpf-cgroup.h > @@ -246,11 +246,11 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, > __ret; \ > }) > > -#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) \ > - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND) > +#define BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr) \ > + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_BIND, NULL) > > -#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) \ > - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND) > +#define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) \ > + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_BIND, NULL) > > #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (cgroup_bpf_enabled && \ > sk->sk_prot->pre_connect) > @@ -434,8 +434,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, > #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) > #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) > #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; }) > -#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) > -#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) > +#define BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr) ({ 0; }) > +#define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) ({ 0; }) > #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) > #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; }) > #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) > diff --git a/net/core/filter.c b/net/core/filter.c > index 2ca5eecebacf..21d91dcf0260 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -6995,6 +6995,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) > return &bpf_sk_storage_delete_proto; > case BPF_FUNC_setsockopt: > switch (prog->expected_attach_type) { > + case BPF_CGROUP_INET4_BIND: > + case BPF_CGROUP_INET6_BIND: > case BPF_CGROUP_INET4_CONNECT: > case BPF_CGROUP_INET6_CONNECT: > return &bpf_sock_addr_setsockopt_proto; > @@ -7003,6 +7005,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) > } > case BPF_FUNC_getsockopt: > switch (prog->expected_attach_type) { > + case BPF_CGROUP_INET4_BIND: > + case BPF_CGROUP_INET6_BIND: > case BPF_CGROUP_INET4_CONNECT: > case BPF_CGROUP_INET6_CONNECT: > return &bpf_sock_addr_getsockopt_proto; > diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c > index b7260c8cef2e..b94fa8eb831b 100644 > --- a/net/ipv4/af_inet.c > +++ b/net/ipv4/af_inet.c > @@ -450,7 +450,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) > /* BPF prog is run before any checks are done so that if the prog > * changes context in a wrong way it will be caught. > */ > - err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr); > + err = BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr); I think it is ok, but I need to go through the locking paths more. Andrey, please take a look as well. > if (err) > return err; > > diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c > index e648fbebb167..a7e3d170af51 100644 > --- a/net/ipv6/af_inet6.c > +++ b/net/ipv6/af_inet6.c > @@ -451,7 +451,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) > /* BPF prog is run before any checks are done so that if the prog > * changes context in a wrong way it will be caught. > */ > - err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr); > + err = BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr); > if (err) > return err; > > -- > 2.29.2.299.gdc1121823c-goog >
Alexei Starovoitov <alexei.starovoitov@gmail.com> [Tue, 2020-11-17 20:05 -0800]: > On Tue, Nov 17, 2020 at 4:17 PM Stanislav Fomichev <sdf@google.com> wrote: > > > > I have to now lock/unlock socket for the bind hook execution. > > That shouldn't cause any overhead because the socket is unbound > > and shouldn't receive any traffic. > > > > Signed-off-by: Stanislav Fomichev <sdf@google.com> > > --- > > include/linux/bpf-cgroup.h | 12 ++++++------ > > net/core/filter.c | 4 ++++ > > net/ipv4/af_inet.c | 2 +- > > net/ipv6/af_inet6.c | 2 +- > > 4 files changed, 12 insertions(+), 8 deletions(-) > > > > diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h > > index ed71bd1a0825..72e69a0e1e8c 100644 > > --- a/include/linux/bpf-cgroup.h > > +++ b/include/linux/bpf-cgroup.h > > @@ -246,11 +246,11 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, > > __ret; \ > > }) > > > > -#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) \ > > - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND) > > +#define BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr) \ > > + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_BIND, NULL) > > > > -#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) \ > > - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND) > > +#define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) \ > > + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_BIND, NULL) > > > > #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (cgroup_bpf_enabled && \ > > sk->sk_prot->pre_connect) > > @@ -434,8 +434,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, > > #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) > > #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) > > #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; }) > > -#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) > > -#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) > > +#define BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr) ({ 0; }) > > +#define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) ({ 0; }) > > #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) > > #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; }) > > #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) > > diff --git a/net/core/filter.c b/net/core/filter.c > > index 2ca5eecebacf..21d91dcf0260 100644 > > --- a/net/core/filter.c > > +++ b/net/core/filter.c > > @@ -6995,6 +6995,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) > > return &bpf_sk_storage_delete_proto; > > case BPF_FUNC_setsockopt: > > switch (prog->expected_attach_type) { > > + case BPF_CGROUP_INET4_BIND: > > + case BPF_CGROUP_INET6_BIND: > > case BPF_CGROUP_INET4_CONNECT: > > case BPF_CGROUP_INET6_CONNECT: > > return &bpf_sock_addr_setsockopt_proto; > > @@ -7003,6 +7005,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) > > } > > case BPF_FUNC_getsockopt: > > switch (prog->expected_attach_type) { > > + case BPF_CGROUP_INET4_BIND: > > + case BPF_CGROUP_INET6_BIND: > > case BPF_CGROUP_INET4_CONNECT: > > case BPF_CGROUP_INET6_CONNECT: > > return &bpf_sock_addr_getsockopt_proto; > > diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c > > index b7260c8cef2e..b94fa8eb831b 100644 > > --- a/net/ipv4/af_inet.c > > +++ b/net/ipv4/af_inet.c > > @@ -450,7 +450,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) > > /* BPF prog is run before any checks are done so that if the prog > > * changes context in a wrong way it will be caught. > > */ > > - err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr); > > + err = BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr); > > I think it is ok, but I need to go through the locking paths more. > Andrey, > please take a look as well. Sorry for delay, I was offline for the last two weeks. From the correctness perspective it looks fine to me. From the performance perspective I can think of one relevant scenario. Quite common use-case in applications is to use bind(2) not before listen(2) but before connect(2) for client sockets so that connection can be set up from specific source IP and, optionally, port. Binding to both IP and port case is not interesting since it's already slow due to get_port(). But some applications do care about connection setup performance and at the same time need to set source IP only (no port). In this case they use IP_BIND_ADDRESS_NO_PORT socket option, what makes bind(2) fast (we've discussed it with Stanislav earlier in [0]). I can imagine some pathological case when an application sets up tons of connections with bind(2) before connect(2) for sockets with IP_BIND_ADDRESS_NO_PORT enabled (that by itself requires setsockopt(2) though, i.e. socket lock/unlock) and that another lock/unlock to run bind hook may add some overhead. Though I do not know how critical that overhead may be and whether it's worth to benchmark or not (maybe too much paranoia). [0] https://lore.kernel.org/bpf/20200505182010.GB55644@rdna-mbp/ > > if (err) > > return err; > > > > diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c > > index e648fbebb167..a7e3d170af51 100644 > > --- a/net/ipv6/af_inet6.c > > +++ b/net/ipv6/af_inet6.c > > @@ -451,7 +451,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) > > /* BPF prog is run before any checks are done so that if the prog > > * changes context in a wrong way it will be caught. > > */ > > - err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr); > > + err = BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr); > > if (err) > > return err; > > > > -- > > 2.29.2.299.gdc1121823c-goog > > -- Andrey Ignatov
On 11/29, Andrey Ignatov wrote: > Alexei Starovoitov <alexei.starovoitov@gmail.com> [Tue, 2020-11-17 20:05 > -0800]: > > On Tue, Nov 17, 2020 at 4:17 PM Stanislav Fomichev <sdf@google.com> > wrote: [..] > > > > I think it is ok, but I need to go through the locking paths more. > > Andrey, > > please take a look as well. > Sorry for delay, I was offline for the last two weeks. No worries, I was OOO myself last week, thanks for the feedback! > From the correctness perspective it looks fine to me. > From the performance perspective I can think of one relevant scenario. > Quite common use-case in applications is to use bind(2) not before > listen(2) but before connect(2) for client sockets so that connection > can be set up from specific source IP and, optionally, port. > Binding to both IP and port case is not interesting since it's already > slow due to get_port(). > But some applications do care about connection setup performance and at > the same time need to set source IP only (no port). In this case they > use IP_BIND_ADDRESS_NO_PORT socket option, what makes bind(2) fast > (we've discussed it with Stanislav earlier in [0]). > I can imagine some pathological case when an application sets up tons of > connections with bind(2) before connect(2) for sockets with > IP_BIND_ADDRESS_NO_PORT enabled (that by itself requires setsockopt(2) > though, i.e. socket lock/unlock) and that another lock/unlock to run > bind hook may add some overhead. Though I do not know how critical that > overhead may be and whether it's worth to benchmark or not (maybe too > much paranoia). > [0] https://lore.kernel.org/bpf/20200505182010.GB55644@rdna-mbp/ Even in case of IP_BIND_ADDRESS_NO_PORT, inet[6]_bind() does lock_sock down the line, so it's not like we are switching a lockless path to the one with the lock, right? And in this case, similar to listen, the socket is still uncontended and owned by the userspace. So that extra lock/unlock should be cheap enough to be ignored (spin_lock_bh on the warm cache line). Am I missing something?
sdf@google.com <sdf@google.com> [Mon, 2020-11-30 08:38 -0800]: > On 11/29, Andrey Ignatov wrote: > > Alexei Starovoitov <alexei.starovoitov@gmail.com> [Tue, 2020-11-17 20:05 > > -0800]: > > > On Tue, Nov 17, 2020 at 4:17 PM Stanislav Fomichev <sdf@google.com> > > wrote: > [..] > > > > > > I think it is ok, but I need to go through the locking paths more. > > > Andrey, > > > please take a look as well. > > > Sorry for delay, I was offline for the last two weeks. > No worries, I was OOO myself last week, thanks for the feedback! > > > From the correctness perspective it looks fine to me. > > > From the performance perspective I can think of one relevant scenario. > > Quite common use-case in applications is to use bind(2) not before > > listen(2) but before connect(2) for client sockets so that connection > > can be set up from specific source IP and, optionally, port. > > > Binding to both IP and port case is not interesting since it's already > > slow due to get_port(). > > > But some applications do care about connection setup performance and at > > the same time need to set source IP only (no port). In this case they > > use IP_BIND_ADDRESS_NO_PORT socket option, what makes bind(2) fast > > (we've discussed it with Stanislav earlier in [0]). > > > I can imagine some pathological case when an application sets up tons of > > connections with bind(2) before connect(2) for sockets with > > IP_BIND_ADDRESS_NO_PORT enabled (that by itself requires setsockopt(2) > > though, i.e. socket lock/unlock) and that another lock/unlock to run > > bind hook may add some overhead. Though I do not know how critical that > > overhead may be and whether it's worth to benchmark or not (maybe too > > much paranoia). > > > [0] https://lore.kernel.org/bpf/20200505182010.GB55644@rdna-mbp/ > Even in case of IP_BIND_ADDRESS_NO_PORT, inet[6]_bind() does > lock_sock down the line, so it's not like we are switching > a lockless path to the one with the lock, right? Right, I understand that it's going from one lock/unlock to two (not from zero to one), that's what I meant by "another". My point was about this one more lock. > And in this case, similar to listen, the socket is still uncontended and > owned by the userspace. So that extra lock/unlock should be cheap > enough to be ignored (spin_lock_bh on the warm cache line). > > Am I missing something? As I mentioned it may come up only in "pathological case" what is probably fine to ignore, i.e. I'd rather agree with "cheap enough to be ignored" and benchmark would likely confirm it, I just couldn't say that for sure w/o numbers so brought this point. Given that we both agree that it should be fine to ignore this +1 lock, IMO it should be good to go unless someone else has objections. -- Andrey Ignatov
On 11/30, Andrey Ignatov wrote: > sdf@google.com <sdf@google.com> [Mon, 2020-11-30 08:38 -0800]: > > On 11/29, Andrey Ignatov wrote: > > > Alexei Starovoitov <alexei.starovoitov@gmail.com> [Tue, 2020-11-17 > 20:05 > > > -0800]: > > > > On Tue, Nov 17, 2020 at 4:17 PM Stanislav Fomichev <sdf@google.com> > > > wrote: > > [..] > > > > > > > > I think it is ok, but I need to go through the locking paths more. > > > > Andrey, > > > > please take a look as well. > > > > > Sorry for delay, I was offline for the last two weeks. > > No worries, I was OOO myself last week, thanks for the feedback! > > > > > From the correctness perspective it looks fine to me. > > > > > From the performance perspective I can think of one relevant > scenario. > > > Quite common use-case in applications is to use bind(2) not before > > > listen(2) but before connect(2) for client sockets so that connection > > > can be set up from specific source IP and, optionally, port. > > > > > Binding to both IP and port case is not interesting since it's already > > > slow due to get_port(). > > > > > But some applications do care about connection setup performance and > at > > > the same time need to set source IP only (no port). In this case they > > > use IP_BIND_ADDRESS_NO_PORT socket option, what makes bind(2) fast > > > (we've discussed it with Stanislav earlier in [0]). > > > > > I can imagine some pathological case when an application sets up tons > of > > > connections with bind(2) before connect(2) for sockets with > > > IP_BIND_ADDRESS_NO_PORT enabled (that by itself requires setsockopt(2) > > > though, i.e. socket lock/unlock) and that another lock/unlock to run > > > bind hook may add some overhead. Though I do not know how critical > that > > > overhead may be and whether it's worth to benchmark or not (maybe too > > > much paranoia). > > > > > [0] https://lore.kernel.org/bpf/20200505182010.GB55644@rdna-mbp/ > > Even in case of IP_BIND_ADDRESS_NO_PORT, inet[6]_bind() does > > lock_sock down the line, so it's not like we are switching > > a lockless path to the one with the lock, right? > Right, I understand that it's going from one lock/unlock to two (not > from zero to one), that's what I meant by "another". My point was about > this one more lock. > > And in this case, similar to listen, the socket is still uncontended and > > owned by the userspace. So that extra lock/unlock should be cheap > > enough to be ignored (spin_lock_bh on the warm cache line). > > > > Am I missing something? > As I mentioned it may come up only in "pathological case" what is > probably fine to ignore, i.e. I'd rather agree with "cheap enough to be > ignored" and benchmark would likely confirm it, I just couldn't say that > for sure w/o numbers so brought this point. > Given that we both agree that it should be fine to ignore this +1 lock, > IMO it should be good to go unless someone else has objections. Thanks, agreed. Do you mind giving it an acked-by so it gets some attention in the patchwork? ;-)
Stanislav Fomichev <sdf@google.com> [Tue, 2020-11-17 16:18 -0800]: > I have to now lock/unlock socket for the bind hook execution. > That shouldn't cause any overhead because the socket is unbound > and shouldn't receive any traffic. > > Signed-off-by: Stanislav Fomichev <sdf@google.com> Acked-by: Andrey Ignatov <rdna@fb.com> > --- > include/linux/bpf-cgroup.h | 12 ++++++------ > net/core/filter.c | 4 ++++ > net/ipv4/af_inet.c | 2 +- > net/ipv6/af_inet6.c | 2 +- > 4 files changed, 12 insertions(+), 8 deletions(-) > > diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h > index ed71bd1a0825..72e69a0e1e8c 100644 > --- a/include/linux/bpf-cgroup.h > +++ b/include/linux/bpf-cgroup.h > @@ -246,11 +246,11 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, > __ret; \ > }) > > -#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) \ > - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND) > +#define BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr) \ > + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_BIND, NULL) > > -#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) \ > - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND) > +#define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) \ > + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_BIND, NULL) > > #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (cgroup_bpf_enabled && \ > sk->sk_prot->pre_connect) > @@ -434,8 +434,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, > #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) > #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) > #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; }) > -#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) > -#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) > +#define BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr) ({ 0; }) > +#define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) ({ 0; }) > #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) > #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; }) > #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) > diff --git a/net/core/filter.c b/net/core/filter.c > index 2ca5eecebacf..21d91dcf0260 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -6995,6 +6995,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) > return &bpf_sk_storage_delete_proto; > case BPF_FUNC_setsockopt: > switch (prog->expected_attach_type) { > + case BPF_CGROUP_INET4_BIND: > + case BPF_CGROUP_INET6_BIND: > case BPF_CGROUP_INET4_CONNECT: > case BPF_CGROUP_INET6_CONNECT: > return &bpf_sock_addr_setsockopt_proto; > @@ -7003,6 +7005,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) > } > case BPF_FUNC_getsockopt: > switch (prog->expected_attach_type) { > + case BPF_CGROUP_INET4_BIND: > + case BPF_CGROUP_INET6_BIND: > case BPF_CGROUP_INET4_CONNECT: > case BPF_CGROUP_INET6_CONNECT: > return &bpf_sock_addr_getsockopt_proto; > diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c > index b7260c8cef2e..b94fa8eb831b 100644 > --- a/net/ipv4/af_inet.c > +++ b/net/ipv4/af_inet.c > @@ -450,7 +450,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) > /* BPF prog is run before any checks are done so that if the prog > * changes context in a wrong way it will be caught. > */ > - err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr); > + err = BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr); > if (err) > return err; > > diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c > index e648fbebb167..a7e3d170af51 100644 > --- a/net/ipv6/af_inet6.c > +++ b/net/ipv6/af_inet6.c > @@ -451,7 +451,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) > /* BPF prog is run before any checks are done so that if the prog > * changes context in a wrong way it will be caught. > */ > - err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr); > + err = BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr); > if (err) > return err; > > -- > 2.29.2.299.gdc1121823c-goog > -- Andrey Ignatov
sdf@google.com <sdf@google.com> [Tue, 2020-12-01 10:43 -0800]: > On 11/30, Andrey Ignatov wrote: > > sdf@google.com <sdf@google.com> [Mon, 2020-11-30 08:38 -0800]: > > > On 11/29, Andrey Ignatov wrote: > > > > Alexei Starovoitov <alexei.starovoitov@gmail.com> [Tue, 2020-11-17 > > 20:05 > > > > -0800]: > > > > > On Tue, Nov 17, 2020 at 4:17 PM Stanislav Fomichev <sdf@google.com> > > > > wrote: > > > [..] > > > > > > > > > > I think it is ok, but I need to go through the locking paths more. > > > > > Andrey, > > > > > please take a look as well. > > > > > > > Sorry for delay, I was offline for the last two weeks. > > > No worries, I was OOO myself last week, thanks for the feedback! > > > > > > > From the correctness perspective it looks fine to me. > > > > > > > From the performance perspective I can think of one relevant > > scenario. > > > > Quite common use-case in applications is to use bind(2) not before > > > > listen(2) but before connect(2) for client sockets so that connection > > > > can be set up from specific source IP and, optionally, port. > > > > > > > Binding to both IP and port case is not interesting since it's already > > > > slow due to get_port(). > > > > > > > But some applications do care about connection setup performance and > > at > > > > the same time need to set source IP only (no port). In this case they > > > > use IP_BIND_ADDRESS_NO_PORT socket option, what makes bind(2) fast > > > > (we've discussed it with Stanislav earlier in [0]). > > > > > > > I can imagine some pathological case when an application sets up > > tons of > > > > connections with bind(2) before connect(2) for sockets with > > > > IP_BIND_ADDRESS_NO_PORT enabled (that by itself requires setsockopt(2) > > > > though, i.e. socket lock/unlock) and that another lock/unlock to run > > > > bind hook may add some overhead. Though I do not know how critical > > that > > > > overhead may be and whether it's worth to benchmark or not (maybe too > > > > much paranoia). > > > > > > > [0] https://lore.kernel.org/bpf/20200505182010.GB55644@rdna-mbp/ > > > Even in case of IP_BIND_ADDRESS_NO_PORT, inet[6]_bind() does > > > lock_sock down the line, so it's not like we are switching > > > a lockless path to the one with the lock, right? > > > Right, I understand that it's going from one lock/unlock to two (not > > from zero to one), that's what I meant by "another". My point was about > > this one more lock. > > > > And in this case, similar to listen, the socket is still uncontended and > > > owned by the userspace. So that extra lock/unlock should be cheap > > > enough to be ignored (spin_lock_bh on the warm cache line). > > > > > > Am I missing something? > > > As I mentioned it may come up only in "pathological case" what is > > probably fine to ignore, i.e. I'd rather agree with "cheap enough to be > > ignored" and benchmark would likely confirm it, I just couldn't say that > > for sure w/o numbers so brought this point. > > > Given that we both agree that it should be fine to ignore this +1 lock, > > IMO it should be good to go unless someone else has objections. > Thanks, agreed. Do you mind giving it an acked-by so it gets some > attention in the patchwork? ;-) Sure. Acked this one. -- Andrey Ignatov
On Tue, Nov 17, 2020 at 4:20 PM Stanislav Fomichev <sdf@google.com> wrote: > > I'm planning to extend it in the next patches. It's much easier to > work with C than BPF assembly. > > Signed-off-by: Stanislav Fomichev <sdf@google.com> > --- With nits below: Acked-by: Andrii Nakryiko <andrii@kernel.org> > .../testing/selftests/bpf/progs/bind4_prog.c | 73 +++++++ > .../testing/selftests/bpf/progs/bind6_prog.c | 90 ++++++++ > tools/testing/selftests/bpf/test_sock_addr.c | 196 ++---------------- > 3 files changed, 175 insertions(+), 184 deletions(-) > create mode 100644 tools/testing/selftests/bpf/progs/bind4_prog.c > create mode 100644 tools/testing/selftests/bpf/progs/bind6_prog.c > > diff --git a/tools/testing/selftests/bpf/progs/bind4_prog.c b/tools/testing/selftests/bpf/progs/bind4_prog.c > new file mode 100644 > index 000000000000..ff3def2ee6f9 > --- /dev/null > +++ b/tools/testing/selftests/bpf/progs/bind4_prog.c > @@ -0,0 +1,73 @@ > +// SPDX-License-Identifier: GPL-2.0 > + > +#include <string.h> > + > +#include <linux/stddef.h> > +#include <linux/bpf.h> > +#include <linux/in.h> > +#include <linux/in6.h> > +#include <sys/socket.h> > +#include <netinet/tcp.h> > +#include <linux/if.h> > +#include <errno.h> > + > +#include <bpf/bpf_helpers.h> > +#include <bpf/bpf_endian.h> > + > +#define SERV4_IP 0xc0a801feU /* 192.168.1.254 */ > +#define SERV4_PORT 4040 > +#define SERV4_REWRITE_IP 0x7f000001U /* 127.0.0.1 */ > +#define SERV4_REWRITE_PORT 4444 > + > +int _version SEC("version") = 1; not needed, let's not add it to a new test prog > + [...] > diff --git a/tools/testing/selftests/bpf/progs/bind6_prog.c b/tools/testing/selftests/bpf/progs/bind6_prog.c > new file mode 100644 > index 000000000000..97686baaae65 > --- /dev/null > +++ b/tools/testing/selftests/bpf/progs/bind6_prog.c > @@ -0,0 +1,90 @@ > +// SPDX-License-Identifier: GPL-2.0 > + > +#include <string.h> > + > +#include <linux/stddef.h> > +#include <linux/bpf.h> > +#include <linux/in.h> > +#include <linux/in6.h> > +#include <sys/socket.h> > +#include <netinet/tcp.h> > +#include <linux/if.h> > +#include <errno.h> > + > +#include <bpf/bpf_helpers.h> > +#include <bpf/bpf_endian.h> > + > +#define SERV6_IP_0 0xfaceb00c /* face:b00c:1234:5678::abcd */ > +#define SERV6_IP_1 0x12345678 > +#define SERV6_IP_2 0x00000000 > +#define SERV6_IP_3 0x0000abcd > +#define SERV6_PORT 6060 > +#define SERV6_REWRITE_IP_0 0x00000000 > +#define SERV6_REWRITE_IP_1 0x00000000 > +#define SERV6_REWRITE_IP_2 0x00000000 > +#define SERV6_REWRITE_IP_3 0x00000001 > +#define SERV6_REWRITE_PORT 6666 > + > +int _version SEC("version") = 1; same > + > +SEC("cgroup/bind6") > +int bind_v6_prog(struct bpf_sock_addr *ctx) > +{ [...]
On 12/01, Andrii Nakryiko wrote: > On Tue, Nov 17, 2020 at 4:20 PM Stanislav Fomichev <sdf@google.com> wrote: > > > > I'm planning to extend it in the next patches. It's much easier to > > work with C than BPF assembly. > > > > Signed-off-by: Stanislav Fomichev <sdf@google.com> > > --- > With nits below: > Acked-by: Andrii Nakryiko <andrii@kernel.org> Thank you for the review! Will respin shortly with the nits addressed.