Message ID: 20201201144418.35045-4-kuniyu@amazon.co.jp
State: New
Series: Socket migration for SO_REUSEPORT.
On Tue, Dec 01, 2020 at 11:44:10PM +0900, Kuniyuki Iwashima wrote: [ ... ] > diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c > index fd133516ac0e..60d7c1f28809 100644 > --- a/net/core/sock_reuseport.c > +++ b/net/core/sock_reuseport.c > @@ -216,9 +216,11 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) > } > EXPORT_SYMBOL(reuseport_add_sock); > > -void reuseport_detach_sock(struct sock *sk) > +struct sock *reuseport_detach_sock(struct sock *sk) > { > struct sock_reuseport *reuse; > + struct bpf_prog *prog; > + struct sock *nsk = NULL; > int i; > > spin_lock_bh(&reuseport_lock); > @@ -242,8 +244,12 @@ void reuseport_detach_sock(struct sock *sk) > > reuse->num_socks--; > reuse->socks[i] = reuse->socks[reuse->num_socks]; > + prog = rcu_dereference(reuse->prog); Is it under rcu_read_lock() here? > > if (sk->sk_protocol == IPPROTO_TCP) { > + if (reuse->num_socks && !prog) > + nsk = i == reuse->num_socks ? reuse->socks[i - 1] : reuse->socks[i]; > + > reuse->num_closed_socks++; > reuse->socks[reuse->max_socks - reuse->num_closed_socks] = sk; > } else { > @@ -264,6 +270,8 @@ void reuseport_detach_sock(struct sock *sk) > call_rcu(&reuse->rcu, reuseport_free_rcu); > out: > spin_unlock_bh(&reuseport_lock); > + > + return nsk; > } > EXPORT_SYMBOL(reuseport_detach_sock); > > diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c > index 1451aa9712b0..b27241ea96bd 100644 > --- a/net/ipv4/inet_connection_sock.c > +++ b/net/ipv4/inet_connection_sock.c > @@ -992,6 +992,36 @@ struct sock *inet_csk_reqsk_queue_add(struct sock *sk, > } > EXPORT_SYMBOL(inet_csk_reqsk_queue_add); > > +void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk) > +{ > + struct request_sock_queue *old_accept_queue, *new_accept_queue; > + > + old_accept_queue = &inet_csk(sk)->icsk_accept_queue; > + new_accept_queue = &inet_csk(nsk)->icsk_accept_queue; > + > + spin_lock(&old_accept_queue->rskq_lock); > + spin_lock(&new_accept_queue->rskq_lock); I am also not very thrilled on this double spin_lock. Can this be done in (or like) inet_csk_listen_stop() instead? 
> + > + if (old_accept_queue->rskq_accept_head) { > + if (new_accept_queue->rskq_accept_head) > + old_accept_queue->rskq_accept_tail->dl_next = > + new_accept_queue->rskq_accept_head; > + else > + new_accept_queue->rskq_accept_tail = old_accept_queue->rskq_accept_tail; > + > + new_accept_queue->rskq_accept_head = old_accept_queue->rskq_accept_head; > + old_accept_queue->rskq_accept_head = NULL; > + old_accept_queue->rskq_accept_tail = NULL; > + > + WRITE_ONCE(nsk->sk_ack_backlog, nsk->sk_ack_backlog + sk->sk_ack_backlog); > + WRITE_ONCE(sk->sk_ack_backlog, 0); > + } > + > + spin_unlock(&new_accept_queue->rskq_lock); > + spin_unlock(&old_accept_queue->rskq_lock); > +} > +EXPORT_SYMBOL(inet_csk_reqsk_queue_migrate); > + > struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, > struct request_sock *req, bool own_req) > { > diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c > index 45fb450b4522..545538a6bfac 100644 > --- a/net/ipv4/inet_hashtables.c > +++ b/net/ipv4/inet_hashtables.c > @@ -681,6 +681,7 @@ void inet_unhash(struct sock *sk) > { > struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; > struct inet_listen_hashbucket *ilb = NULL; > + struct sock *nsk; > spinlock_t *lock; > > if (sk_unhashed(sk)) > @@ -696,8 +697,12 @@ void inet_unhash(struct sock *sk) > if (sk_unhashed(sk)) > goto unlock; > > - if (rcu_access_pointer(sk->sk_reuseport_cb)) > - reuseport_detach_sock(sk); > + if (rcu_access_pointer(sk->sk_reuseport_cb)) { > + nsk = reuseport_detach_sock(sk); > + if (nsk) > + inet_csk_reqsk_queue_migrate(sk, nsk); > + } > + > if (ilb) { > inet_unhash2(hashinfo, sk); > ilb->count--; > -- > 2.17.2 (Apple Git-113) >
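On the double spin_lock question above, a minimal sketch, purely as an illustration and not part of the posted patch: one conventional way to take two locks of the same class without risking an ABBA deadlock (two listeners migrating toward each other) is to order the acquisitions by address and annotate the inner lock for lockdep. The helper name is made up.

---8<---
#include <linux/lockdep.h>
#include <linux/spinlock.h>
#include <net/request_sock.h>

/* Illustration only: acquire two rskq_locks in a fixed (address) order
 * so that concurrent migrations in opposite directions cannot deadlock.
 */
static void lock_two_accept_queues(struct request_sock_queue *q1,
				   struct request_sock_queue *q2)
{
	if (q1 < q2) {
		spin_lock(&q1->rskq_lock);
		spin_lock_nested(&q2->rskq_lock, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock(&q2->rskq_lock);
		spin_lock_nested(&q1->rskq_lock, SINGLE_DEPTH_NESTING);
	}
}
---8<---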
I'm sending this mail just for logging because I failed to send mails only to LKML, netdev, and bpf yesterday. From: Martin KaFai Lau <kafai@fb.com> Date: Fri, 4 Dec 2020 17:42:41 -0800 > On Tue, Dec 01, 2020 at 11:44:10PM +0900, Kuniyuki Iwashima wrote: > [ ... ] > > diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c > > index fd133516ac0e..60d7c1f28809 100644 > > --- a/net/core/sock_reuseport.c > > +++ b/net/core/sock_reuseport.c > > @@ -216,9 +216,11 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) > > } > > EXPORT_SYMBOL(reuseport_add_sock); > > > > -void reuseport_detach_sock(struct sock *sk) > > +struct sock *reuseport_detach_sock(struct sock *sk) > > { > > struct sock_reuseport *reuse; > > + struct bpf_prog *prog; > > + struct sock *nsk = NULL; > > int i; > > > > spin_lock_bh(&reuseport_lock); > > @@ -242,8 +244,12 @@ void reuseport_detach_sock(struct sock *sk) > > > > reuse->num_socks--; > > reuse->socks[i] = reuse->socks[reuse->num_socks]; > > + prog = rcu_dereference(reuse->prog); > Is it under rcu_read_lock() here? reuseport_lock is locked in this function, and we do not modify the prog, but is rcu_dereference_protected() preferable? ---8<--- prog = rcu_dereference_protected(reuse->prog, lockdep_is_held(&reuseport_lock)); ---8<--- > > if (sk->sk_protocol == IPPROTO_TCP) { > > + if (reuse->num_socks && !prog) > > + nsk = i == reuse->num_socks ? reuse->socks[i - 1] : reuse->socks[i]; > > + > > reuse->num_closed_socks++; > > reuse->socks[reuse->max_socks - reuse->num_closed_socks] = sk; > > } else { > > @@ -264,6 +270,8 @@ void reuseport_detach_sock(struct sock *sk) > > call_rcu(&reuse->rcu, reuseport_free_rcu); > > out: > > spin_unlock_bh(&reuseport_lock); > > + > > + return nsk; > > } > > EXPORT_SYMBOL(reuseport_detach_sock); > > > > diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c > > index 1451aa9712b0..b27241ea96bd 100644 > > --- a/net/ipv4/inet_connection_sock.c > > +++ b/net/ipv4/inet_connection_sock.c > > @@ -992,6 +992,36 @@ struct sock *inet_csk_reqsk_queue_add(struct sock *sk, > > } > > EXPORT_SYMBOL(inet_csk_reqsk_queue_add); > > > > +void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk) > > +{ > > + struct request_sock_queue *old_accept_queue, *new_accept_queue; > > + > > + old_accept_queue = &inet_csk(sk)->icsk_accept_queue; > > + new_accept_queue = &inet_csk(nsk)->icsk_accept_queue; > > + > > + spin_lock(&old_accept_queue->rskq_lock); > > + spin_lock(&new_accept_queue->rskq_lock); > I am also not very thrilled on this double spin_lock. > Can this be done in (or like) inet_csk_listen_stop() instead? It will be possible to migrate sockets in inet_csk_listen_stop(), but I think it is better to do it just after reuseport_detach_sock() becuase we can select a different listener (almost) every time at a lower cost by selecting the moved socket and pass it to inet_csk_reqsk_queue_migrate() easily. sk_hash of the listener is 0, so we would have to generate a random number in inet_csk_listen_stop().
On Sun, Dec 06, 2020 at 01:03:07AM +0900, Kuniyuki Iwashima wrote: > From: Martin KaFai Lau <kafai@fb.com> > Date: Fri, 4 Dec 2020 17:42:41 -0800 > > On Tue, Dec 01, 2020 at 11:44:10PM +0900, Kuniyuki Iwashima wrote: > > [ ... ] > > > diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c > > > index fd133516ac0e..60d7c1f28809 100644 > > > --- a/net/core/sock_reuseport.c > > > +++ b/net/core/sock_reuseport.c > > > @@ -216,9 +216,11 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) > > > } > > > EXPORT_SYMBOL(reuseport_add_sock); > > > > > > -void reuseport_detach_sock(struct sock *sk) > > > +struct sock *reuseport_detach_sock(struct sock *sk) > > > { > > > struct sock_reuseport *reuse; > > > + struct bpf_prog *prog; > > > + struct sock *nsk = NULL; > > > int i; > > > > > > spin_lock_bh(&reuseport_lock); > > > @@ -242,8 +244,12 @@ void reuseport_detach_sock(struct sock *sk) > > > > > > reuse->num_socks--; > > > reuse->socks[i] = reuse->socks[reuse->num_socks]; > > > + prog = rcu_dereference(reuse->prog); > > Is it under rcu_read_lock() here? > > reuseport_lock is locked in this function, and we do not modify the prog, > but is rcu_dereference_protected() preferable? > > ---8<--- > prog = rcu_dereference_protected(reuse->prog, > lockdep_is_held(&reuseport_lock)); > ---8<--- It is not only reuse->prog. Other things also require rcu_read_lock(), e.g. please take a look at __htab_map_lookup_elem(). The TCP_LISTEN sk (selected by bpf to be the target of the migration) is also protected by rcu. I am surprised there is no WARNING in the test. Do you have the needed DEBUG_LOCK* config enabled? > > > if (sk->sk_protocol == IPPROTO_TCP) { > > > + if (reuse->num_socks && !prog) > > > + nsk = i == reuse->num_socks ? reuse->socks[i - 1] : reuse->socks[i]; > > > + > > > reuse->num_closed_socks++; > > > reuse->socks[reuse->max_socks - reuse->num_closed_socks] = sk; > > > } else { > > > @@ -264,6 +270,8 @@ void reuseport_detach_sock(struct sock *sk) > > > call_rcu(&reuse->rcu, reuseport_free_rcu); > > > out: > > > spin_unlock_bh(&reuseport_lock); > > > + > > > + return nsk; > > > } > > > EXPORT_SYMBOL(reuseport_detach_sock); > > > > > > diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c > > > index 1451aa9712b0..b27241ea96bd 100644 > > > --- a/net/ipv4/inet_connection_sock.c > > > +++ b/net/ipv4/inet_connection_sock.c > > > @@ -992,6 +992,36 @@ struct sock *inet_csk_reqsk_queue_add(struct sock *sk, > > > } > > > EXPORT_SYMBOL(inet_csk_reqsk_queue_add); > > > > > > +void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk) > > > +{ > > > + struct request_sock_queue *old_accept_queue, *new_accept_queue; > > > + > > > + old_accept_queue = &inet_csk(sk)->icsk_accept_queue; > > > + new_accept_queue = &inet_csk(nsk)->icsk_accept_queue; > > > + > > > + spin_lock(&old_accept_queue->rskq_lock); > > > + spin_lock(&new_accept_queue->rskq_lock); > > I am also not very thrilled on this double spin_lock. > > Can this be done in (or like) inet_csk_listen_stop() instead? > > It will be possible to migrate sockets in inet_csk_listen_stop(), but I > think it is better to do it just after reuseport_detach_sock() becuase we > can select a different listener (almost) every time at a lower cost by > selecting the moved socket and pass it to inet_csk_reqsk_queue_migrate() > easily. I don't see the "lower cost" point. Please elaborate. > > sk_hash of the listener is 0, so we would have to generate a random number > in inet_csk_listen_stop(). 
If I read it correctly, it is also passing 0 as the sk_hash to bpf_run_sk_reuseport() from reuseport_detach_sock(). Also, how is the sk_hash expected to be used? I don't see it in the test.
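A minimal sketch of the RCU usage being requested here, assuming the selection stays in the detach path. The helper name is made up, and the refcount grab is an assumption about how the chosen listener would be kept alive once the read-side section ends; this is not the posted patch or the eventual v3.

---8<---
#include <linux/rcupdate.h>
#include <net/sock.h>
#include <net/sock_reuseport.h>

/* Sketch: read reuse->prog and the candidate listener inside
 * rcu_read_lock(), and pin the listener with a refcount before
 * leaving the read-side critical section.
 */
static struct sock *pick_migration_target(struct sock_reuseport *reuse, int i)
{
	struct bpf_prog *prog;
	struct sock *nsk = NULL;

	rcu_read_lock();
	prog = rcu_dereference(reuse->prog);
	if (reuse->num_socks && !prog)
		nsk = i == reuse->num_socks ? reuse->socks[i - 1]
					    : reuse->socks[i];
	if (nsk && unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt)))
		nsk = NULL;
	rcu_read_unlock();

	return nsk;
}
---8<---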
From: Martin KaFai Lau <kafai@fb.com> Date: Mon, 7 Dec 2020 12:14:38 -0800 > On Sun, Dec 06, 2020 at 01:03:07AM +0900, Kuniyuki Iwashima wrote: > > From: Martin KaFai Lau <kafai@fb.com> > > Date: Fri, 4 Dec 2020 17:42:41 -0800 > > > On Tue, Dec 01, 2020 at 11:44:10PM +0900, Kuniyuki Iwashima wrote: > > > [ ... ] > > > > diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c > > > > index fd133516ac0e..60d7c1f28809 100644 > > > > --- a/net/core/sock_reuseport.c > > > > +++ b/net/core/sock_reuseport.c > > > > @@ -216,9 +216,11 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) > > > > } > > > > EXPORT_SYMBOL(reuseport_add_sock); > > > > > > > > -void reuseport_detach_sock(struct sock *sk) > > > > +struct sock *reuseport_detach_sock(struct sock *sk) > > > > { > > > > struct sock_reuseport *reuse; > > > > + struct bpf_prog *prog; > > > > + struct sock *nsk = NULL; > > > > int i; > > > > > > > > spin_lock_bh(&reuseport_lock); > > > > @@ -242,8 +244,12 @@ void reuseport_detach_sock(struct sock *sk) > > > > > > > > reuse->num_socks--; > > > > reuse->socks[i] = reuse->socks[reuse->num_socks]; > > > > + prog = rcu_dereference(reuse->prog); > > > Is it under rcu_read_lock() here? > > > > reuseport_lock is locked in this function, and we do not modify the prog, > > but is rcu_dereference_protected() preferable? > > > > ---8<--- > > prog = rcu_dereference_protected(reuse->prog, > > lockdep_is_held(&reuseport_lock)); > > ---8<--- > It is not only reuse->prog. Other things also require rcu_read_lock(), > e.g. please take a look at __htab_map_lookup_elem(). > > The TCP_LISTEN sk (selected by bpf to be the target of the migration) > is also protected by rcu. Thank you, I will use rcu_read_lock() and rcu_dereference() in v3 patchset. > I am surprised there is no WARNING in the test. > Do you have the needed DEBUG_LOCK* config enabled? Yes, DEBUG_LOCK* was 'y', but rcu_dereference() without rcu_read_lock() does not show warnings... > > > > if (sk->sk_protocol == IPPROTO_TCP) { > > > > + if (reuse->num_socks && !prog) > > > > + nsk = i == reuse->num_socks ? reuse->socks[i - 1] : reuse->socks[i]; > > > > + > > > > reuse->num_closed_socks++; > > > > reuse->socks[reuse->max_socks - reuse->num_closed_socks] = sk; > > > > } else { > > > > @@ -264,6 +270,8 @@ void reuseport_detach_sock(struct sock *sk) > > > > call_rcu(&reuse->rcu, reuseport_free_rcu); > > > > out: > > > > spin_unlock_bh(&reuseport_lock); > > > > + > > > > + return nsk; > > > > } > > > > EXPORT_SYMBOL(reuseport_detach_sock); > > > > > > > > diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c > > > > index 1451aa9712b0..b27241ea96bd 100644 > > > > --- a/net/ipv4/inet_connection_sock.c > > > > +++ b/net/ipv4/inet_connection_sock.c > > > > @@ -992,6 +992,36 @@ struct sock *inet_csk_reqsk_queue_add(struct sock *sk, > > > > } > > > > EXPORT_SYMBOL(inet_csk_reqsk_queue_add); > > > > > > > > +void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk) > > > > +{ > > > > + struct request_sock_queue *old_accept_queue, *new_accept_queue; > > > > + > > > > + old_accept_queue = &inet_csk(sk)->icsk_accept_queue; > > > > + new_accept_queue = &inet_csk(nsk)->icsk_accept_queue; > > > > + > > > > + spin_lock(&old_accept_queue->rskq_lock); > > > > + spin_lock(&new_accept_queue->rskq_lock); > > > I am also not very thrilled on this double spin_lock. > > > Can this be done in (or like) inet_csk_listen_stop() instead? 
> > > > It will be possible to migrate sockets in inet_csk_listen_stop(), but I > > think it is better to do it just after reuseport_detach_sock() becuase we > > can select a different listener (almost) every time at a lower cost by > > selecting the moved socket and pass it to inet_csk_reqsk_queue_migrate() > > easily. > I don't see the "lower cost" point. Please elaborate. In reuseport_select_sock(), we pass sk_hash of the request socket to reciprocal_scale() and generate a random index for socks[] to select a different listener every time. On the other hand, we do not have request sockets in unhash path and sk_hash of the listener is always 0, so we have to generate a random number in another way. In reuseport_detach_sock(), we can use the index of the moved socket, but we do not have it in inet_csk_listen_stop(), so we have to generate a random number in inet_csk_listen_stop(). I think it is at lower cost to use the index of the moved socket. > > sk_hash of the listener is 0, so we would have to generate a random number > > in inet_csk_listen_stop(). > If I read it correctly, it is also passing 0 as the sk_hash to > bpf_run_sk_reuseport() from reuseport_detach_sock(). > > Also, how is the sk_hash expected to be used? I don't see > it in the test. I expected it should not be used in unhash path. We do not have the request socket in unhash path and cannot pass a proper sk_hash to bpf_run_sk_reuseport(). So, if u8 migration is BPF_SK_REUSEPORT_MIGRATE_QUEUE, we cannot use sk_hash.
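A minimal sketch of the two selection strategies contrasted above. reciprocal_scale() is the helper reuseport_select_sock() already uses for the hash-based pick; the wrapper names here are made up, and callers are assumed to have checked that num_socks is non-zero.

---8<---
#include <linux/kernel.h>		/* reciprocal_scale() */
#include <net/sock_reuseport.h>

/* Request path: a per-connection hash is available, so the pick can be
 * randomized the same way reuseport_select_sock() does it. */
static struct sock *pick_by_hash(struct sock_reuseport *reuse, u32 hash)
{
	return reuse->socks[reciprocal_scale(hash, reuse->num_socks)];
}

/* Unhash path in this patch: there is no request socket and the
 * listener's sk_hash is 0, so the slot index of the socket being
 * detached stands in for the random input. */
static struct sock *pick_by_slot(struct sock_reuseport *reuse, int i)
{
	return i == reuse->num_socks ? reuse->socks[i - 1] : reuse->socks[i];
}
---8<---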
On Tue, Dec 01, 2020 at 11:44:10PM +0900, Kuniyuki Iwashima wrote: > @@ -242,8 +244,12 @@ void reuseport_detach_sock(struct sock *sk) > > reuse->num_socks--; > reuse->socks[i] = reuse->socks[reuse->num_socks]; > + prog = rcu_dereference(reuse->prog); > > if (sk->sk_protocol == IPPROTO_TCP) { > + if (reuse->num_socks && !prog) > + nsk = i == reuse->num_socks ? reuse->socks[i - 1] : reuse->socks[i]; I asked in the earlier thread if the primary use case is to only use the bpf prog to pick. That thread did not come to a solid answer but did conclude that the sysctl should not control the behavior of the BPF_SK_REUSEPORT_SELECT_OR_MIGRATE prog. From this change here, it seems it is still desired to only depend on the kernel to random pick even when no bpf prog is attached. If that is the case, a sysctl to guard here for not changing the current behavior makes sense. It should still only control the non-bpf-pick behavior: when the sysctl is on, the kernel will still do a random pick when there is no bpf prog attached to the reuseport group. Thoughts? > + > reuse->num_closed_socks++; > reuse->socks[reuse->max_socks - reuse->num_closed_socks] = sk; > } else { > @@ -264,6 +270,8 @@ void reuseport_detach_sock(struct sock *sk) > call_rcu(&reuse->rcu, reuseport_free_rcu); > out: > spin_unlock_bh(&reuseport_lock); > + > + return nsk; > } > EXPORT_SYMBOL(reuseport_detach_sock);
From: Martin KaFai Lau <kafai@fb.com> Date: Mon, 7 Dec 2020 22:54:18 -0800 > On Tue, Dec 01, 2020 at 11:44:10PM +0900, Kuniyuki Iwashima wrote: > > > @@ -242,8 +244,12 @@ void reuseport_detach_sock(struct sock *sk) > > > > reuse->num_socks--; > > reuse->socks[i] = reuse->socks[reuse->num_socks]; > > + prog = rcu_dereference(reuse->prog); > > > > if (sk->sk_protocol == IPPROTO_TCP) { > > + if (reuse->num_socks && !prog) > > + nsk = i == reuse->num_socks ? reuse->socks[i - 1] : reuse->socks[i]; > I asked in the earlier thread if the primary use case is to only > use the bpf prog to pick. That thread did not come to > a solid answer but did conclude that the sysctl should not > control the behavior of the BPF_SK_REUSEPORT_SELECT_OR_MIGRATE prog. > > From this change here, it seems it is still desired to only depend > on the kernel to random pick even when no bpf prog is attached. I wrote this way only to split patches into tcp and bpf parts. So, in the 10th patch, eBPF prog is run if the type is BPF_SK_REUSEPORT_SELECT_OR_MIGRATE. https://lore.kernel.org/netdev/20201201144418.35045-11-kuniyu@amazon.co.jp/ But, it makes a breakage, so I will move BPF_SK_REUSEPORT_SELECT_OR_MIGRATE validation into 10th patch so that the type is only available after 10th patch. ---8<--- case BPF_PROG_TYPE_SK_REUSEPORT: switch (expected_attach_type) { case BPF_SK_REUSEPORT_SELECT: case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE: <- move to 10th. return 0; default: return -EINVAL; } ---8<--- > If that is the case, a sysctl to guard here for not changing > the current behavior makes sense. > It should still only control the non-bpf-pick behavior: > when the sysctl is on, the kernel will still do a random pick > when there is no bpf prog attached to the reuseport group. > Thoughts? If different applications listen on the same port without eBPF prog, I think sysctl is necessary. But honestly, I am not sure there is really such a case and sysctl is necessary. If patcheset with sysctl is more acceptable, I will add it back in the next spin. > > + > > reuse->num_closed_socks++; > > reuse->socks[reuse->max_socks - reuse->num_closed_socks] = sk; > > } else { > > @@ -264,6 +270,8 @@ void reuseport_detach_sock(struct sock *sk) > > call_rcu(&reuse->rcu, reuseport_free_rcu); > > out: > > spin_unlock_bh(&reuseport_lock); > > + > > + return nsk; > > } > > EXPORT_SYMBOL(reuseport_detach_sock);
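A minimal sketch of the guard being discussed, assuming a per-netns knob. The field name sysctl_tcp_migrate_req is hypothetical and does not exist in this series; the point is only that the knob gates the kernel pick and never the bpf pick.

---8<---
#include <net/netns/ipv4.h>
#include <net/sock.h>

/* Sketch: the kernel may pick a migration target only when no bpf prog
 * is attached and the (hypothetical) sysctl is enabled. */
static bool reuseport_kernel_pick_allowed(const struct sock *sk,
					  const struct bpf_prog *prog)
{
	if (prog)	/* bpf policy decides; the sysctl does not apply */
		return false;
	return READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req);
}
---8<---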
On Tue, Dec 08, 2020 at 03:27:14PM +0900, Kuniyuki Iwashima wrote: > From: Martin KaFai Lau <kafai@fb.com> > Date: Mon, 7 Dec 2020 12:14:38 -0800 > > On Sun, Dec 06, 2020 at 01:03:07AM +0900, Kuniyuki Iwashima wrote: > > > From: Martin KaFai Lau <kafai@fb.com> > > > Date: Fri, 4 Dec 2020 17:42:41 -0800 > > > > On Tue, Dec 01, 2020 at 11:44:10PM +0900, Kuniyuki Iwashima wrote: > > > > [ ... ] > > > > > diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c > > > > > index fd133516ac0e..60d7c1f28809 100644 > > > > > --- a/net/core/sock_reuseport.c > > > > > +++ b/net/core/sock_reuseport.c > > > > > @@ -216,9 +216,11 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) > > > > > } > > > > > EXPORT_SYMBOL(reuseport_add_sock); > > > > > > > > > > -void reuseport_detach_sock(struct sock *sk) > > > > > +struct sock *reuseport_detach_sock(struct sock *sk) > > > > > { > > > > > struct sock_reuseport *reuse; > > > > > + struct bpf_prog *prog; > > > > > + struct sock *nsk = NULL; > > > > > int i; > > > > > > > > > > spin_lock_bh(&reuseport_lock); > > > > > @@ -242,8 +244,12 @@ void reuseport_detach_sock(struct sock *sk) > > > > > > > > > > reuse->num_socks--; > > > > > reuse->socks[i] = reuse->socks[reuse->num_socks]; > > > > > + prog = rcu_dereference(reuse->prog); > > > > Is it under rcu_read_lock() here? > > > > > > reuseport_lock is locked in this function, and we do not modify the prog, > > > but is rcu_dereference_protected() preferable? > > > > > > ---8<--- > > > prog = rcu_dereference_protected(reuse->prog, > > > lockdep_is_held(&reuseport_lock)); > > > ---8<--- > > It is not only reuse->prog. Other things also require rcu_read_lock(), > > e.g. please take a look at __htab_map_lookup_elem(). > > > > The TCP_LISTEN sk (selected by bpf to be the target of the migration) > > is also protected by rcu. > > Thank you, I will use rcu_read_lock() and rcu_dereference() in v3 patchset. > > > > I am surprised there is no WARNING in the test. > > Do you have the needed DEBUG_LOCK* config enabled? > > Yes, DEBUG_LOCK* was 'y', but rcu_dereference() without rcu_read_lock() > does not show warnings... I would at least expect the "WARN_ON_ONCE(!rcu_read_lock_held() ...)" from __htab_map_lookup_elem() should fire in your test example in the last patch. It is better to check the config before sending v3. [ ... ] > > > > > diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c > > > > > index 1451aa9712b0..b27241ea96bd 100644 > > > > > --- a/net/ipv4/inet_connection_sock.c > > > > > +++ b/net/ipv4/inet_connection_sock.c > > > > > @@ -992,6 +992,36 @@ struct sock *inet_csk_reqsk_queue_add(struct sock *sk, > > > > > } > > > > > EXPORT_SYMBOL(inet_csk_reqsk_queue_add); > > > > > > > > > > +void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk) > > > > > +{ > > > > > + struct request_sock_queue *old_accept_queue, *new_accept_queue; > > > > > + > > > > > + old_accept_queue = &inet_csk(sk)->icsk_accept_queue; > > > > > + new_accept_queue = &inet_csk(nsk)->icsk_accept_queue; > > > > > + > > > > > + spin_lock(&old_accept_queue->rskq_lock); > > > > > + spin_lock(&new_accept_queue->rskq_lock); > > > > I am also not very thrilled on this double spin_lock. > > > > Can this be done in (or like) inet_csk_listen_stop() instead? 
> > > > > > It will be possible to migrate sockets in inet_csk_listen_stop(), but I > > > think it is better to do it just after reuseport_detach_sock() becuase we > > > can select a different listener (almost) every time at a lower cost by > > > selecting the moved socket and pass it to inet_csk_reqsk_queue_migrate() > > > easily. > > I don't see the "lower cost" point. Please elaborate. > > In reuseport_select_sock(), we pass sk_hash of the request socket to > reciprocal_scale() and generate a random index for socks[] to select > a different listener every time. > On the other hand, we do not have request sockets in unhash path and > sk_hash of the listener is always 0, so we have to generate a random number > in another way. In reuseport_detach_sock(), we can use the index of the > moved socket, but we do not have it in inet_csk_listen_stop(), so we have > to generate a random number in inet_csk_listen_stop(). > I think it is at lower cost to use the index of the moved socket. Generate a random number is not a big deal for the migration code path. Also, I really still failed to see a particular way that the kernel pick will help in the migration case. The kernel has no clue on how to select the right process to migrate to without a proper policy signal from the user. They are all as bad as a random pick. I am not sure this migration feature is even useful if there is no bpf prog attached to define the policy. That said, if it is still desired to do a random pick by kernel when there is no bpf prog, it probably makes sense to guard it in a sysctl as suggested in another reply. To keep it simple, I would also keep this kernel-pick consistent instead of request socket is doing something different from the unhash path. > > > > > sk_hash of the listener is 0, so we would have to generate a random number > > > in inet_csk_listen_stop(). > > If I read it correctly, it is also passing 0 as the sk_hash to > > bpf_run_sk_reuseport() from reuseport_detach_sock(). > > > > Also, how is the sk_hash expected to be used? I don't see > > it in the test. > > I expected it should not be used in unhash path. > We do not have the request socket in unhash path and cannot pass a proper > sk_hash to bpf_run_sk_reuseport(). So, if u8 migration is > BPF_SK_REUSEPORT_MIGRATE_QUEUE, we cannot use sk_hash.
From: Martin KaFai Lau <kafai@fb.com> Date: Tue, 8 Dec 2020 00:13:28 -0800 > On Tue, Dec 08, 2020 at 03:27:14PM +0900, Kuniyuki Iwashima wrote: > > From: Martin KaFai Lau <kafai@fb.com> > > Date: Mon, 7 Dec 2020 12:14:38 -0800 > > > On Sun, Dec 06, 2020 at 01:03:07AM +0900, Kuniyuki Iwashima wrote: > > > > From: Martin KaFai Lau <kafai@fb.com> > > > > Date: Fri, 4 Dec 2020 17:42:41 -0800 > > > > > On Tue, Dec 01, 2020 at 11:44:10PM +0900, Kuniyuki Iwashima wrote: > > > > > [ ... ] > > > > > > diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c > > > > > > index fd133516ac0e..60d7c1f28809 100644 > > > > > > --- a/net/core/sock_reuseport.c > > > > > > +++ b/net/core/sock_reuseport.c > > > > > > @@ -216,9 +216,11 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) > > > > > > } > > > > > > EXPORT_SYMBOL(reuseport_add_sock); > > > > > > > > > > > > -void reuseport_detach_sock(struct sock *sk) > > > > > > +struct sock *reuseport_detach_sock(struct sock *sk) > > > > > > { > > > > > > struct sock_reuseport *reuse; > > > > > > + struct bpf_prog *prog; > > > > > > + struct sock *nsk = NULL; > > > > > > int i; > > > > > > > > > > > > spin_lock_bh(&reuseport_lock); > > > > > > @@ -242,8 +244,12 @@ void reuseport_detach_sock(struct sock *sk) > > > > > > > > > > > > reuse->num_socks--; > > > > > > reuse->socks[i] = reuse->socks[reuse->num_socks]; > > > > > > + prog = rcu_dereference(reuse->prog); > > > > > Is it under rcu_read_lock() here? > > > > > > > > reuseport_lock is locked in this function, and we do not modify the prog, > > > > but is rcu_dereference_protected() preferable? > > > > > > > > ---8<--- > > > > prog = rcu_dereference_protected(reuse->prog, > > > > lockdep_is_held(&reuseport_lock)); > > > > ---8<--- > > > It is not only reuse->prog. Other things also require rcu_read_lock(), > > > e.g. please take a look at __htab_map_lookup_elem(). > > > > > > The TCP_LISTEN sk (selected by bpf to be the target of the migration) > > > is also protected by rcu. > > > > Thank you, I will use rcu_read_lock() and rcu_dereference() in v3 patchset. > > > > > > > I am surprised there is no WARNING in the test. > > > Do you have the needed DEBUG_LOCK* config enabled? > > > > Yes, DEBUG_LOCK* was 'y', but rcu_dereference() without rcu_read_lock() > > does not show warnings... > I would at least expect the "WARN_ON_ONCE(!rcu_read_lock_held() ...)" > from __htab_map_lookup_elem() should fire in your test > example in the last patch. > > It is better to check the config before sending v3. It seems ok, but I will check it again. 
---8<--- [ec2-user@ip-10-0-0-124 bpf-next]$ cat .config | grep DEBUG_LOCK CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_DEBUG_LOCKDEP=y CONFIG_DEBUG_LOCKING_API_SELFTESTS=y ---8<--- > > > > > > diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c > > > > > > index 1451aa9712b0..b27241ea96bd 100644 > > > > > > --- a/net/ipv4/inet_connection_sock.c > > > > > > +++ b/net/ipv4/inet_connection_sock.c > > > > > > @@ -992,6 +992,36 @@ struct sock *inet_csk_reqsk_queue_add(struct sock *sk, > > > > > > } > > > > > > EXPORT_SYMBOL(inet_csk_reqsk_queue_add); > > > > > > > > > > > > +void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk) > > > > > > +{ > > > > > > + struct request_sock_queue *old_accept_queue, *new_accept_queue; > > > > > > + > > > > > > + old_accept_queue = &inet_csk(sk)->icsk_accept_queue; > > > > > > + new_accept_queue = &inet_csk(nsk)->icsk_accept_queue; > > > > > > + > > > > > > + spin_lock(&old_accept_queue->rskq_lock); > > > > > > + spin_lock(&new_accept_queue->rskq_lock); > > > > > I am also not very thrilled on this double spin_lock. > > > > > Can this be done in (or like) inet_csk_listen_stop() instead? > > > > > > > > It will be possible to migrate sockets in inet_csk_listen_stop(), but I > > > > think it is better to do it just after reuseport_detach_sock() becuase we > > > > can select a different listener (almost) every time at a lower cost by > > > > selecting the moved socket and pass it to inet_csk_reqsk_queue_migrate() > > > > easily. > > > I don't see the "lower cost" point. Please elaborate. > > > > In reuseport_select_sock(), we pass sk_hash of the request socket to > > reciprocal_scale() and generate a random index for socks[] to select > > a different listener every time. > > On the other hand, we do not have request sockets in unhash path and > > sk_hash of the listener is always 0, so we have to generate a random number > > in another way. In reuseport_detach_sock(), we can use the index of the > > moved socket, but we do not have it in inet_csk_listen_stop(), so we have > > to generate a random number in inet_csk_listen_stop(). > > I think it is at lower cost to use the index of the moved socket. > Generate a random number is not a big deal for the migration code path. > > Also, I really still failed to see a particular way that the kernel > pick will help in the migration case. The kernel has no clue > on how to select the right process to migrate to without > a proper policy signal from the user. They are all as bad as > a random pick. I am not sure this migration feature is > even useful if there is no bpf prog attached to define the policy. I think most applications start new listeners before closing listeners, in this case, selecting the moved socket as the new listener works well. > That said, if it is still desired to do a random pick by kernel when > there is no bpf prog, it probably makes sense to guard it in a sysctl as > suggested in another reply. To keep it simple, I would also keep this > kernel-pick consistent instead of request socket is doing something > different from the unhash path. Then, is this way better to keep kernel-pick consistent? 1. call reuseport_select_migrated_sock() without sk_hash from any path 2. generate a random number in reuseport_select_migrated_sock() 3. pass it to __reuseport_select_sock() only for select-by-hash (4. pass 0 as sk_hash to bpf_run_sk_reuseport not to use it) 5. do migration per queue in inet_csk_listen_stop() or per request in receive path. 
I understand it is beautiful to keep consistency, but I also think the kernel-pick with a heuristic performs better than a random pick.
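A minimal sketch of steps 1-3 of the plan above, assuming the randomness comes from prandom_u32() and feeds the same hash-based pick, so the request path and the unhash path stay consistent. The body is an assumption, not a final implementation, and callers are assumed to have checked that num_socks is non-zero.

---8<---
#include <linux/kernel.h>	/* reciprocal_scale() */
#include <linux/prandom.h>	/* prandom_u32() */
#include <net/sock_reuseport.h>

/* Sketch: when no request-socket hash exists, generate the randomness
 * in the kernel and reuse the hash-based selection. */
static struct sock *reuseport_select_migrated_sock(struct sock_reuseport *reuse)
{
	u32 hash = prandom_u32();

	return reuse->socks[reciprocal_scale(hash, reuse->num_socks)];
}
---8<---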
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 7338b3865a2a..2ea2d743f8fc 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -260,6 +260,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
 struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
				      struct request_sock *req,
				      struct sock *child);
+void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk);
 void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
				   unsigned long timeout);
 struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
index 0e558ca7afbf..09a1b1539d4c 100644
--- a/include/net/sock_reuseport.h
+++ b/include/net/sock_reuseport.h
@@ -31,7 +31,7 @@ struct sock_reuseport {
 extern int reuseport_alloc(struct sock *sk, bool bind_inany);
 extern int reuseport_add_sock(struct sock *sk, struct sock *sk2,
			      bool bind_inany);
-extern void reuseport_detach_sock(struct sock *sk);
+extern struct sock *reuseport_detach_sock(struct sock *sk);
 extern struct sock *reuseport_select_sock(struct sock *sk,
					  u32 hash,
					  struct sk_buff *skb,
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index fd133516ac0e..60d7c1f28809 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -216,9 +216,11 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
 }
 EXPORT_SYMBOL(reuseport_add_sock);
 
-void reuseport_detach_sock(struct sock *sk)
+struct sock *reuseport_detach_sock(struct sock *sk)
 {
	struct sock_reuseport *reuse;
+	struct bpf_prog *prog;
+	struct sock *nsk = NULL;
	int i;
 
	spin_lock_bh(&reuseport_lock);
@@ -242,8 +244,12 @@ void reuseport_detach_sock(struct sock *sk)
 
	reuse->num_socks--;
	reuse->socks[i] = reuse->socks[reuse->num_socks];
+	prog = rcu_dereference(reuse->prog);
 
	if (sk->sk_protocol == IPPROTO_TCP) {
+		if (reuse->num_socks && !prog)
+			nsk = i == reuse->num_socks ? reuse->socks[i - 1] : reuse->socks[i];
+
		reuse->num_closed_socks++;
		reuse->socks[reuse->max_socks - reuse->num_closed_socks] = sk;
	} else {
@@ -264,6 +270,8 @@ void reuseport_detach_sock(struct sock *sk)
		call_rcu(&reuse->rcu, reuseport_free_rcu);
 out:
	spin_unlock_bh(&reuseport_lock);
+
+	return nsk;
 }
 EXPORT_SYMBOL(reuseport_detach_sock);
 
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 1451aa9712b0..b27241ea96bd 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -992,6 +992,36 @@ struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
 }
 EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
 
+void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk)
+{
+	struct request_sock_queue *old_accept_queue, *new_accept_queue;
+
+	old_accept_queue = &inet_csk(sk)->icsk_accept_queue;
+	new_accept_queue = &inet_csk(nsk)->icsk_accept_queue;
+
+	spin_lock(&old_accept_queue->rskq_lock);
+	spin_lock(&new_accept_queue->rskq_lock);
+
+	if (old_accept_queue->rskq_accept_head) {
+		if (new_accept_queue->rskq_accept_head)
+			old_accept_queue->rskq_accept_tail->dl_next =
+				new_accept_queue->rskq_accept_head;
+		else
+			new_accept_queue->rskq_accept_tail = old_accept_queue->rskq_accept_tail;
+
+		new_accept_queue->rskq_accept_head = old_accept_queue->rskq_accept_head;
+		old_accept_queue->rskq_accept_head = NULL;
+		old_accept_queue->rskq_accept_tail = NULL;
+
+		WRITE_ONCE(nsk->sk_ack_backlog, nsk->sk_ack_backlog + sk->sk_ack_backlog);
+		WRITE_ONCE(sk->sk_ack_backlog, 0);
+	}
+
+	spin_unlock(&new_accept_queue->rskq_lock);
+	spin_unlock(&old_accept_queue->rskq_lock);
+}
+EXPORT_SYMBOL(inet_csk_reqsk_queue_migrate);
+
 struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
					 struct request_sock *req, bool own_req)
 {
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 45fb450b4522..545538a6bfac 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -681,6 +681,7 @@ void inet_unhash(struct sock *sk)
 {
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct inet_listen_hashbucket *ilb = NULL;
+	struct sock *nsk;
	spinlock_t *lock;
 
	if (sk_unhashed(sk))
@@ -696,8 +697,12 @@ void inet_unhash(struct sock *sk)
	if (sk_unhashed(sk))
		goto unlock;
 
-	if (rcu_access_pointer(sk->sk_reuseport_cb))
-		reuseport_detach_sock(sk);
+	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
+		nsk = reuseport_detach_sock(sk);
+		if (nsk)
+			inet_csk_reqsk_queue_migrate(sk, nsk);
+	}
+
	if (ilb) {
		inet_unhash2(hashinfo, sk);
		ilb->count--;