
[v1,bpf-next,05/11] tcp: Migrate TCP_NEW_SYN_RECV requests.

Message ID: 20201201144418.35045-6-kuniyu@amazon.co.jp
State: New
Series: Socket migration for SO_REUSEPORT.

Commit Message

Kuniyuki Iwashima Dec. 1, 2020, 2:44 p.m. UTC
This patch renames reuseport_select_sock() to __reuseport_select_sock() and
adds two wrapper functions of it that pass the migration type defined in the
previous commit:

  reuseport_select_sock          : BPF_SK_REUSEPORT_MIGRATE_NO
  reuseport_select_migrated_sock : BPF_SK_REUSEPORT_MIGRATE_REQUEST

As mentioned before, we have to select a new listener for TCP_NEW_SYN_RECV
requests when receiving the final ACK or sending a SYN+ACK. Therefore, this
patch also changes the code to call reuseport_select_migrated_sock() even
if the listening socket is TCP_CLOSE. If we can pick out a listening socket
from the reuseport group, we rewrite request_sock.rsk_listener and resume
processing the request.

Reviewed-by: Benjamin Herrenschmidt <benh@amazon.com>
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
---
 include/net/inet_connection_sock.h | 12 +++++++++++
 include/net/request_sock.h         | 13 ++++++++++++
 include/net/sock_reuseport.h       |  8 +++----
 net/core/sock_reuseport.c          | 34 ++++++++++++++++++++++++------
 net/ipv4/inet_connection_sock.c    | 13 ++++++++++--
 net/ipv4/tcp_ipv4.c                |  9 ++++++--
 net/ipv6/tcp_ipv6.c                |  9 ++++++--
 7 files changed, 81 insertions(+), 17 deletions(-)

Comments

Martin KaFai Lau Dec. 10, 2020, 12:07 a.m. UTC | #1
On Tue, Dec 01, 2020 at 11:44:12PM +0900, Kuniyuki Iwashima wrote:
> This patch renames reuseport_select_sock() to __reuseport_select_sock() and

> adds two wrapper function of it to pass the migration type defined in the

> previous commit.

> 

>   reuseport_select_sock          : BPF_SK_REUSEPORT_MIGRATE_NO

>   reuseport_select_migrated_sock : BPF_SK_REUSEPORT_MIGRATE_REQUEST

> 

> As mentioned before, we have to select a new listener for TCP_NEW_SYN_RECV

> requests at receiving the final ACK or sending a SYN+ACK. Therefore, this

> patch also changes the code to call reuseport_select_migrated_sock() even

> if the listening socket is TCP_CLOSE. If we can pick out a listening socket

> from the reuseport group, we rewrite request_sock.rsk_listener and resume

> processing the request.

> 

> Reviewed-by: Benjamin Herrenschmidt <benh@amazon.com>

> Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>

> ---

>  include/net/inet_connection_sock.h | 12 +++++++++++

>  include/net/request_sock.h         | 13 ++++++++++++

>  include/net/sock_reuseport.h       |  8 +++----

>  net/core/sock_reuseport.c          | 34 ++++++++++++++++++++++++------

>  net/ipv4/inet_connection_sock.c    | 13 ++++++++++--

>  net/ipv4/tcp_ipv4.c                |  9 ++++++--

>  net/ipv6/tcp_ipv6.c                |  9 ++++++--

>  7 files changed, 81 insertions(+), 17 deletions(-)

> 

> diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h

> index 2ea2d743f8fc..1e0958f5eb21 100644

> --- a/include/net/inet_connection_sock.h

> +++ b/include/net/inet_connection_sock.h

> @@ -272,6 +272,18 @@ static inline void inet_csk_reqsk_queue_added(struct sock *sk)

>  	reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue);

>  }

>  

> +static inline void inet_csk_reqsk_queue_migrated(struct sock *sk,

> +						 struct sock *nsk,

> +						 struct request_sock *req)

> +{

> +	reqsk_queue_migrated(&inet_csk(sk)->icsk_accept_queue,

> +			     &inet_csk(nsk)->icsk_accept_queue,

> +			     req);

> +	sock_put(sk);

not sure if it is safe to do here.
IIUC, when the req->rsk_refcnt is held, it also holds a refcnt
to req->rsk_listener such that sock_hold(req->rsk_listener) is
safe because its sk_refcnt is not zero.

> +	sock_hold(nsk);

> +	req->rsk_listener = nsk;

> +}

> +


[ ... ]

> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c

> index 361efe55b1ad..e71653c6eae2 100644

> --- a/net/ipv4/inet_connection_sock.c

> +++ b/net/ipv4/inet_connection_sock.c

> @@ -743,8 +743,17 @@ static void reqsk_timer_handler(struct timer_list *t)

>  	struct request_sock_queue *queue = &icsk->icsk_accept_queue;

>  	int max_syn_ack_retries, qlen, expire = 0, resend = 0;

>  

> -	if (inet_sk_state_load(sk_listener) != TCP_LISTEN)

> -		goto drop;

> +	if (inet_sk_state_load(sk_listener) != TCP_LISTEN) {

> +		sk_listener = reuseport_select_migrated_sock(sk_listener,

> +							     req_to_sk(req)->sk_hash, NULL);

> +		if (!sk_listener) {

> +			sk_listener = req->rsk_listener;

> +			goto drop;

> +		}

> +		inet_csk_reqsk_queue_migrated(req->rsk_listener, sk_listener, req);

> +		icsk = inet_csk(sk_listener);

> +		queue = &icsk->icsk_accept_queue;

> +	}

>  

>  	max_syn_ack_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries;

>  	/* Normally all the openreqs are young and become mature

> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c

> index e4b31e70bd30..9a9aa27c6069 100644

> --- a/net/ipv4/tcp_ipv4.c

> +++ b/net/ipv4/tcp_ipv4.c

> @@ -1973,8 +1973,13 @@ int tcp_v4_rcv(struct sk_buff *skb)

>  			goto csum_error;

>  		}

>  		if (unlikely(sk->sk_state != TCP_LISTEN)) {

> -			inet_csk_reqsk_queue_drop_and_put(sk, req);

> -			goto lookup;

> +			nsk = reuseport_select_migrated_sock(sk, req_to_sk(req)->sk_hash, skb);

> +			if (!nsk) {

> +				inet_csk_reqsk_queue_drop_and_put(sk, req);

> +				goto lookup;

> +			}

> +			inet_csk_reqsk_queue_migrated(sk, nsk, req);

> +			sk = nsk;

>  		}

>  		/* We own a reference on the listener, increase it again

>  		 * as we might lose it too soon.

> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c

> index 992cbf3eb9e3..ff11f3c0cb96 100644

> --- a/net/ipv6/tcp_ipv6.c

> +++ b/net/ipv6/tcp_ipv6.c

> @@ -1635,8 +1635,13 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)

>  			goto csum_error;

>  		}

>  		if (unlikely(sk->sk_state != TCP_LISTEN)) {

> -			inet_csk_reqsk_queue_drop_and_put(sk, req);

> -			goto lookup;

> +			nsk = reuseport_select_migrated_sock(sk, req_to_sk(req)->sk_hash, skb);

> +			if (!nsk) {

> +				inet_csk_reqsk_queue_drop_and_put(sk, req);

> +				goto lookup;

> +			}

> +			inet_csk_reqsk_queue_migrated(sk, nsk, req);

> +			sk = nsk;

>  		}

>  		sock_hold(sk);

For example, this sock_hold(sk).  sk here is req->rsk_listener.

>  		refcounted = true;

> -- 

> 2.17.2 (Apple Git-113)

>
Kuniyuki Iwashima Dec. 10, 2020, 5:15 a.m. UTC | #2
From:   Martin KaFai Lau <kafai@fb.com>

Date:   Wed, 9 Dec 2020 16:07:07 -0800
> On Tue, Dec 01, 2020 at 11:44:12PM +0900, Kuniyuki Iwashima wrote:

> > This patch renames reuseport_select_sock() to __reuseport_select_sock() and

> > adds two wrapper function of it to pass the migration type defined in the

> > previous commit.

> > 

> >   reuseport_select_sock          : BPF_SK_REUSEPORT_MIGRATE_NO

> >   reuseport_select_migrated_sock : BPF_SK_REUSEPORT_MIGRATE_REQUEST

> > 

> > As mentioned before, we have to select a new listener for TCP_NEW_SYN_RECV

> > requests at receiving the final ACK or sending a SYN+ACK. Therefore, this

> > patch also changes the code to call reuseport_select_migrated_sock() even

> > if the listening socket is TCP_CLOSE. If we can pick out a listening socket

> > from the reuseport group, we rewrite request_sock.rsk_listener and resume

> > processing the request.

> > 

> > Reviewed-by: Benjamin Herrenschmidt <benh@amazon.com>

> > Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>

> > ---

> >  include/net/inet_connection_sock.h | 12 +++++++++++

> >  include/net/request_sock.h         | 13 ++++++++++++

> >  include/net/sock_reuseport.h       |  8 +++----

> >  net/core/sock_reuseport.c          | 34 ++++++++++++++++++++++++------

> >  net/ipv4/inet_connection_sock.c    | 13 ++++++++++--

> >  net/ipv4/tcp_ipv4.c                |  9 ++++++--

> >  net/ipv6/tcp_ipv6.c                |  9 ++++++--

> >  7 files changed, 81 insertions(+), 17 deletions(-)

> > 

> > diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h

> > index 2ea2d743f8fc..1e0958f5eb21 100644

> > --- a/include/net/inet_connection_sock.h

> > +++ b/include/net/inet_connection_sock.h

> > @@ -272,6 +272,18 @@ static inline void inet_csk_reqsk_queue_added(struct sock *sk)

> >  	reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue);

> >  }

> >  

> > +static inline void inet_csk_reqsk_queue_migrated(struct sock *sk,

> > +						 struct sock *nsk,

> > +						 struct request_sock *req)

> > +{

> > +	reqsk_queue_migrated(&inet_csk(sk)->icsk_accept_queue,

> > +			     &inet_csk(nsk)->icsk_accept_queue,

> > +			     req);

> > +	sock_put(sk);

> not sure if it is safe to do here.

> IIUC, when the req->rsk_refcnt is held, it also holds a refcnt

> to req->rsk_listener such that sock_hold(req->rsk_listener) is

> safe because its sk_refcnt is not zero.


I think it is safe to call sock_put() for the old listener here.

Without this patchset, when receiving the final ACK or retransmitting the
SYN+ACK, if sk_state == TCP_CLOSE, sock_put(req->rsk_listener) is done
by calling reqsk_put() twice in inet_csk_reqsk_queue_drop_and_put(). And
then, we do `goto lookup;` and overwrite the sk.

In the v2 patchset, refcount_inc_not_zero() is done for the new listener in
reuseport_select_migrated_sock(), so we have to call sock_put() for the old
listener instead to free it properly.

---8<---
+struct sock *reuseport_select_migrated_sock(struct sock *sk, u32 hash,
+					    struct sk_buff *skb)
+{
+	struct sock *nsk;
+
+	nsk = __reuseport_select_sock(sk, hash, skb, 0, BPF_SK_REUSEPORT_MIGRATE_REQUEST);
+	if (nsk && likely(refcount_inc_not_zero(&nsk->sk_refcnt)))
+		return nsk;
+
+	return NULL;
+}
+EXPORT_SYMBOL(reuseport_select_migrated_sock);
---8<---
https://lore.kernel.org/netdev/20201207132456.65472-8-kuniyu@amazon.co.jp/


> > +	sock_hold(nsk);

> > +	req->rsk_listener = nsk;

> > +}

> > +

> 

> [ ... ]

> 

> > diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c

> > index 361efe55b1ad..e71653c6eae2 100644

> > --- a/net/ipv4/inet_connection_sock.c

> > +++ b/net/ipv4/inet_connection_sock.c

> > @@ -743,8 +743,17 @@ static void reqsk_timer_handler(struct timer_list *t)

> >  	struct request_sock_queue *queue = &icsk->icsk_accept_queue;

> >  	int max_syn_ack_retries, qlen, expire = 0, resend = 0;

> >  

> > -	if (inet_sk_state_load(sk_listener) != TCP_LISTEN)

> > -		goto drop;

> > +	if (inet_sk_state_load(sk_listener) != TCP_LISTEN) {

> > +		sk_listener = reuseport_select_migrated_sock(sk_listener,

> > +							     req_to_sk(req)->sk_hash, NULL);

> > +		if (!sk_listener) {

> > +			sk_listener = req->rsk_listener;

> > +			goto drop;

> > +		}

> > +		inet_csk_reqsk_queue_migrated(req->rsk_listener, sk_listener, req);

> > +		icsk = inet_csk(sk_listener);

> > +		queue = &icsk->icsk_accept_queue;

> > +	}

> >  

> >  	max_syn_ack_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries;

> >  	/* Normally all the openreqs are young and become mature

> > diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c

> > index e4b31e70bd30..9a9aa27c6069 100644

> > --- a/net/ipv4/tcp_ipv4.c

> > +++ b/net/ipv4/tcp_ipv4.c

> > @@ -1973,8 +1973,13 @@ int tcp_v4_rcv(struct sk_buff *skb)

> >  			goto csum_error;

> >  		}

> >  		if (unlikely(sk->sk_state != TCP_LISTEN)) {

> > -			inet_csk_reqsk_queue_drop_and_put(sk, req);

> > -			goto lookup;

> > +			nsk = reuseport_select_migrated_sock(sk, req_to_sk(req)->sk_hash, skb);

> > +			if (!nsk) {

> > +				inet_csk_reqsk_queue_drop_and_put(sk, req);

> > +				goto lookup;

> > +			}

> > +			inet_csk_reqsk_queue_migrated(sk, nsk, req);

> > +			sk = nsk;

> >  		}

> >  		/* We own a reference on the listener, increase it again

> >  		 * as we might lose it too soon.

> > diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c

> > index 992cbf3eb9e3..ff11f3c0cb96 100644

> > --- a/net/ipv6/tcp_ipv6.c

> > +++ b/net/ipv6/tcp_ipv6.c

> > @@ -1635,8 +1635,13 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)

> >  			goto csum_error;

> >  		}

> >  		if (unlikely(sk->sk_state != TCP_LISTEN)) {

> > -			inet_csk_reqsk_queue_drop_and_put(sk, req);

> > -			goto lookup;

> > +			nsk = reuseport_select_migrated_sock(sk, req_to_sk(req)->sk_hash, skb);

> > +			if (!nsk) {

> > +				inet_csk_reqsk_queue_drop_and_put(sk, req);

> > +				goto lookup;

> > +			}

> > +			inet_csk_reqsk_queue_migrated(sk, nsk, req);

> > +			sk = nsk;

> >  		}

> >  		sock_hold(sk);

> For example, this sock_hold(sk).  sk here is req->rsk_listener.


After migration, this sock_hold() is for the new listener, and it is safe
because refcount_inc_not_zero() for the new listener is called in
reuseport_select_migrated_sock().


> >  		refcounted = true;

> > -- 

> > 2.17.2 (Apple Git-113)
Martin KaFai Lau Dec. 10, 2020, 6:49 p.m. UTC | #3
On Thu, Dec 10, 2020 at 02:15:38PM +0900, Kuniyuki Iwashima wrote:
> From:   Martin KaFai Lau <kafai@fb.com>

> Date:   Wed, 9 Dec 2020 16:07:07 -0800

> > On Tue, Dec 01, 2020 at 11:44:12PM +0900, Kuniyuki Iwashima wrote:

> > > This patch renames reuseport_select_sock() to __reuseport_select_sock() and

> > > adds two wrapper function of it to pass the migration type defined in the

> > > previous commit.

> > > 

> > >   reuseport_select_sock          : BPF_SK_REUSEPORT_MIGRATE_NO

> > >   reuseport_select_migrated_sock : BPF_SK_REUSEPORT_MIGRATE_REQUEST

> > > 

> > > As mentioned before, we have to select a new listener for TCP_NEW_SYN_RECV

> > > requests at receiving the final ACK or sending a SYN+ACK. Therefore, this

> > > patch also changes the code to call reuseport_select_migrated_sock() even

> > > if the listening socket is TCP_CLOSE. If we can pick out a listening socket

> > > from the reuseport group, we rewrite request_sock.rsk_listener and resume

> > > processing the request.

> > > 

> > > Reviewed-by: Benjamin Herrenschmidt <benh@amazon.com>

> > > Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>

> > > ---

> > >  include/net/inet_connection_sock.h | 12 +++++++++++

> > >  include/net/request_sock.h         | 13 ++++++++++++

> > >  include/net/sock_reuseport.h       |  8 +++----

> > >  net/core/sock_reuseport.c          | 34 ++++++++++++++++++++++++------

> > >  net/ipv4/inet_connection_sock.c    | 13 ++++++++++--

> > >  net/ipv4/tcp_ipv4.c                |  9 ++++++--

> > >  net/ipv6/tcp_ipv6.c                |  9 ++++++--

> > >  7 files changed, 81 insertions(+), 17 deletions(-)

> > > 

> > > diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h

> > > index 2ea2d743f8fc..1e0958f5eb21 100644

> > > --- a/include/net/inet_connection_sock.h

> > > +++ b/include/net/inet_connection_sock.h

> > > @@ -272,6 +272,18 @@ static inline void inet_csk_reqsk_queue_added(struct sock *sk)

> > >  	reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue);

> > >  }

> > >  

> > > +static inline void inet_csk_reqsk_queue_migrated(struct sock *sk,

> > > +						 struct sock *nsk,

> > > +						 struct request_sock *req)

> > > +{

> > > +	reqsk_queue_migrated(&inet_csk(sk)->icsk_accept_queue,

> > > +			     &inet_csk(nsk)->icsk_accept_queue,

> > > +			     req);

> > > +	sock_put(sk);

> > not sure if it is safe to do here.

> > IIUC, when the req->rsk_refcnt is held, it also holds a refcnt

> > to req->rsk_listener such that sock_hold(req->rsk_listener) is

> > safe because its sk_refcnt is not zero.

> 

> I think it is safe to call sock_put() for the old listener here.

> 

> Without this patchset, at receiving the final ACK or retransmitting

> SYN+ACK, if sk_state == TCP_CLOSE, sock_put(req->rsk_listener) is done

> by calling reqsk_put() twice in inet_csk_reqsk_queue_drop_and_put().

Note that in your example (final ACK), sock_put(req->rsk_listener) is
_only_ called when reqsk_put() can get refcount_dec_and_test(&req->rsk_refcnt)
to reach zero.

Here in this patch, it does sock_put(req->rsk_listener) without req->rsk_refcnt
reaching zero.

Let's say there are two cores holding two refcnts to req (one cnt for each core)
from looking up the req in ehash.  One of the cores does this migration and
sock_put(req->rsk_listener).  The other core does sock_hold(req->rsk_listener).

	Core1					Core2
						sock_put(req->rsk_listener)

	sock_hold(req->rsk_listener)
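
(If the req holds the last reference on the listener, that sock_put() by
Core2 frees it, and the sock_hold() on Core1 then runs on freed memory.)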

> And then, we do `goto lookup;` and overwrite the sk.

> 

> In the v2 patchset, refcount_inc_not_zero() is done for the new listener in

> reuseport_select_migrated_sock(), so we have to call sock_put() for the old

> listener instead to free it properly.

> 

> ---8<---

> +struct sock *reuseport_select_migrated_sock(struct sock *sk, u32 hash,

> +					    struct sk_buff *skb)

> +{

> +	struct sock *nsk;

> +

> +	nsk = __reuseport_select_sock(sk, hash, skb, 0, BPF_SK_REUSEPORT_MIGRATE_REQUEST);

> +	if (nsk && likely(refcount_inc_not_zero(&nsk->sk_refcnt)))

There is another potential issue here.  The TCP_LISTEN nsk is protected
by rcu.  refcount_inc_not_zero(&nsk->sk_refcnt) cannot be done if it
is not under rcu_read_lock().

The receive path may be ok as it is in rcu.  You may need to check for
others.

> +		return nsk;

> +

> +	return NULL;

> +}

> +EXPORT_SYMBOL(reuseport_select_migrated_sock);

> ---8<---

> https://lore.kernel.org/netdev/20201207132456.65472-8-kuniyu@amazon.co.jp/

> 

> 

> > > +	sock_hold(nsk);

> > > +	req->rsk_listener = nsk;

It looks like there is another race here.  What
if multiple cores try to update req->rsk_listener?
Kuniyuki Iwashima Dec. 14, 2020, 5:03 p.m. UTC | #4
From:   Martin KaFai Lau <kafai@fb.com>

Date:   Thu, 10 Dec 2020 10:49:15 -0800
> On Thu, Dec 10, 2020 at 02:15:38PM +0900, Kuniyuki Iwashima wrote:

> > From:   Martin KaFai Lau <kafai@fb.com>

> > Date:   Wed, 9 Dec 2020 16:07:07 -0800

> > > On Tue, Dec 01, 2020 at 11:44:12PM +0900, Kuniyuki Iwashima wrote:

> > > > This patch renames reuseport_select_sock() to __reuseport_select_sock() and

> > > > adds two wrapper function of it to pass the migration type defined in the

> > > > previous commit.

> > > > 

> > > >   reuseport_select_sock          : BPF_SK_REUSEPORT_MIGRATE_NO

> > > >   reuseport_select_migrated_sock : BPF_SK_REUSEPORT_MIGRATE_REQUEST

> > > > 

> > > > As mentioned before, we have to select a new listener for TCP_NEW_SYN_RECV

> > > > requests at receiving the final ACK or sending a SYN+ACK. Therefore, this

> > > > patch also changes the code to call reuseport_select_migrated_sock() even

> > > > if the listening socket is TCP_CLOSE. If we can pick out a listening socket

> > > > from the reuseport group, we rewrite request_sock.rsk_listener and resume

> > > > processing the request.

> > > > 

> > > > Reviewed-by: Benjamin Herrenschmidt <benh@amazon.com>

> > > > Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>

> > > > ---

> > > >  include/net/inet_connection_sock.h | 12 +++++++++++

> > > >  include/net/request_sock.h         | 13 ++++++++++++

> > > >  include/net/sock_reuseport.h       |  8 +++----

> > > >  net/core/sock_reuseport.c          | 34 ++++++++++++++++++++++++------

> > > >  net/ipv4/inet_connection_sock.c    | 13 ++++++++++--

> > > >  net/ipv4/tcp_ipv4.c                |  9 ++++++--

> > > >  net/ipv6/tcp_ipv6.c                |  9 ++++++--

> > > >  7 files changed, 81 insertions(+), 17 deletions(-)

> > > > 

> > > > diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h

> > > > index 2ea2d743f8fc..1e0958f5eb21 100644

> > > > --- a/include/net/inet_connection_sock.h

> > > > +++ b/include/net/inet_connection_sock.h

> > > > @@ -272,6 +272,18 @@ static inline void inet_csk_reqsk_queue_added(struct sock *sk)

> > > >  	reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue);

> > > >  }

> > > >  

> > > > +static inline void inet_csk_reqsk_queue_migrated(struct sock *sk,

> > > > +						 struct sock *nsk,

> > > > +						 struct request_sock *req)

> > > > +{

> > > > +	reqsk_queue_migrated(&inet_csk(sk)->icsk_accept_queue,

> > > > +			     &inet_csk(nsk)->icsk_accept_queue,

> > > > +			     req);

> > > > +	sock_put(sk);

> > > not sure if it is safe to do here.

> > > IIUC, when the req->rsk_refcnt is held, it also holds a refcnt

> > > to req->rsk_listener such that sock_hold(req->rsk_listener) is

> > > safe because its sk_refcnt is not zero.

> > 

> > I think it is safe to call sock_put() for the old listener here.

> > 

> > Without this patchset, at receiving the final ACK or retransmitting

> > SYN+ACK, if sk_state == TCP_CLOSE, sock_put(req->rsk_listener) is done

> > by calling reqsk_put() twice in inet_csk_reqsk_queue_drop_and_put().

> Note that in your example (final ACK), sock_put(req->rsk_listener) is

> _only_ called when reqsk_put() can get refcount_dec_and_test(&req->rsk_refcnt)

> to reach zero.

> 

> Here in this patch, it sock_put(req->rsk_listener) without req->rsk_refcnt

> reaching zero.

> 

> Let says there are two cores holding two refcnt to req (one cnt for each core)

> by looking up the req from ehash.  One of the core do this migrate and

> sock_put(req->rsk_listener).  Another core does sock_hold(req->rsk_listener).

> 

> 	Core1					Core2

> 						sock_put(req->rsk_listener)

> 

> 	sock_hold(req->rsk_listener)


I'm sorry for the late reply.

I missed the situation where different cores get into the NEW_SYN_RECV path,
but it does exist.
https://lore.kernel.org/netdev/1517977874.3715.153.camel@gmail.com/#t
https://lore.kernel.org/netdev/1518531252.3715.178.camel@gmail.com/


If close() is called for the listener and the request holds the last refcount
for it, sock_put() by Core2 frees it, so Core1 cannot proceed with the freed
listener. So, it is better to call refcount_inc_not_zero() instead of
sock_hold(). If refcount_inc_not_zero() fails, it means that the listener
is closed and req->rsk_listener has been changed somewhere else. Then, we
can continue processing the request by rewriting sk with rsk_listener and
calling sock_hold() for it.
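
Roughly, in the tcp_v[46]_rcv() path, the idea would be something like this
(only a sketch of the above, not part of the posted patch and not tested):

---8<---
	if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) {
		/* The old listener is being freed, so another core must
		 * have already migrated the request.  Re-read the (new)
		 * rsk_listener and take a reference on it instead.
		 */
		sk = req->rsk_listener;
		sock_hold(sk);
	}
---8<---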

Also, the migration by Core2 can be done after sock_hold() by Core1. Then,
if Core1 wins the race by removing the request from ehash,
in inet_csk_reqsk_queue_add(), req->rsk_listener, instead of sk, should be
used as the proper listener to add the req into its queue. But if that
rsk_listener is also TCP_CLOSE, we have to call inet_child_forget().

Moreover, we have to check whether the listener has been freed at the
beginning of reqsk_timer_handler() with refcount_inc_not_zero().


> > And then, we do `goto lookup;` and overwrite the sk.

> > 

> > In the v2 patchset, refcount_inc_not_zero() is done for the new listener in

> > reuseport_select_migrated_sock(), so we have to call sock_put() for the old

> > listener instead to free it properly.

> > 

> > ---8<---

> > +struct sock *reuseport_select_migrated_sock(struct sock *sk, u32 hash,

> > +					    struct sk_buff *skb)

> > +{

> > +	struct sock *nsk;

> > +

> > +	nsk = __reuseport_select_sock(sk, hash, skb, 0, BPF_SK_REUSEPORT_MIGRATE_REQUEST);

> > +	if (nsk && likely(refcount_inc_not_zero(&nsk->sk_refcnt)))

> There is another potential issue here.  The TCP_LISTEN nsk is protected

> by rcu.  refcount_inc_not_zero(&nsk->sk_refcnt) cannot be done if it

> is not under rcu_read_lock().

> 

> The receive path may be ok as it is in rcu.  You may need to check for

> others.


IIUC, does this mean nsk can be NULL after the RCU grace period? If so, I will
move rcu_read_lock/unlock() from __reuseport_select_sock() to
reuseport_select_sock() and reuseport_select_migrated_sock().
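
For example, something like this, based on the v2 snippet above with the rcu
lock pulled up into the wrapper (just a sketch; it assumes
__reuseport_select_sock() itself no longer takes rcu_read_lock()):

---8<---
struct sock *reuseport_select_migrated_sock(struct sock *sk, u32 hash,
					    struct sk_buff *skb)
{
	struct sock *nsk;

	rcu_read_lock();
	nsk = __reuseport_select_sock(sk, hash, skb, 0,
				      BPF_SK_REUSEPORT_MIGRATE_REQUEST);
	if (nsk && likely(refcount_inc_not_zero(&nsk->sk_refcnt))) {
		rcu_read_unlock();
		return nsk;
	}
	rcu_read_unlock();

	return NULL;
}
---8<---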


> > +		return nsk;

> > +

> > +	return NULL;

> > +}

> > +EXPORT_SYMBOL(reuseport_select_migrated_sock);

> > ---8<---

> > https://lore.kernel.org/netdev/20201207132456.65472-8-kuniyu@amazon.co.jp/

> > 

> > 

> > > > +	sock_hold(nsk);

> > > > +	req->rsk_listener = nsk;

> It looks like there is another race here.  What

> if multiple cores try to update req->rsk_listener?


I think we have to add a lock to struct request_sock, acquire it, check
whether rsk_listener has been changed or not, and then do the migration. Also,
if the listener has been changed, we have to tell the caller to use it as the
new listener.

---8<---
       spin_lock(&lock)
       if (sk != req->rsk_listener) {
               nsk = req->rsk_listener;
               goto out;
       }

       // do migration
out:
       spin_unlock(&lock)
       return nsk;
---8<---
Martin KaFai Lau Dec. 15, 2020, 2:58 a.m. UTC | #5
On Tue, Dec 15, 2020 at 02:03:13AM +0900, Kuniyuki Iwashima wrote:
> From:   Martin KaFai Lau <kafai@fb.com>

> Date:   Thu, 10 Dec 2020 10:49:15 -0800

> > On Thu, Dec 10, 2020 at 02:15:38PM +0900, Kuniyuki Iwashima wrote:

> > > From:   Martin KaFai Lau <kafai@fb.com>

> > > Date:   Wed, 9 Dec 2020 16:07:07 -0800

> > > > On Tue, Dec 01, 2020 at 11:44:12PM +0900, Kuniyuki Iwashima wrote:

> > > > > This patch renames reuseport_select_sock() to __reuseport_select_sock() and

> > > > > adds two wrapper function of it to pass the migration type defined in the

> > > > > previous commit.

> > > > > 

> > > > >   reuseport_select_sock          : BPF_SK_REUSEPORT_MIGRATE_NO

> > > > >   reuseport_select_migrated_sock : BPF_SK_REUSEPORT_MIGRATE_REQUEST

> > > > > 

> > > > > As mentioned before, we have to select a new listener for TCP_NEW_SYN_RECV

> > > > > requests at receiving the final ACK or sending a SYN+ACK. Therefore, this

> > > > > patch also changes the code to call reuseport_select_migrated_sock() even

> > > > > if the listening socket is TCP_CLOSE. If we can pick out a listening socket

> > > > > from the reuseport group, we rewrite request_sock.rsk_listener and resume

> > > > > processing the request.

> > > > > 

> > > > > Reviewed-by: Benjamin Herrenschmidt <benh@amazon.com>

> > > > > Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>

> > > > > ---

> > > > >  include/net/inet_connection_sock.h | 12 +++++++++++

> > > > >  include/net/request_sock.h         | 13 ++++++++++++

> > > > >  include/net/sock_reuseport.h       |  8 +++----

> > > > >  net/core/sock_reuseport.c          | 34 ++++++++++++++++++++++++------

> > > > >  net/ipv4/inet_connection_sock.c    | 13 ++++++++++--

> > > > >  net/ipv4/tcp_ipv4.c                |  9 ++++++--

> > > > >  net/ipv6/tcp_ipv6.c                |  9 ++++++--

> > > > >  7 files changed, 81 insertions(+), 17 deletions(-)

> > > > > 

> > > > > diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h

> > > > > index 2ea2d743f8fc..1e0958f5eb21 100644

> > > > > --- a/include/net/inet_connection_sock.h

> > > > > +++ b/include/net/inet_connection_sock.h

> > > > > @@ -272,6 +272,18 @@ static inline void inet_csk_reqsk_queue_added(struct sock *sk)

> > > > >  	reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue);

> > > > >  }

> > > > >  

> > > > > +static inline void inet_csk_reqsk_queue_migrated(struct sock *sk,

> > > > > +						 struct sock *nsk,

> > > > > +						 struct request_sock *req)

> > > > > +{

> > > > > +	reqsk_queue_migrated(&inet_csk(sk)->icsk_accept_queue,

> > > > > +			     &inet_csk(nsk)->icsk_accept_queue,

> > > > > +			     req);

> > > > > +	sock_put(sk);

> > > > not sure if it is safe to do here.

> > > > IIUC, when the req->rsk_refcnt is held, it also holds a refcnt

> > > > to req->rsk_listener such that sock_hold(req->rsk_listener) is

> > > > safe because its sk_refcnt is not zero.

> > > 

> > > I think it is safe to call sock_put() for the old listener here.

> > > 

> > > Without this patchset, at receiving the final ACK or retransmitting

> > > SYN+ACK, if sk_state == TCP_CLOSE, sock_put(req->rsk_listener) is done

> > > by calling reqsk_put() twice in inet_csk_reqsk_queue_drop_and_put().

> > Note that in your example (final ACK), sock_put(req->rsk_listener) is

> > _only_ called when reqsk_put() can get refcount_dec_and_test(&req->rsk_refcnt)

> > to reach zero.

> > 

> > Here in this patch, it sock_put(req->rsk_listener) without req->rsk_refcnt

> > reaching zero.

> > 

> > Let says there are two cores holding two refcnt to req (one cnt for each core)

> > by looking up the req from ehash.  One of the core do this migrate and

> > sock_put(req->rsk_listener).  Another core does sock_hold(req->rsk_listener).

> > 

> > 	Core1					Core2

> > 						sock_put(req->rsk_listener)

> > 

> > 	sock_hold(req->rsk_listener)

> 

> I'm sorry for the late reply.

> 

> I missed this situation that different Cores get into NEW_SYN_RECV path,

> but this does exist.

> https://lore.kernel.org/netdev/1517977874.3715.153.camel@gmail.com/#t

> https://lore.kernel.org/netdev/1518531252.3715.178.camel@gmail.com/

> 

> 

> If close() is called for the listener and the request has the last refcount

> for it, sock_put() by Core2 frees it, so Core1 cannot proceed with freed

> listener. So, it is good to call refcount_inc_not_zero() instead of

> sock_hold(). If refcount_inc_not_zero() fails, it means that the listener

_inc_not_zero() usually means it requires rcu_read_lock().
That may have a rippling effect on other req->rsk_listener readers.

There may also be places assuming that the req->rsk_listener will never
change once it is assigned.  not sure.  have not looked closely yet.

It probably needs some more thought here to get a simpler solution.

> is closed and the req->rsk_listener is changed in another place. Then, we

> can continue processing the request by rewriting sk with rsk_listener and

> calling sock_hold() for it.

> 

> Also, the migration by Core2 can be done after sock_hold() by Core1. Then

> if Core1 win the race by removing the request from ehash,

> in inet_csk_reqsk_queue_add(), instead of sk, req->rsk_listener should be

> used as the proper listener to add the req into its queue. But if the

> rsk_listener is also TCP_CLOSE, we have to call inet_child_forget().

> 

> Moreover, we have to check the listener is freed in the beginning of

> reqsk_timer_handler() by refcount_inc_not_zero().

> 

> 

> > > And then, we do `goto lookup;` and overwrite the sk.

> > > 

> > > In the v2 patchset, refcount_inc_not_zero() is done for the new listener in

> > > reuseport_select_migrated_sock(), so we have to call sock_put() for the old

> > > listener instead to free it properly.

> > > 

> > > ---8<---

> > > +struct sock *reuseport_select_migrated_sock(struct sock *sk, u32 hash,

> > > +					    struct sk_buff *skb)

> > > +{

> > > +	struct sock *nsk;

> > > +

> > > +	nsk = __reuseport_select_sock(sk, hash, skb, 0, BPF_SK_REUSEPORT_MIGRATE_REQUEST);

> > > +	if (nsk && likely(refcount_inc_not_zero(&nsk->sk_refcnt)))

> > There is another potential issue here.  The TCP_LISTEN nsk is protected

> > by rcu.  refcount_inc_not_zero(&nsk->sk_refcnt) cannot be done if it

> > is not under rcu_read_lock().

> > 

> > The receive path may be ok as it is in rcu.  You may need to check for

> > others.

> 

> IIUC, is this mean nsk can be NULL after grace period of RCU? If so, I will

worse than NULL.  an invalid pointer.
 
> move rcu_read_lock/unlock() from __reuseport_select_sock() to

> reuseport_select_sock() and reuseport_select_migrated_sock().

ok.

> 

> 

> > > +		return nsk;

> > > +

> > > +	return NULL;

> > > +}

> > > +EXPORT_SYMBOL(reuseport_select_migrated_sock);

> > > ---8<---

> > > https://lore.kernel.org/netdev/20201207132456.65472-8-kuniyu@amazon.co.jp/

> > > 

> > > 

> > > > > +	sock_hold(nsk);

> > > > > +	req->rsk_listener = nsk;

> > It looks like there is another race here.  What

> > if multiple cores try to update req->rsk_listener?

> 

> I think we have to add a lock in struct request_sock, acquire it, check

> if the rsk_listener is changed or not, and then do migration. Also, if the

> listener has been changed, we have to tell the caller to use it as the new

> listener.

> 

> ---8<---

>        spin_lock(&lock)

>        if (sk != req->rsk_listener) {

>                nsk = req->rsk_listener;

>                goto out;

>        }

> 

>        // do migration

> out:

>        spin_unlock(&lock)

>        return nsk;

> ---8<---

cmpxchg may help here.
Kuniyuki Iwashima Dec. 16, 2020, 4:41 p.m. UTC | #6
From:   Martin KaFai Lau <kafai@fb.com>

Date:   Mon, 14 Dec 2020 18:58:37 -0800
> On Tue, Dec 15, 2020 at 02:03:13AM +0900, Kuniyuki Iwashima wrote:

> > From:   Martin KaFai Lau <kafai@fb.com>

> > Date:   Thu, 10 Dec 2020 10:49:15 -0800

> > > On Thu, Dec 10, 2020 at 02:15:38PM +0900, Kuniyuki Iwashima wrote:

> > > > From:   Martin KaFai Lau <kafai@fb.com>

> > > > Date:   Wed, 9 Dec 2020 16:07:07 -0800

> > > > > On Tue, Dec 01, 2020 at 11:44:12PM +0900, Kuniyuki Iwashima wrote:

> > > > > > This patch renames reuseport_select_sock() to __reuseport_select_sock() and

> > > > > > adds two wrapper function of it to pass the migration type defined in the

> > > > > > previous commit.

> > > > > > 

> > > > > >   reuseport_select_sock          : BPF_SK_REUSEPORT_MIGRATE_NO

> > > > > >   reuseport_select_migrated_sock : BPF_SK_REUSEPORT_MIGRATE_REQUEST

> > > > > > 

> > > > > > As mentioned before, we have to select a new listener for TCP_NEW_SYN_RECV

> > > > > > requests at receiving the final ACK or sending a SYN+ACK. Therefore, this

> > > > > > patch also changes the code to call reuseport_select_migrated_sock() even

> > > > > > if the listening socket is TCP_CLOSE. If we can pick out a listening socket

> > > > > > from the reuseport group, we rewrite request_sock.rsk_listener and resume

> > > > > > processing the request.

> > > > > > 

> > > > > > Reviewed-by: Benjamin Herrenschmidt <benh@amazon.com>

> > > > > > Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>

> > > > > > ---

> > > > > >  include/net/inet_connection_sock.h | 12 +++++++++++

> > > > > >  include/net/request_sock.h         | 13 ++++++++++++

> > > > > >  include/net/sock_reuseport.h       |  8 +++----

> > > > > >  net/core/sock_reuseport.c          | 34 ++++++++++++++++++++++++------

> > > > > >  net/ipv4/inet_connection_sock.c    | 13 ++++++++++--

> > > > > >  net/ipv4/tcp_ipv4.c                |  9 ++++++--

> > > > > >  net/ipv6/tcp_ipv6.c                |  9 ++++++--

> > > > > >  7 files changed, 81 insertions(+), 17 deletions(-)

> > > > > > 

> > > > > > diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h

> > > > > > index 2ea2d743f8fc..1e0958f5eb21 100644

> > > > > > --- a/include/net/inet_connection_sock.h

> > > > > > +++ b/include/net/inet_connection_sock.h

> > > > > > @@ -272,6 +272,18 @@ static inline void inet_csk_reqsk_queue_added(struct sock *sk)

> > > > > >  	reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue);

> > > > > >  }

> > > > > >  

> > > > > > +static inline void inet_csk_reqsk_queue_migrated(struct sock *sk,

> > > > > > +						 struct sock *nsk,

> > > > > > +						 struct request_sock *req)

> > > > > > +{

> > > > > > +	reqsk_queue_migrated(&inet_csk(sk)->icsk_accept_queue,

> > > > > > +			     &inet_csk(nsk)->icsk_accept_queue,

> > > > > > +			     req);

> > > > > > +	sock_put(sk);

> > > > > not sure if it is safe to do here.

> > > > > IIUC, when the req->rsk_refcnt is held, it also holds a refcnt

> > > > > to req->rsk_listener such that sock_hold(req->rsk_listener) is

> > > > > safe because its sk_refcnt is not zero.

> > > > 

> > > > I think it is safe to call sock_put() for the old listener here.

> > > > 

> > > > Without this patchset, at receiving the final ACK or retransmitting

> > > > SYN+ACK, if sk_state == TCP_CLOSE, sock_put(req->rsk_listener) is done

> > > > by calling reqsk_put() twice in inet_csk_reqsk_queue_drop_and_put().

> > > Note that in your example (final ACK), sock_put(req->rsk_listener) is

> > > _only_ called when reqsk_put() can get refcount_dec_and_test(&req->rsk_refcnt)

> > > to reach zero.

> > > 

> > > Here in this patch, it sock_put(req->rsk_listener) without req->rsk_refcnt

> > > reaching zero.

> > > 

> > > Let says there are two cores holding two refcnt to req (one cnt for each core)

> > > by looking up the req from ehash.  One of the core do this migrate and

> > > sock_put(req->rsk_listener).  Another core does sock_hold(req->rsk_listener).

> > > 

> > > 	Core1					Core2

> > > 						sock_put(req->rsk_listener)

> > > 

> > > 	sock_hold(req->rsk_listener)

> > 

> > I'm sorry for the late reply.

> > 

> > I missed this situation that different Cores get into NEW_SYN_RECV path,

> > but this does exist.

> > https://lore.kernel.org/netdev/1517977874.3715.153.camel@gmail.com/#t

> > https://lore.kernel.org/netdev/1518531252.3715.178.camel@gmail.com/

> > 

> > 

> > If close() is called for the listener and the request has the last refcount

> > for it, sock_put() by Core2 frees it, so Core1 cannot proceed with freed

> > listener. So, it is good to call refcount_inc_not_zero() instead of

> > sock_hold(). If refcount_inc_not_zero() fails, it means that the listener

> _inc_not_zero() usually means it requires rcu_read_lock().

> That may have rippling effect on other req->rsk_listener readers.

> 

> There may also be places assuming that the req->rsk_listener will never

> change once it is assigned.  not sure.  have not looked closely yet.


I have checked this again. There are no functions that explicitly expect
req->rsk_listener never to change, except for the BUG_ON in inet_child_forget().
The absence of a BUG_ON/WARN_ON does not mean a function does not assume the
listener never changes, but such functions still work properly even if
rsk_listener is changed.


> It probably needs some more thoughts here to get a simpler solution.


Is it fine to move sock_hold() before assigning rsk_listener and defer
sock_put() to the end of tcp_v[46]_rcv()?

Also, we have to rewrite rsk_listener first and then call sock_put() in
reqsk_timer_handler() so that rsk_listener always has a refcount of more than 1.

---8<---
	struct sock *nsk, *osk;
	bool migrated = false;
	...
	sock_hold(req->rsk_listener);  // (i)
	sk = req->rsk_listener;
	...
	if (sk->sk_state == TCP_CLOSE) {
		osk = sk;
		// do migration without sock_put()
		sock_hold(nsk);  // (ii) (as with (i))
		sk = nsk;
		migrated = true;
	}
	...
	if (migrated) {
		sock_put(sk);  // pair with (ii)
		sock_put(osk); // decrement old listener's refcount
		sk = osk;
	}
	sock_put(sk);  // pair with (i)
---8<---
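
And for reqsk_timer_handler(), the ordering above would roughly be the
following (again only a sketch, not posted code):

---8<---
	/* rewrite rsk_listener before dropping the old listener's refcount,
	 * so that readers of req->rsk_listener never see a socket whose
	 * refcount has already reached zero.
	 */
	nsk = reuseport_select_migrated_sock(...);	/* holds a ref on nsk */
	osk = req->rsk_listener;
	req->rsk_listener = nsk;
	sock_put(osk);
---8<---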


> > is closed and the req->rsk_listener is changed in another place. Then, we

> > can continue processing the request by rewriting sk with rsk_listener and

> > calling sock_hold() for it.

> > 

> > Also, the migration by Core2 can be done after sock_hold() by Core1. Then

> > if Core1 win the race by removing the request from ehash,

> > in inet_csk_reqsk_queue_add(), instead of sk, req->rsk_listener should be

> > used as the proper listener to add the req into its queue. But if the

> > rsk_listener is also TCP_CLOSE, we have to call inet_child_forget().

> > 

> > Moreover, we have to check the listener is freed in the beginning of

> > reqsk_timer_handler() by refcount_inc_not_zero().

> > 

> > 

> > > > And then, we do `goto lookup;` and overwrite the sk.

> > > > 

> > > > In the v2 patchset, refcount_inc_not_zero() is done for the new listener in

> > > > reuseport_select_migrated_sock(), so we have to call sock_put() for the old

> > > > listener instead to free it properly.

> > > > 

> > > > ---8<---

> > > > +struct sock *reuseport_select_migrated_sock(struct sock *sk, u32 hash,

> > > > +					    struct sk_buff *skb)

> > > > +{

> > > > +	struct sock *nsk;

> > > > +

> > > > +	nsk = __reuseport_select_sock(sk, hash, skb, 0, BPF_SK_REUSEPORT_MIGRATE_REQUEST);

> > > > +	if (nsk && likely(refcount_inc_not_zero(&nsk->sk_refcnt)))

> > > There is another potential issue here.  The TCP_LISTEN nsk is protected

> > > by rcu.  refcount_inc_not_zero(&nsk->sk_refcnt) cannot be done if it

> > > is not under rcu_read_lock().

> > > 

> > > The receive path may be ok as it is in rcu.  You may need to check for

> > > others.

> > 

> > IIUC, is this mean nsk can be NULL after grace period of RCU? If so, I will

> worse than NULL.  an invalid pointer.

>  

> > move rcu_read_lock/unlock() from __reuseport_select_sock() to

> > reuseport_select_sock() and reuseport_select_migrated_sock().

> ok.

> 

> > 

> > 

> > > > +		return nsk;

> > > > +

> > > > +	return NULL;

> > > > +}

> > > > +EXPORT_SYMBOL(reuseport_select_migrated_sock);

> > > > ---8<---

> > > > https://lore.kernel.org/netdev/20201207132456.65472-8-kuniyu@amazon.co.jp/

> > > > 

> > > > 

> > > > > > +	sock_hold(nsk);

> > > > > > +	req->rsk_listener = nsk;

> > > It looks like there is another race here.  What

> > > if multiple cores try to update req->rsk_listener?

> > 

> > I think we have to add a lock in struct request_sock, acquire it, check

> > if the rsk_listener is changed or not, and then do migration. Also, if the

> > listener has been changed, we have to tell the caller to use it as the new

> > listener.

> > 

> > ---8<---

> >        spin_lock(&lock)

> >        if (sk != req->rsk_listener) {

> >                nsk = req->rsk_listener;

> >                goto out;

> >        }

> > 

> >        // do migration

> > out:

> >        spin_unlock(&lock)

> >        return nsk;

> > ---8<---

> cmpxchg may help here.


Thank you, I will use cmpxchg() to rewrite rsk_listener atomically and
check whether req->rsk_listener has been updated.
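
Something along these lines, perhaps (rough sketch only):

---8<---
	osk = cmpxchg(&req->rsk_listener, sk, nsk);
	if (osk != sk) {
		/* Another core already rewrote rsk_listener; it owns the
		 * migration, so the caller should use osk, not nsk.
		 */
		return osk;
	}
	/* We won the race: move the req from sk's queue to nsk's queue. */
---8<---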
Martin KaFai Lau Dec. 16, 2020, 10:24 p.m. UTC | #7
On Thu, Dec 17, 2020 at 01:41:58AM +0900, Kuniyuki Iwashima wrote:
[ ... ]

> > There may also be places assuming that the req->rsk_listener will never

> > change once it is assigned.  not sure.  have not looked closely yet.

> 

> I have checked this again. There are no functions that expect explicitly

> req->rsk_listener never change except for BUG_ON in inet_child_forget().

> No BUG_ON/WARN_ON does not mean they does not assume listener never

> change, but such functions still work properly if rsk_listener is changed.

The migration not only changes the ptr value of req->rsk_listener, it also
means the req is moved to another listener (e.g. by updating the qlen of
the old sk and the new sk).

Let's reuse the example about two cores at the TCP_NEW_SYN_RECV path
racing to finish up the 3WHS.

One core is already at inet_csk_complete_hashdance() doing
"reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req)".
What happens if another core migrates the req to another listener?
Would that "reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req)"
then be operating on an accept_queue that this req no longer belongs to?
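
e.g. one possible interleaving (illustrative only):

	Core1 (complete_hashdance)		Core2 (migration)
						req->rsk_listener = nsk
						reqsk_queue_migrated(): old qlen--, new qlen++
	reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req)

Here sk is still the old listener, so the old queue's qlen is decremented a
second time for the same req, while the new queue's qlen is never decremented.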

Also, from a quick look at reqsk_timer_handler() and how
queue->young and req->num_timeout are updated, I am not sure
reqsk_queue_migrated() will work either:

+static inline void reqsk_queue_migrated(struct request_sock_queue *old_accept_queue,
+					struct request_sock_queue *new_accept_queue,
+					const struct request_sock *req)
+{
+	atomic_dec(&old_accept_queue->qlen);
+	atomic_inc(&new_accept_queue->qlen);
+
+	if (req->num_timeout == 0) {
What if reqsk_timer_handler() is running in parallel
and updating req->num_timeout?

+		atomic_dec(&old_accept_queue->young);
+		atomic_inc(&new_accept_queue->young);
+	}
+}


It feels like some of the "own_req" related logic may be useful here.
not sure.  could be something worth thinking about.

> 

> 

> > It probably needs some more thoughts here to get a simpler solution.

> 

> Is it fine to move sock_hold() before assigning rsk_listener and defer

> sock_put() to the end of tcp_v[46]_rcv() ?

I don't see how this ordering helps, considering the migration can happen
at any time on another core.

> 

> Also, we have to rewrite rsk_listener first and then call sock_put() in

> reqsk_timer_handler() so that rsk_listener always has refcount more than 1.

> 

> ---8<---

> 	struct sock *nsk, *osk;

> 	bool migrated = false;

> 	...

> 	sock_hold(req->rsk_listener);  // (i)

> 	sk = req->rsk_listener;

> 	...

> 	if (sk->sk_state == TCP_CLOSE) {

> 		osk = sk;

> 		// do migration without sock_put()

> 		sock_hold(nsk);  // (ii) (as with (i))

> 		sk = nsk;

> 		migrated = true;

> 	}

> 	...

> 	if (migrated) {

> 		sock_put(sk);  // pair with (ii)

> 		sock_put(osk); // decrement old listener's refcount

> 		sk = osk;

> 	}

> 	sock_put(sk);  // pair with (i)

> ---8<---

Patch

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 2ea2d743f8fc..1e0958f5eb21 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -272,6 +272,18 @@  static inline void inet_csk_reqsk_queue_added(struct sock *sk)
 	reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue);
 }
 
+static inline void inet_csk_reqsk_queue_migrated(struct sock *sk,
+						 struct sock *nsk,
+						 struct request_sock *req)
+{
+	reqsk_queue_migrated(&inet_csk(sk)->icsk_accept_queue,
+			     &inet_csk(nsk)->icsk_accept_queue,
+			     req);
+	sock_put(sk);
+	sock_hold(nsk);
+	req->rsk_listener = nsk;
+}
+
 static inline int inet_csk_reqsk_queue_len(const struct sock *sk)
 {
 	return reqsk_queue_len(&inet_csk(sk)->icsk_accept_queue);
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 29e41ff3ec93..d18ba0b857cc 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -226,6 +226,19 @@  static inline void reqsk_queue_added(struct request_sock_queue *queue)
 	atomic_inc(&queue->qlen);
 }
 
+static inline void reqsk_queue_migrated(struct request_sock_queue *old_accept_queue,
+					struct request_sock_queue *new_accept_queue,
+					const struct request_sock *req)
+{
+	atomic_dec(&old_accept_queue->qlen);
+	atomic_inc(&new_accept_queue->qlen);
+
+	if (req->num_timeout == 0) {
+		atomic_dec(&old_accept_queue->young);
+		atomic_inc(&new_accept_queue->young);
+	}
+}
+
 static inline int reqsk_queue_len(const struct request_sock_queue *queue)
 {
 	return atomic_read(&queue->qlen);
diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
index 09a1b1539d4c..a48259a974be 100644
--- a/include/net/sock_reuseport.h
+++ b/include/net/sock_reuseport.h
@@ -32,10 +32,10 @@  extern int reuseport_alloc(struct sock *sk, bool bind_inany);
 extern int reuseport_add_sock(struct sock *sk, struct sock *sk2,
 			      bool bind_inany);
 extern struct sock *reuseport_detach_sock(struct sock *sk);
-extern struct sock *reuseport_select_sock(struct sock *sk,
-					  u32 hash,
-					  struct sk_buff *skb,
-					  int hdr_len);
+extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash,
+					  struct sk_buff *skb, int hdr_len);
+extern struct sock *reuseport_select_migrated_sock(struct sock *sk, u32 hash,
+						   struct sk_buff *skb);
 extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog);
 extern int reuseport_detach_prog(struct sock *sk);
 
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 60d7c1f28809..b4fe0829c9ab 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -202,7 +202,7 @@  int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
 	}
 
 	reuse->socks[reuse->num_socks] = sk;
-	/* paired with smp_rmb() in reuseport_select_sock() */
+	/* paired with smp_rmb() in __reuseport_select_sock() */
 	smp_wmb();
 	reuse->num_socks++;
 	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
@@ -313,12 +313,13 @@  static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
  *  @hdr_len: BPF filter expects skb data pointer at payload data.  If
  *    the skb does not yet point at the payload, this parameter represents
  *    how far the pointer needs to advance to reach the payload.
+ *  @migration: represents if it is selecting a listener for SYN or
+ *    migrating ESTABLISHED/SYN_RECV sockets or NEW_SYN_RECV socket.
  *  Returns a socket that should receive the packet (or NULL on error).
  */
-struct sock *reuseport_select_sock(struct sock *sk,
-				   u32 hash,
-				   struct sk_buff *skb,
-				   int hdr_len)
+struct sock *__reuseport_select_sock(struct sock *sk, u32 hash,
+				     struct sk_buff *skb, int hdr_len,
+				     u8 migration)
 {
 	struct sock_reuseport *reuse;
 	struct bpf_prog *prog;
@@ -332,13 +333,19 @@  struct sock *reuseport_select_sock(struct sock *sk,
 	if (!reuse)
 		goto out;
 
-	prog = rcu_dereference(reuse->prog);
 	socks = READ_ONCE(reuse->num_socks);
 	if (likely(socks)) {
 		/* paired with smp_wmb() in reuseport_add_sock() */
 		smp_rmb();
 
-		if (!prog || !skb)
+		prog = rcu_dereference(reuse->prog);
+		if (!prog)
+			goto select_by_hash;
+
+		if (migration)
+			goto out;
+
+		if (!skb)
 			goto select_by_hash;
 
 		if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
@@ -367,8 +374,21 @@  struct sock *reuseport_select_sock(struct sock *sk,
 	rcu_read_unlock();
 	return sk2;
 }
+
+struct sock *reuseport_select_sock(struct sock *sk, u32 hash,
+				   struct sk_buff *skb, int hdr_len)
+{
+	return __reuseport_select_sock(sk, hash, skb, hdr_len, BPF_SK_REUSEPORT_MIGRATE_NO);
+}
 EXPORT_SYMBOL(reuseport_select_sock);
 
+struct sock *reuseport_select_migrated_sock(struct sock *sk, u32 hash,
+					    struct sk_buff *skb)
+{
+	return __reuseport_select_sock(sk, hash, skb, 0, BPF_SK_REUSEPORT_MIGRATE_REQUEST);
+}
+EXPORT_SYMBOL(reuseport_select_migrated_sock);
+
 int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
 {
 	struct sock_reuseport *reuse;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 361efe55b1ad..e71653c6eae2 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -743,8 +743,17 @@  static void reqsk_timer_handler(struct timer_list *t)
 	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
 	int max_syn_ack_retries, qlen, expire = 0, resend = 0;
 
-	if (inet_sk_state_load(sk_listener) != TCP_LISTEN)
-		goto drop;
+	if (inet_sk_state_load(sk_listener) != TCP_LISTEN) {
+		sk_listener = reuseport_select_migrated_sock(sk_listener,
+							     req_to_sk(req)->sk_hash, NULL);
+		if (!sk_listener) {
+			sk_listener = req->rsk_listener;
+			goto drop;
+		}
+		inet_csk_reqsk_queue_migrated(req->rsk_listener, sk_listener, req);
+		icsk = inet_csk(sk_listener);
+		queue = &icsk->icsk_accept_queue;
+	}
 
 	max_syn_ack_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries;
 	/* Normally all the openreqs are young and become mature
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index e4b31e70bd30..9a9aa27c6069 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1973,8 +1973,13 @@  int tcp_v4_rcv(struct sk_buff *skb)
 			goto csum_error;
 		}
 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
-			inet_csk_reqsk_queue_drop_and_put(sk, req);
-			goto lookup;
+			nsk = reuseport_select_migrated_sock(sk, req_to_sk(req)->sk_hash, skb);
+			if (!nsk) {
+				inet_csk_reqsk_queue_drop_and_put(sk, req);
+				goto lookup;
+			}
+			inet_csk_reqsk_queue_migrated(sk, nsk, req);
+			sk = nsk;
 		}
 		/* We own a reference on the listener, increase it again
 		 * as we might lose it too soon.
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 992cbf3eb9e3..ff11f3c0cb96 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1635,8 +1635,13 @@  INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
 			goto csum_error;
 		}
 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
-			inet_csk_reqsk_queue_drop_and_put(sk, req);
-			goto lookup;
+			nsk = reuseport_select_migrated_sock(sk, req_to_sk(req)->sk_hash, skb);
+			if (!nsk) {
+				inet_csk_reqsk_queue_drop_and_put(sk, req);
+				goto lookup;
+			}
+			inet_csk_reqsk_queue_migrated(sk, nsk, req);
+			sk = nsk;
 		}
 		sock_hold(sk);
 		refcounted = true;