diff mbox series

[RFC,v2,2/3] ipv6: Support setting src port in sendmsg().

Message ID 20240920-reverse-sk-lookup-v2-2-916a48c47d56@cloudflare.com
State New
Headers show
Series Allow sk_lookup UDP return traffic to egress when setting src port/address. | expand

Commit Message

Tiago Lam Sept. 20, 2024, 5:02 p.m. UTC
This follows the same rationale provided for the ipv4 counterpart, where
the sendmsg() path is also extended here to support the IPV6_ORIGDSTADDR
ancillary message to be able to specify a source address/port. This
allows users to configure the source address and/or port egress traffic
should be sent from.

To limit its usage, a reverse socket lookup is performed to check if the
configured egress source address and/or port have any ingress sk_lookup
match. If it does, traffic is allowed to proceed, otherwise it falls
back to the regular egress path.

Suggested-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Tiago Lam <tiagolam@cloudflare.com>
---
 net/ipv6/datagram.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/udp.c      |  8 ++++--
 2 files changed, 85 insertions(+), 2 deletions(-)

Comments

Willem de Bruijn Sept. 21, 2024, 9:13 a.m. UTC | #1
Tiago Lam wrote:
> This follows the same rationale provided for the ipv4 counterpart, where
> the sendmsg() path is also extended here to support the IPV6_ORIGDSTADDR
> ancillary message to be able to specify a source address/port. This
> allows users to configure the source address and/or port egress traffic
> should be sent from.
> 
> To limit its usage, a reverse socket lookup is performed to check if the
> configured egress source address and/or port have any ingress sk_lookup
> match. If it does, traffic is allowed to proceed, otherwise it falls
> back to the regular egress path.
> 
> Suggested-by: Jakub Sitnicki <jakub@cloudflare.com>
> Signed-off-by: Tiago Lam <tiagolam@cloudflare.com>
> ---
>  net/ipv6/datagram.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  net/ipv6/udp.c      |  8 ++++--
>  2 files changed, 85 insertions(+), 2 deletions(-)
> 
> diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
> index fff78496803d..369c64a478ec 100644
> --- a/net/ipv6/datagram.c
> +++ b/net/ipv6/datagram.c
> @@ -756,6 +756,29 @@ void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg,
>  }
>  EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl);
>  
> +static inline bool reverse_sk_lookup(struct flowi6 *fl6, struct sock *sk,
> +				     struct in6_addr *saddr, __be16 sport)
> +{
> +	if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
> +	    (saddr && sport) &&
> +	    (ipv6_addr_cmp(&sk->sk_v6_rcv_saddr, saddr) ||
> +	    inet_sk(sk)->inet_sport != sport)) {
> +		struct sock *sk_egress;
> +
> +		bpf_sk_lookup_run_v6(sock_net(sk), IPPROTO_UDP, &fl6->daddr,
> +				     fl6->fl6_dport, saddr, ntohs(sport), 0,
> +				     &sk_egress);
> +		if (!IS_ERR_OR_NULL(sk_egress) && sk_egress == sk)
> +			return true;
> +
> +		net_info_ratelimited("No reverse socket lookup match for local addr %pI6:%d remote addr %pI6:%d\n",
> +				     &saddr, ntohs(sport), &fl6->daddr,
> +				     ntohs(fl6->fl6_dport));
> +	}
> +
> +	return false;
> +}
> +
>  int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
>  			  struct msghdr *msg, struct flowi6 *fl6,
>  			  struct ipcm6_cookie *ipc6)
> @@ -844,7 +867,63 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
>  
>  			break;
>  		    }
> +		case IPV6_ORIGDSTADDR:
> +			{
> +			struct sockaddr_in6 *sockaddr_in;
> +			struct net_device *dev = NULL;
> +
> +			if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct sockaddr_in6))) {
> +				err = -EINVAL;
> +				goto exit_f;
> +			}
> +
> +			sockaddr_in = (struct sockaddr_in6 *)CMSG_DATA(cmsg);
> +
> +			addr_type = __ipv6_addr_type(&sockaddr_in->sin6_addr);
> +
> +			if (addr_type & IPV6_ADDR_LINKLOCAL)
> +				return -EINVAL;
> +
> +			/* If we're egressing with a different source address
> +			 * and/or port, we perform a reverse socket lookup. The
> +			 * rationale behind this is that we can allow return
> +			 * UDP traffic that has ingressed through sk_lookup to
> +			 * also egress correctly. In case the reverse lookup
> +			 * fails, we continue with the normal path.
> +			 *
> +			 * The lookup is performed if either source address
> +			 * and/or port changed, and neither is "0".
> +			 */
> +			if (reverse_sk_lookup(fl6, sk, &sockaddr_in->sin6_addr,
> +					      sockaddr_in->sin6_port)) {
> +				/* Override the source port and address to use
> +				 * with the one we got in cmsg and bail early.
> +				 */
> +				fl6->saddr = sockaddr_in->sin6_addr;
> +				fl6->fl6_sport = sockaddr_in->sin6_port;
> +				break;
> +			}
>  
> +			if (addr_type != IPV6_ADDR_ANY) {
> +				int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL;
> +
> +				if (!ipv6_can_nonlocal_bind(net, inet_sk(sk)) &&
> +				    !ipv6_chk_addr_and_flags(net,
> +							     &sockaddr_in->sin6_addr,
> +							     dev, !strict, 0,
> +							     IFA_F_TENTATIVE) &&
> +				    !ipv6_chk_acast_addr_src(net, dev,
> +							     &sockaddr_in->sin6_addr))
> +					err = -EINVAL;
> +				else
> +					fl6->saddr = sockaddr_in->sin6_addr;
> +			}
> +
> +			if (err)
> +				goto exit_f;
> +
> +			break;
> +			}

How come IPv6 runs the check in the cmsg handler, but ipv4 in sendmsg
directly? Can the two be symmetric?

>  		case IPV6_FLOWINFO:
>  			if (cmsg->cmsg_len < CMSG_LEN(4)) {
>  				err = -EINVAL;
> diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> index 6602a2e9cdb5..6121cbb71ad3 100644
> --- a/net/ipv6/udp.c
> +++ b/net/ipv6/udp.c
> @@ -1476,6 +1476,12 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>  
>  	fl6->flowi6_uid = sk->sk_uid;
>  
> +	/* We use fl6's daddr and fl6_sport in the reverse sk_lookup done
> +	 * within ip6_datagram_send_ctl() now.
> +	 */
> +	fl6->daddr = *daddr;
> +	fl6->fl6_sport = inet->inet_sport;
> +
>  	if (msg->msg_controllen) {
>  		opt = &opt_space;
>  		memset(opt, 0, sizeof(struct ipv6_txoptions));
> @@ -1511,10 +1517,8 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>  
>  	fl6->flowi6_proto = sk->sk_protocol;
>  	fl6->flowi6_mark = ipc6.sockc.mark;
> -	fl6->daddr = *daddr;
>  	if (ipv6_addr_any(&fl6->saddr) && !ipv6_addr_any(&np->saddr))
>  		fl6->saddr = np->saddr;
> -	fl6->fl6_sport = inet->inet_sport;
>  
>  	if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) {
>  		err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
> 
> -- 
> 2.34.1
>
David Laight Sept. 23, 2024, 1:08 p.m. UTC | #2
From: Tiago Lam <tiagolam@cloudflare.com>
> Sent: 20 September 2024 18:02
> 
> This follows the same rationale provided for the ipv4 counterpart, where
> the sendmsg() path is also extended here to support the IPV6_ORIGDSTADDR
> ancillary message to be able to specify a source address/port. This
> allows users to configure the source address and/or port egress traffic
> should be sent from.

I'd missed that being added - could save us the horrid problem of getting
the UDP checksum correct when sending UDP over a raw socket.
(That isn't a problem for IPv6.)

> To limit its usage, a reverse socket lookup is performed to check if the
> configured egress source address and/or port have any ingress sk_lookup
> match. If it does, traffic is allowed to proceed, otherwise it falls
> back to the regular egress path.

Is that really useful/necessary?
The check (but not the commit message) implies that some 'bpf thingy'
also needs to be enabled.
Any check would need to include the test that the program sending the packet
has the ability to send a packet through the ingress socket.
Additionally a check for the sending process having (IIRC) CAP_NET_ADMIN
(which would let the process send the message by other means) would save the
slow path.

The code we have sends a lot of UDP RTP (typically 160 bytes of audio every 20ms).
There is actually no reason for there to be a valid matching ingress path.
(That code would benefit from being able to bind a lot of ports to the same
UDP socket.)

	David

> 
> Suggested-by: Jakub Sitnicki <jakub@cloudflare.com>
> Signed-off-by: Tiago Lam <tiagolam@cloudflare.com>
> ---
>  net/ipv6/datagram.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  net/ipv6/udp.c      |  8 ++++--
>  2 files changed, 85 insertions(+), 2 deletions(-)
> 
> diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
> index fff78496803d..369c64a478ec 100644
> --- a/net/ipv6/datagram.c
> +++ b/net/ipv6/datagram.c
> @@ -756,6 +756,29 @@ void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg,
>  }
>  EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl);
> 
> +static inline bool reverse_sk_lookup(struct flowi6 *fl6, struct sock *sk,
> +				     struct in6_addr *saddr, __be16 sport)
> +{
> +	if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
> +	    (saddr && sport) &&
> +	    (ipv6_addr_cmp(&sk->sk_v6_rcv_saddr, saddr) ||
> +	    inet_sk(sk)->inet_sport != sport)) {
> +		struct sock *sk_egress;
> +
> +		bpf_sk_lookup_run_v6(sock_net(sk), IPPROTO_UDP, &fl6->daddr,
> +				     fl6->fl6_dport, saddr, ntohs(sport), 0,
> +				     &sk_egress);
> +		if (!IS_ERR_OR_NULL(sk_egress) && sk_egress == sk)
> +			return true;
> +
> +		net_info_ratelimited("No reverse socket lookup match for local addr %pI6:%d remote addr
> %pI6:%d\n",
> +				     &saddr, ntohs(sport), &fl6->daddr,
> +				     ntohs(fl6->fl6_dport));
> +	}
> +
> +	return false;
> +}
> +
>  int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
>  			  struct msghdr *msg, struct flowi6 *fl6,
>  			  struct ipcm6_cookie *ipc6)
> @@ -844,7 +867,63 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
> 
>  			break;
>  		    }
> +		case IPV6_ORIGDSTADDR:
> +			{
> +			struct sockaddr_in6 *sockaddr_in;
> +			struct net_device *dev = NULL;
> +
> +			if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct sockaddr_in6))) {
> +				err = -EINVAL;
> +				goto exit_f;
> +			}
> +
> +			sockaddr_in = (struct sockaddr_in6 *)CMSG_DATA(cmsg);
> +
> +			addr_type = __ipv6_addr_type(&sockaddr_in->sin6_addr);
> +
> +			if (addr_type & IPV6_ADDR_LINKLOCAL)
> +				return -EINVAL;
> +
> +			/* If we're egressing with a different source address
> +			 * and/or port, we perform a reverse socket lookup. The
> +			 * rationale behind this is that we can allow return
> +			 * UDP traffic that has ingressed through sk_lookup to
> +			 * also egress correctly. In case the reverse lookup
> +			 * fails, we continue with the normal path.
> +			 *
> +			 * The lookup is performed if either source address
> +			 * and/or port changed, and neither is "0".
> +			 */
> +			if (reverse_sk_lookup(fl6, sk, &sockaddr_in->sin6_addr,
> +					      sockaddr_in->sin6_port)) {
> +				/* Override the source port and address to use
> +				 * with the one we got in cmsg and bail early.
> +				 */
> +				fl6->saddr = sockaddr_in->sin6_addr;
> +				fl6->fl6_sport = sockaddr_in->sin6_port;
> +				break;
> +			}
> 
> +			if (addr_type != IPV6_ADDR_ANY) {
> +				int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL;
> +
> +				if (!ipv6_can_nonlocal_bind(net, inet_sk(sk)) &&
> +				    !ipv6_chk_addr_and_flags(net,
> +							     &sockaddr_in->sin6_addr,
> +							     dev, !strict, 0,
> +							     IFA_F_TENTATIVE) &&
> +				    !ipv6_chk_acast_addr_src(net, dev,
> +							     &sockaddr_in->sin6_addr))
> +					err = -EINVAL;
> +				else
> +					fl6->saddr = sockaddr_in->sin6_addr;
> +			}
> +
> +			if (err)
> +				goto exit_f;
> +
> +			break;
> +			}
>  		case IPV6_FLOWINFO:
>  			if (cmsg->cmsg_len < CMSG_LEN(4)) {
>  				err = -EINVAL;
> diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> index 6602a2e9cdb5..6121cbb71ad3 100644
> --- a/net/ipv6/udp.c
> +++ b/net/ipv6/udp.c
> @@ -1476,6 +1476,12 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> 
>  	fl6->flowi6_uid = sk->sk_uid;
> 
> +	/* We use fl6's daddr and fl6_sport in the reverse sk_lookup done
> +	 * within ip6_datagram_send_ctl() now.
> +	 */
> +	fl6->daddr = *daddr;
> +	fl6->fl6_sport = inet->inet_sport;
> +
>  	if (msg->msg_controllen) {
>  		opt = &opt_space;
>  		memset(opt, 0, sizeof(struct ipv6_txoptions));
> @@ -1511,10 +1517,8 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> 
>  	fl6->flowi6_proto = sk->sk_protocol;
>  	fl6->flowi6_mark = ipc6.sockc.mark;
> -	fl6->daddr = *daddr;
>  	if (ipv6_addr_any(&fl6->saddr) && !ipv6_addr_any(&np->saddr))
>  		fl6->saddr = np->saddr;
> -	fl6->fl6_sport = inet->inet_sport;
> 
>  	if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) {
>  		err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
> 
> --
> 2.34.1
> 

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Jakub Sitnicki Sept. 23, 2024, 2:56 p.m. UTC | #3
On Mon, Sep 23, 2024 at 01:08 PM GMT, David Laight wrote:
> From: Tiago Lam <tiagolam@cloudflare.com>

[...]

>> To limit its usage, a reverse socket lookup is performed to check if the
>> configured egress source address and/or port have any ingress sk_lookup
>> match. If it does, traffic is allowed to proceed, otherwise it falls
>> back to the regular egress path.
>
> Is that really useful/necessary?

We've been asking ourselves the same question during Plumbers with
Martin.

Unprivileges processes can already source UDP traffic from (almost) any
IP & port by binding a socket to the desired source port and passing
IP_PKTINFO. So perhaps having a reverse socket lookup is an overkill.

We should probably respect net.ipv4.ip_local_reserved_ports and
net.ipv4.ip_unprivileged_port_start system settings, though, or check
for relevant caps.

Open question if it is acceptable to disregard exclusive UDP port
ownership by sockets binding to a wildcard address without SO_REUSEADDR?

[...]





> The check (but not the commit message) implies that some 'bpf thingy'
> also needs to be enabled.
> Any check would need to include the test that the program sending the packet
> has the ability to send a packet through the ingress socket.
> Additionally a check for the sending process having (IIRC) CAP_NET_ADMIN
> (which would let the process send the message by other means) would save the
> slow path.
>
> The code we have sends a lot of UDP RTP (typically 160 bytes of audio every 20ms).
> There is actually no reason for there to be a valid matching ingress path.
> (That code would benefit from being able to bind a lot of ports to the same
> UDP socket.)
>
> 	David
>
>> 
>> Suggested-by: Jakub Sitnicki <jakub@cloudflare.com>
>> Signed-off-by: Tiago Lam <tiagolam@cloudflare.com>
>> ---
>>  net/ipv6/datagram.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  net/ipv6/udp.c      |  8 ++++--
>>  2 files changed, 85 insertions(+), 2 deletions(-)
>> 
>> diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
>> index fff78496803d..369c64a478ec 100644
>> --- a/net/ipv6/datagram.c
>> +++ b/net/ipv6/datagram.c
>> @@ -756,6 +756,29 @@ void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg,
>>  }
>>  EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl);
>> 
>> +static inline bool reverse_sk_lookup(struct flowi6 *fl6, struct sock *sk,
>> +				     struct in6_addr *saddr, __be16 sport)
>> +{
>> +	if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
>> +	    (saddr && sport) &&
>> +	    (ipv6_addr_cmp(&sk->sk_v6_rcv_saddr, saddr) ||
>> +	    inet_sk(sk)->inet_sport != sport)) {
>> +		struct sock *sk_egress;
>> +
>> +		bpf_sk_lookup_run_v6(sock_net(sk), IPPROTO_UDP, &fl6->daddr,
>> +				     fl6->fl6_dport, saddr, ntohs(sport), 0,
>> +				     &sk_egress);
>> +		if (!IS_ERR_OR_NULL(sk_egress) && sk_egress == sk)
>> +			return true;
>> +
>> +		net_info_ratelimited("No reverse socket lookup match for local addr %pI6:%d remote addr
>> %pI6:%d\n",
>> +				     &saddr, ntohs(sport), &fl6->daddr,
>> +				     ntohs(fl6->fl6_dport));
>> +	}
>> +
>> +	return false;
>> +}
>> +
>>  int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
>>  			  struct msghdr *msg, struct flowi6 *fl6,
>>  			  struct ipcm6_cookie *ipc6)
>> @@ -844,7 +867,63 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
>> 
>>  			break;
>>  		    }
>> +		case IPV6_ORIGDSTADDR:
>> +			{
>> +			struct sockaddr_in6 *sockaddr_in;
>> +			struct net_device *dev = NULL;
>> +
>> +			if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct sockaddr_in6))) {
>> +				err = -EINVAL;
>> +				goto exit_f;
>> +			}
>> +
>> +			sockaddr_in = (struct sockaddr_in6 *)CMSG_DATA(cmsg);
>> +
>> +			addr_type = __ipv6_addr_type(&sockaddr_in->sin6_addr);
>> +
>> +			if (addr_type & IPV6_ADDR_LINKLOCAL)
>> +				return -EINVAL;
>> +
>> +			/* If we're egressing with a different source address
>> +			 * and/or port, we perform a reverse socket lookup. The
>> +			 * rationale behind this is that we can allow return
>> +			 * UDP traffic that has ingressed through sk_lookup to
>> +			 * also egress correctly. In case the reverse lookup
>> +			 * fails, we continue with the normal path.
>> +			 *
>> +			 * The lookup is performed if either source address
>> +			 * and/or port changed, and neither is "0".
>> +			 */
>> +			if (reverse_sk_lookup(fl6, sk, &sockaddr_in->sin6_addr,
>> +					      sockaddr_in->sin6_port)) {
>> +				/* Override the source port and address to use
>> +				 * with the one we got in cmsg and bail early.
>> +				 */
>> +				fl6->saddr = sockaddr_in->sin6_addr;
>> +				fl6->fl6_sport = sockaddr_in->sin6_port;
>> +				break;
>> +			}
>> 
>> +			if (addr_type != IPV6_ADDR_ANY) {
>> +				int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL;
>> +
>> +				if (!ipv6_can_nonlocal_bind(net, inet_sk(sk)) &&
>> +				    !ipv6_chk_addr_and_flags(net,
>> +							     &sockaddr_in->sin6_addr,
>> +							     dev, !strict, 0,
>> +							     IFA_F_TENTATIVE) &&
>> +				    !ipv6_chk_acast_addr_src(net, dev,
>> +							     &sockaddr_in->sin6_addr))
>> +					err = -EINVAL;
>> +				else
>> +					fl6->saddr = sockaddr_in->sin6_addr;
>> +			}
>> +
>> +			if (err)
>> +				goto exit_f;
>> +
>> +			break;
>> +			}
>>  		case IPV6_FLOWINFO:
>>  			if (cmsg->cmsg_len < CMSG_LEN(4)) {
>>  				err = -EINVAL;
>> diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
>> index 6602a2e9cdb5..6121cbb71ad3 100644
>> --- a/net/ipv6/udp.c
>> +++ b/net/ipv6/udp.c
>> @@ -1476,6 +1476,12 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>> 
>>  	fl6->flowi6_uid = sk->sk_uid;
>> 
>> +	/* We use fl6's daddr and fl6_sport in the reverse sk_lookup done
>> +	 * within ip6_datagram_send_ctl() now.
>> +	 */
>> +	fl6->daddr = *daddr;
>> +	fl6->fl6_sport = inet->inet_sport;
>> +
>>  	if (msg->msg_controllen) {
>>  		opt = &opt_space;
>>  		memset(opt, 0, sizeof(struct ipv6_txoptions));
>> @@ -1511,10 +1517,8 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>> 
>>  	fl6->flowi6_proto = sk->sk_protocol;
>>  	fl6->flowi6_mark = ipc6.sockc.mark;
>> -	fl6->daddr = *daddr;
>>  	if (ipv6_addr_any(&fl6->saddr) && !ipv6_addr_any(&np->saddr))
>>  		fl6->saddr = np->saddr;
>> -	fl6->fl6_sport = inet->inet_sport;
>> 
>>  	if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) {
>>  		err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
>> 
>> --
>> 2.34.1
>> 
>
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
> Registration No: 1397386 (Wales)
David Laight Sept. 23, 2024, 3:45 p.m. UTC | #4
From: Jakub Sitnicki
> Sent: 23 September 2024 15:56
> 
> On Mon, Sep 23, 2024 at 01:08 PM GMT, David Laight wrote:
> > From: Tiago Lam <tiagolam@cloudflare.com>
> 
> [...]
> 
> >> To limit its usage, a reverse socket lookup is performed to check if the
> >> configured egress source address and/or port have any ingress sk_lookup
> >> match. If it does, traffic is allowed to proceed, otherwise it falls
> >> back to the regular egress path.
> >
> > Is that really useful/necessary?
> 
> We've been asking ourselves the same question during Plumbers with
> Martin.
> 
> Unprivileges processes can already source UDP traffic from (almost) any
> IP & port by binding a socket to the desired source port and passing
> IP_PKTINFO. So perhaps having a reverse socket lookup is an overkill.

Traditionally you'd need to bind to the source port on any local IP
(or the wildcard IP) that didn't have another socket bound to that port.
Modern Linux might have more restrictions and SO_REUSADDR muddies things.

And I don't think you can do a connect() on an unbound UDP socket to
set the source port at the same time as the destination IP+port.
(That would actually be useful.)

OTOH if you just want to send a UDP message you can just use another
system on the same network.
You might need to spoof the source mac - but that isn't hard (although
it might confuse any ethernet switches).

> We should probably respect net.ipv4.ip_local_reserved_ports and
> net.ipv4.ip_unprivileged_port_start system settings, though, or check
> for relevant caps.

True.

> Open question if it is acceptable to disregard exclusive UDP port
> ownership by sockets binding to a wildcard address without SO_REUSEADDR?

We've often suffered from the opposite - a program binds to the wildcard
IP and everything works until something else binds to the same port and
a specific local IP.
I'm sure this is grief some on both TCP and UDP - especially since you
often need to set SO_REUSADDR to stop other things breaking.

	David
 

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Jakub Sitnicki Sept. 23, 2024, 4:44 p.m. UTC | #5
On Mon, Sep 23, 2024 at 03:45 PM GMT, David Laight wrote:
> From: Jakub Sitnicki
>> Sent: 23 September 2024 15:56
>> 
>> On Mon, Sep 23, 2024 at 01:08 PM GMT, David Laight wrote:
>> > From: Tiago Lam <tiagolam@cloudflare.com>
>> 
>> [...]
>> 
>> >> To limit its usage, a reverse socket lookup is performed to check if the
>> >> configured egress source address and/or port have any ingress sk_lookup
>> >> match. If it does, traffic is allowed to proceed, otherwise it falls
>> >> back to the regular egress path.
>> >
>> > Is that really useful/necessary?
>> 
>> We've been asking ourselves the same question during Plumbers with
>> Martin.
>> 
>> Unprivileges processes can already source UDP traffic from (almost) any
>> IP & port by binding a socket to the desired source port and passing
>> IP_PKTINFO. So perhaps having a reverse socket lookup is an overkill.
>
> Traditionally you'd need to bind to the source port on any local IP
> (or the wildcard IP) that didn't have another socket bound to that port.

Right. Linux IP_PKTINFO extension relaxes this requirement. You can bind
to some local IP (whichever is free, plently to choose from in 127/8
local subnet), and specify the source address to use OOB at sendmsg()
time (as long as the address is local to the host, otherwise you need
additional capabilities).

> Modern Linux might have more restrictions and SO_REUSADDR muddies things.
>
> And I don't think you can do a connect() on an unbound UDP socket to
> set the source port at the same time as the destination IP+port.
> (That would actually be useful.)

You can. It's somewhat recent (v6.3+) [1]:

https://manpages.debian.org/unstable/manpages/ip.7.en.html#IP_LOCAL_PORT_RANGE

It's not on par with TCP when it comes to local port sharing because we
hash UDP sockets only by 2-tuple. Though, some effort to improve that is
taking place I see.

The recipe is:

1. delay the auto-bind until connect() time with IP_BIND_ADDRESS_NO_PORT
   socket option, and
2. tell the udp stack to consider only a single local port during the
   free port search with IP_LOCAL_PORT_RANGE option.

That amounts to something like (in pseudocode):

  s = socket(AF_INET, SOCK_DGRAM)
  s.setsockopt(SOL_IP, IP_BIND_ADDRESS_NO_PORT, 1)
  s.setsockopt(SOL_IP, IP_LOCAL_PORT_RANGE, 44_444 << 16 | 44_444)
  s.bind(("192.0.2.42", 0))
  s.connect(("1.1.1.1", 53))

You can combine it with SO_REUSEADDR to share the local address between
sockets, but you have to ensure manually that you don't run into
conflicts between sockets (two sockets using the same 4-tuple). That's
something we're hoping to improve in the future.

> OTOH if you just want to send a UDP message you can just use another
> system on the same network.
> You might need to spoof the source mac - but that isn't hard (although
> it might confuse any ethernet switches).
>
>> We should probably respect net.ipv4.ip_local_reserved_ports and
>> net.ipv4.ip_unprivileged_port_start system settings, though, or check
>> for relevant caps.
>
> True.
>
>> Open question if it is acceptable to disregard exclusive UDP port
>> ownership by sockets binding to a wildcard address without SO_REUSEADDR?
>
> We've often suffered from the opposite - a program binds to the wildcard
> IP and everything works until something else binds to the same port and
> a specific local IP.

Let me see if I understand - what would happen today for UDP is:

app #1 - bind(("0.0.0.0", 53)) -> OK
app #2 - bind(("192.0.2.1", 53)) -> EADDRINUSE

... unless both are setting SO_REUSEADDR (or SO_REUSEPORT and run under
same UID).

That is why if we allow selecting the source port at sendmsg() time, we
would be relaxing the existing UDP port ownership guarantees for
wildcard binds.

Perhaps this merits a sysctl, so the admin can decide if it is an
acceptable trade-off in their environment.

> I'm sure this is grief some on both TCP and UDP - especially since you
> often need to set SO_REUSADDR to stop other things breaking.
>
> 	David
>  
>
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
> Registration No: 1397386 (Wales)
diff mbox series

Patch

diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index fff78496803d..369c64a478ec 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -756,6 +756,29 @@  void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg,
 }
 EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl);
 
+static inline bool reverse_sk_lookup(struct flowi6 *fl6, struct sock *sk,
+				     struct in6_addr *saddr, __be16 sport)
+{
+	if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
+	    (saddr && sport) &&
+	    (ipv6_addr_cmp(&sk->sk_v6_rcv_saddr, saddr) ||
+	    inet_sk(sk)->inet_sport != sport)) {
+		struct sock *sk_egress;
+
+		bpf_sk_lookup_run_v6(sock_net(sk), IPPROTO_UDP, &fl6->daddr,
+				     fl6->fl6_dport, saddr, ntohs(sport), 0,
+				     &sk_egress);
+		if (!IS_ERR_OR_NULL(sk_egress) && sk_egress == sk)
+			return true;
+
+		net_info_ratelimited("No reverse socket lookup match for local addr %pI6:%d remote addr %pI6:%d\n",
+				     &saddr, ntohs(sport), &fl6->daddr,
+				     ntohs(fl6->fl6_dport));
+	}
+
+	return false;
+}
+
 int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
 			  struct msghdr *msg, struct flowi6 *fl6,
 			  struct ipcm6_cookie *ipc6)
@@ -844,7 +867,63 @@  int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
 
 			break;
 		    }
+		case IPV6_ORIGDSTADDR:
+			{
+			struct sockaddr_in6 *sockaddr_in;
+			struct net_device *dev = NULL;
+
+			if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct sockaddr_in6))) {
+				err = -EINVAL;
+				goto exit_f;
+			}
+
+			sockaddr_in = (struct sockaddr_in6 *)CMSG_DATA(cmsg);
+
+			addr_type = __ipv6_addr_type(&sockaddr_in->sin6_addr);
+
+			if (addr_type & IPV6_ADDR_LINKLOCAL)
+				return -EINVAL;
+
+			/* If we're egressing with a different source address
+			 * and/or port, we perform a reverse socket lookup. The
+			 * rationale behind this is that we can allow return
+			 * UDP traffic that has ingressed through sk_lookup to
+			 * also egress correctly. In case the reverse lookup
+			 * fails, we continue with the normal path.
+			 *
+			 * The lookup is performed if either source address
+			 * and/or port changed, and neither is "0".
+			 */
+			if (reverse_sk_lookup(fl6, sk, &sockaddr_in->sin6_addr,
+					      sockaddr_in->sin6_port)) {
+				/* Override the source port and address to use
+				 * with the one we got in cmsg and bail early.
+				 */
+				fl6->saddr = sockaddr_in->sin6_addr;
+				fl6->fl6_sport = sockaddr_in->sin6_port;
+				break;
+			}
 
+			if (addr_type != IPV6_ADDR_ANY) {
+				int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL;
+
+				if (!ipv6_can_nonlocal_bind(net, inet_sk(sk)) &&
+				    !ipv6_chk_addr_and_flags(net,
+							     &sockaddr_in->sin6_addr,
+							     dev, !strict, 0,
+							     IFA_F_TENTATIVE) &&
+				    !ipv6_chk_acast_addr_src(net, dev,
+							     &sockaddr_in->sin6_addr))
+					err = -EINVAL;
+				else
+					fl6->saddr = sockaddr_in->sin6_addr;
+			}
+
+			if (err)
+				goto exit_f;
+
+			break;
+			}
 		case IPV6_FLOWINFO:
 			if (cmsg->cmsg_len < CMSG_LEN(4)) {
 				err = -EINVAL;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 6602a2e9cdb5..6121cbb71ad3 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1476,6 +1476,12 @@  int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
 	fl6->flowi6_uid = sk->sk_uid;
 
+	/* We use fl6's daddr and fl6_sport in the reverse sk_lookup done
+	 * within ip6_datagram_send_ctl() now.
+	 */
+	fl6->daddr = *daddr;
+	fl6->fl6_sport = inet->inet_sport;
+
 	if (msg->msg_controllen) {
 		opt = &opt_space;
 		memset(opt, 0, sizeof(struct ipv6_txoptions));
@@ -1511,10 +1517,8 @@  int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
 	fl6->flowi6_proto = sk->sk_protocol;
 	fl6->flowi6_mark = ipc6.sockc.mark;
-	fl6->daddr = *daddr;
 	if (ipv6_addr_any(&fl6->saddr) && !ipv6_addr_any(&np->saddr))
 		fl6->saddr = np->saddr;
-	fl6->fl6_sport = inet->inet_sport;
 
 	if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) {
 		err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,