diff mbox series

[bpf-next,V15,2/7] bpf: fix bpf_fib_lookup helper MTU check for SKB ctx

Message ID 161228321177.576669.11521750082473556168.stgit@firesoul
State New
Headers show
Series bpf: New approach for BPF MTU handling | expand

Commit Message

Jesper Dangaard Brouer Feb. 2, 2021, 4:26 p.m. UTC
BPF end-user on Cilium slack-channel (Carlo Carraro) wants to use
bpf_fib_lookup for doing MTU-check, but *prior* to extending packet size,
by adjusting fib_params 'tot_len' with the packet length plus the expected
encap size. (Just like the bpf_check_mtu helper supports). He discovered
that for SKB ctx the param->tot_len was not used, instead skb->len was used
(via MTU check in is_skb_forwardable() that checks against netdev MTU).

Fix this by using fib_params 'tot_len' for MTU check. If not provided (e.g.
zero) then keep existing TC behaviour intact. Notice that 'tot_len' for MTU
check is done like XDP code-path, which checks against FIB-dst MTU.

V13:
- Only do ifindex lookup one time, calling dev_get_by_index_rcu().

V10:
- Use same method as XDP for 'tot_len' MTU check

Fixes: 4c79579b44b1 ("bpf: Change bpf_fib_lookup to return lookup status")
Reported-by: Carlo Carraro <colrack@gmail.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
---
 net/core/filter.c |   50 ++++++++++++++++++++++++++++----------------------
 1 file changed, 28 insertions(+), 22 deletions(-)

Comments

Daniel Borkmann Feb. 5, 2021, 12:06 a.m. UTC | #1
On 2/2/21 5:26 PM, Jesper Dangaard Brouer wrote:
> BPF end-user on Cilium slack-channel (Carlo Carraro) wants to use

> bpf_fib_lookup for doing MTU-check, but *prior* to extending packet size,

> by adjusting fib_params 'tot_len' with the packet length plus the expected

> encap size. (Just like the bpf_check_mtu helper supports). He discovered

> that for SKB ctx the param->tot_len was not used, instead skb->len was used

> (via MTU check in is_skb_forwardable() that checks against netdev MTU).

> 

> Fix this by using fib_params 'tot_len' for MTU check. If not provided (e.g.

> zero) then keep existing TC behaviour intact. Notice that 'tot_len' for MTU

> check is done like XDP code-path, which checks against FIB-dst MTU.

> 

> V13:

> - Only do ifindex lookup one time, calling dev_get_by_index_rcu().

> 

> V10:

> - Use same method as XDP for 'tot_len' MTU check

> 

> Fixes: 4c79579b44b1 ("bpf: Change bpf_fib_lookup to return lookup status")

> Reported-by: Carlo Carraro <colrack@gmail.com>

> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>

> Acked-by: John Fastabend <john.fastabend@gmail.com>

[...]

I was about to apply the series just now, but on a last double check there is
a subtle logic bug in here that still needs fixing unfortunately. :/ See below:

> @@ -5568,7 +5565,9 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,

>   	   struct bpf_fib_lookup *, params, int, plen, u32, flags)

>   {

>   	struct net *net = dev_net(skb->dev);

> +	struct net_device *dev;

>   	int rc = -EAFNOSUPPORT;

> +	bool check_mtu = false;

>   

>   	if (plen < sizeof(*params))

>   		return -EINVAL;

> @@ -5576,23 +5575,30 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,

>   	if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))

>   		return -EINVAL;

>   

> +	dev = dev_get_by_index_rcu(net, params->ifindex);

> +	if (unlikely(!dev))

> +		return -ENODEV;


Based on your earlier idea, you try to avoid refetching the dev this way, so
here it's being looked up via params->ifindex provided from the BPF prog ...

> +	if (params->tot_len)

> +		check_mtu = true;

> +

>   	switch (params->family) {

>   #if IS_ENABLED(CONFIG_INET)

>   	case AF_INET:

> -		rc = bpf_ipv4_fib_lookup(net, params, flags, false);

> +		rc = bpf_ipv4_fib_lookup(net, dev, params, flags, check_mtu);


... however, bpf_ipv{4,6}_fib_lookup() might change params->ifindex here to
indicate nexthop output dev:

[...]
         dev = nhc->nhc_dev;

         params->rt_metric = res.fi->fib_priority;
         params->ifindex = dev->ifindex;
[...]

>   		break;

>   #endif

>   #if IS_ENABLED(CONFIG_IPV6)

>   	case AF_INET6:

> -		rc = bpf_ipv6_fib_lookup(net, params, flags, false);

> +		rc = bpf_ipv6_fib_lookup(net, dev, params, flags, check_mtu);

>   		break;

>   #endif

>   	}

>   

> -	if (!rc) {

> -		struct net_device *dev;

> -

> -		dev = dev_get_by_index_rcu(net, params->ifindex);

> +	if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) {

> +		/* When tot_len isn't provided by user,

> +		 * check skb against net_device MTU

> +		 */

>   		if (!is_skb_forwardable(dev, skb))

>   			rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;


... so using old cached dev from above will result in wrong MTU check &
subsequent passing of wrong params->mtu_result = dev->mtu this way. So one
way to fix is that we would need to pass &dev to bpf_ipv{4,6}_fib_lookup().

>   	}

> 

> 


Thanks,
Daniel
Jesper Dangaard Brouer Feb. 8, 2021, 1:57 p.m. UTC | #2
On Fri, 5 Feb 2021 01:06:35 +0100
Daniel Borkmann <daniel@iogearbox.net> wrote:

> On 2/2/21 5:26 PM, Jesper Dangaard Brouer wrote:

> > BPF end-user on Cilium slack-channel (Carlo Carraro) wants to use

> > bpf_fib_lookup for doing MTU-check, but *prior* to extending packet size,

> > by adjusting fib_params 'tot_len' with the packet length plus the expected

> > encap size. (Just like the bpf_check_mtu helper supports). He discovered

> > that for SKB ctx the param->tot_len was not used, instead skb->len was used

> > (via MTU check in is_skb_forwardable() that checks against netdev MTU).

> > 

> > Fix this by using fib_params 'tot_len' for MTU check. If not provided (e.g.

> > zero) then keep existing TC behaviour intact. Notice that 'tot_len' for MTU

> > check is done like XDP code-path, which checks against FIB-dst MTU.

> > 

> > V13:

> > - Only do ifindex lookup one time, calling dev_get_by_index_rcu().

> > 

> > V10:

> > - Use same method as XDP for 'tot_len' MTU check

> > 

> > Fixes: 4c79579b44b1 ("bpf: Change bpf_fib_lookup to return lookup status")

> > Reported-by: Carlo Carraro <colrack@gmail.com>

> > Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>

> > Acked-by: John Fastabend <john.fastabend@gmail.com>  

> [...]

> 

> I was about to apply the series just now, but on a last double check there is

> a subtle logic bug in here that still needs fixing unfortunately. :/ See below:

> 

> > @@ -5568,7 +5565,9 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,

> >   	   struct bpf_fib_lookup *, params, int, plen, u32, flags)

> >   {

> >   	struct net *net = dev_net(skb->dev);

> > +	struct net_device *dev;

> >   	int rc = -EAFNOSUPPORT;

> > +	bool check_mtu = false;

> >   

> >   	if (plen < sizeof(*params))

> >   		return -EINVAL;

> > @@ -5576,23 +5575,30 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,

> >   	if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))

> >   		return -EINVAL;

> >   

> > +	dev = dev_get_by_index_rcu(net, params->ifindex);

> > +	if (unlikely(!dev))

> > +		return -ENODEV;  

> 

> Based on your earlier idea, you try to avoid refetching the dev this way, so

> here it's being looked up via params->ifindex provided from the BPF prog ...

> 

> > +	if (params->tot_len)

> > +		check_mtu = true;

> > +

> >   	switch (params->family) {

> >   #if IS_ENABLED(CONFIG_INET)

> >   	case AF_INET:

> > -		rc = bpf_ipv4_fib_lookup(net, params, flags, false);

> > +		rc = bpf_ipv4_fib_lookup(net, dev, params, flags, check_mtu);  

> 

> ... however, bpf_ipv{4,6}_fib_lookup() might change params->ifindex here to

> indicate nexthop output dev:

> 

> [...]

>          dev = nhc->nhc_dev;

> 

>          params->rt_metric = res.fi->fib_priority;

>          params->ifindex = dev->ifindex;

> [...]


I want to hear David Ahern, what cases does this cover?


> >   		break;

> >   #endif

> >   #if IS_ENABLED(CONFIG_IPV6)

> >   	case AF_INET6:

> > -		rc = bpf_ipv6_fib_lookup(net, params, flags, false);

> > +		rc = bpf_ipv6_fib_lookup(net, dev, params, flags, check_mtu);

> >   		break;

> >   #endif

> >   	}

> >   

> > -	if (!rc) {

> > -		struct net_device *dev;

> > -

> > -		dev = dev_get_by_index_rcu(net, params->ifindex);

> > +	if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) {

> > +		/* When tot_len isn't provided by user,

> > +		 * check skb against net_device MTU

> > +		 */

> >   		if (!is_skb_forwardable(dev, skb))

> >   			rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;  

> 

> ... so using old cached dev from above will result in wrong MTU check &

> subsequent passing of wrong params->mtu_result = dev->mtu this way. 


Yes, you are right, params->ifindex have a chance to change in the calls.
So, our attempt to save an ifindex lookup (dev_get_by_index_rcu) is not
correct.

> So one

> way to fix is that we would need to pass &dev to bpf_ipv{4,6}_fib_lookup().


Ok, I will try to code it up, and see how ugly it looks, but I'm no
longer sure that it is worth saving this ifindex lookup, as it will
only happen if BPF-prog didn't specify params->tot_len.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer
Jesper Dangaard Brouer Feb. 8, 2021, 3:20 p.m. UTC | #3
On Mon, 8 Feb 2021 14:57:13 +0100
Jesper Dangaard Brouer <brouer@redhat.com> wrote:

> On Fri, 5 Feb 2021 01:06:35 +0100

> Daniel Borkmann <daniel@iogearbox.net> wrote:

> 

> > On 2/2/21 5:26 PM, Jesper Dangaard Brouer wrote:  

> > > BPF end-user on Cilium slack-channel (Carlo Carraro) wants to use

> > > bpf_fib_lookup for doing MTU-check, but *prior* to extending packet size,

> > > by adjusting fib_params 'tot_len' with the packet length plus the expected

> > > encap size. (Just like the bpf_check_mtu helper supports). He discovered

> > > that for SKB ctx the param->tot_len was not used, instead skb->len was used

> > > (via MTU check in is_skb_forwardable() that checks against netdev MTU).

> > > 

> > > Fix this by using fib_params 'tot_len' for MTU check. If not provided (e.g.

> > > zero) then keep existing TC behaviour intact. Notice that 'tot_len' for MTU

> > > check is done like XDP code-path, which checks against FIB-dst MTU.

> > > 

> > > V13:

> > > - Only do ifindex lookup one time, calling dev_get_by_index_rcu().

> > > 

> > > V10:

> > > - Use same method as XDP for 'tot_len' MTU check

> > > 

> > > Fixes: 4c79579b44b1 ("bpf: Change bpf_fib_lookup to return lookup status")

> > > Reported-by: Carlo Carraro <colrack@gmail.com>

> > > Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>

> > > Acked-by: John Fastabend <john.fastabend@gmail.com>    

> > [...]

> > 

> > I was about to apply the series just now, but on a last double check there is

> > a subtle logic bug in here that still needs fixing unfortunately. :/ See below:

> >   

> > > @@ -5568,7 +5565,9 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,

> > >   	   struct bpf_fib_lookup *, params, int, plen, u32, flags)

> > >   {

> > >   	struct net *net = dev_net(skb->dev);

> > > +	struct net_device *dev;

> > >   	int rc = -EAFNOSUPPORT;

> > > +	bool check_mtu = false;

> > >   

> > >   	if (plen < sizeof(*params))

> > >   		return -EINVAL;

> > > @@ -5576,23 +5575,30 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,

> > >   	if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))

> > >   		return -EINVAL;

> > >   

> > > +	dev = dev_get_by_index_rcu(net, params->ifindex);

> > > +	if (unlikely(!dev))

> > > +		return -ENODEV;    

> > 

> > Based on your earlier idea, you try to avoid refetching the dev this way, so

> > here it's being looked up via params->ifindex provided from the BPF prog ...

> >   

> > > +	if (params->tot_len)

> > > +		check_mtu = true;

> > > +

> > >   	switch (params->family) {

> > >   #if IS_ENABLED(CONFIG_INET)

> > >   	case AF_INET:

> > > -		rc = bpf_ipv4_fib_lookup(net, params, flags, false);

> > > +		rc = bpf_ipv4_fib_lookup(net, dev, params, flags, check_mtu);    

> > 

> > ... however, bpf_ipv{4,6}_fib_lookup() might change params->ifindex here to

> > indicate nexthop output dev:

> > 

> > [...]

> >          dev = nhc->nhc_dev;

> > 

> >          params->rt_metric = res.fi->fib_priority;

> >          params->ifindex = dev->ifindex;

> > [...]  

> 

> I want to hear David Ahern, what cases does this cover?


Let me answer this myself (as it should have be obvious to myself).
The ifindex that is used as part of return param, is "set to the device
index of the nexthop from the FIB lookup" (as man-page says).  The
params->ifindex will *always* be updated, and will very likely be
another ifindex, as on "input" it is the ingress-device and on
"output" it will be egress-device (result of FIB lookup).

 
> > >   		break;

> > >   #endif

> > >   #if IS_ENABLED(CONFIG_IPV6)

> > >   	case AF_INET6:

> > > -		rc = bpf_ipv6_fib_lookup(net, params, flags, false);

> > > +		rc = bpf_ipv6_fib_lookup(net, dev, params, flags, check_mtu);

> > >   		break;

> > >   #endif

> > >   	}

> > >   

> > > -	if (!rc) {

> > > -		struct net_device *dev;

> > > -

> > > -		dev = dev_get_by_index_rcu(net, params->ifindex);

> > > +	if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) {

> > > +		/* When tot_len isn't provided by user,

> > > +		 * check skb against net_device MTU

> > > +		 */

> > >   		if (!is_skb_forwardable(dev, skb))

> > >   			rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;    

> > 

> > ... so using old cached dev from above will result in wrong MTU check &

> > subsequent passing of wrong params->mtu_result = dev->mtu this way.   

> 

> Yes, you are right, params->ifindex have a chance to change in the calls.

> So, our attempt to save an ifindex lookup (dev_get_by_index_rcu) is not

> correct.

> 

> > So one

> > way to fix is that we would need to pass &dev to bpf_ipv{4,6}_fib_lookup().  

> 

> Ok, I will try to code it up, and see how ugly it looks, but I'm no

> longer sure that it is worth saving this ifindex lookup, as it will

> only happen if BPF-prog didn't specify params->tot_len.


I guess we can still do this as an "optimization", but the dev/ifindex
will very likely be another at this point.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer
Daniel Borkmann Feb. 8, 2021, 3:41 p.m. UTC | #4
On 2/8/21 4:20 PM, Jesper Dangaard Brouer wrote:
> On Mon, 8 Feb 2021 14:57:13 +0100

> Jesper Dangaard Brouer <brouer@redhat.com> wrote:

>> On Fri, 5 Feb 2021 01:06:35 +0100

>> Daniel Borkmann <daniel@iogearbox.net> wrote:

>>> On 2/2/21 5:26 PM, Jesper Dangaard Brouer wrote:

>>>> BPF end-user on Cilium slack-channel (Carlo Carraro) wants to use

>>>> bpf_fib_lookup for doing MTU-check, but *prior* to extending packet size,

>>>> by adjusting fib_params 'tot_len' with the packet length plus the expected

>>>> encap size. (Just like the bpf_check_mtu helper supports). He discovered

>>>> that for SKB ctx the param->tot_len was not used, instead skb->len was used

>>>> (via MTU check in is_skb_forwardable() that checks against netdev MTU).

>>>>

>>>> Fix this by using fib_params 'tot_len' for MTU check. If not provided (e.g.

>>>> zero) then keep existing TC behaviour intact. Notice that 'tot_len' for MTU

>>>> check is done like XDP code-path, which checks against FIB-dst MTU.

[...]
>>>> -	if (!rc) {

>>>> -		struct net_device *dev;

>>>> -

>>>> -		dev = dev_get_by_index_rcu(net, params->ifindex);

>>>> +	if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) {

>>>> +		/* When tot_len isn't provided by user,

>>>> +		 * check skb against net_device MTU

>>>> +		 */

>>>>    		if (!is_skb_forwardable(dev, skb))

>>>>    			rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;

>>>

>>> ... so using old cached dev from above will result in wrong MTU check &

>>> subsequent passing of wrong params->mtu_result = dev->mtu this way.

>>

>> Yes, you are right, params->ifindex have a chance to change in the calls.

>> So, our attempt to save an ifindex lookup (dev_get_by_index_rcu) is not

>> correct.

>>

>>> So one

>>> way to fix is that we would need to pass &dev to bpf_ipv{4,6}_fib_lookup().

>>

>> Ok, I will try to code it up, and see how ugly it looks, but I'm no

>> longer sure that it is worth saving this ifindex lookup, as it will

>> only happen if BPF-prog didn't specify params->tot_len.

> 

> I guess we can still do this as an "optimization", but the dev/ifindex

> will very likely be another at this point.


I would say for sake of progress, lets ship your series w/o this optimization so
it can land, and we revisit this later on independent from here. Actually DavidA
back then acked the old poc patch I posted, so maybe that's worth a revisit as
well but needs more testing first.

Thanks,
Daniel
Jesper Dangaard Brouer Feb. 8, 2021, 4:27 p.m. UTC | #5
On Mon, 8 Feb 2021 16:41:24 +0100
Daniel Borkmann <daniel@iogearbox.net> wrote:

> On 2/8/21 4:20 PM, Jesper Dangaard Brouer wrote:

> > On Mon, 8 Feb 2021 14:57:13 +0100

> > Jesper Dangaard Brouer <brouer@redhat.com> wrote:  

> >> On Fri, 5 Feb 2021 01:06:35 +0100

> >> Daniel Borkmann <daniel@iogearbox.net> wrote:  

> >>> On 2/2/21 5:26 PM, Jesper Dangaard Brouer wrote:  

> >>>> BPF end-user on Cilium slack-channel (Carlo Carraro) wants to use

> >>>> bpf_fib_lookup for doing MTU-check, but *prior* to extending packet size,

> >>>> by adjusting fib_params 'tot_len' with the packet length plus the expected

> >>>> encap size. (Just like the bpf_check_mtu helper supports). He discovered

> >>>> that for SKB ctx the param->tot_len was not used, instead skb->len was used

> >>>> (via MTU check in is_skb_forwardable() that checks against netdev MTU).

> >>>>

> >>>> Fix this by using fib_params 'tot_len' for MTU check. If not provided (e.g.

> >>>> zero) then keep existing TC behaviour intact. Notice that 'tot_len' for MTU

> >>>> check is done like XDP code-path, which checks against FIB-dst MTU.  

> [...]

> >>>> -	if (!rc) {

> >>>> -		struct net_device *dev;

> >>>> -

> >>>> -		dev = dev_get_by_index_rcu(net, params->ifindex);

> >>>> +	if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) {

> >>>> +		/* When tot_len isn't provided by user,

> >>>> +		 * check skb against net_device MTU

> >>>> +		 */

> >>>>    		if (!is_skb_forwardable(dev, skb))

> >>>>    			rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;  

> >>>

> >>> ... so using old cached dev from above will result in wrong MTU check &

> >>> subsequent passing of wrong params->mtu_result = dev->mtu this way.  

> >>

> >> Yes, you are right, params->ifindex have a chance to change in the calls.

> >> So, our attempt to save an ifindex lookup (dev_get_by_index_rcu) is not

> >> correct.

> >>  

> >>> So one

> >>> way to fix is that we would need to pass &dev to bpf_ipv{4,6}_fib_lookup().  

> >>

> >> Ok, I will try to code it up, and see how ugly it looks, but I'm no

> >> longer sure that it is worth saving this ifindex lookup, as it will

> >> only happen if BPF-prog didn't specify params->tot_len.  

> > 

> > I guess we can still do this as an "optimization", but the dev/ifindex

> > will very likely be another at this point.  

> 

> I would say for sake of progress, lets ship your series w/o this optimization so

> it can land, and we revisit this later on independent from here. 


I would really really like to make progress for this patchset.  I
unfortunately finished coding this up (and tested with selftests)
before I noticed this request (without optimizations).

I guess, I can revert my recent work by pulling in V12 of the patch.
I'll do it tomorrow, as I want to have time to run my tests before
re-sending patchset.

> Actually DavidA back then acked the old poc patch I posted, so maybe

> that's worth a revisit as well but needs more testing first.


Yes, we can always revisit this as an optimization.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer
Daniel Borkmann Feb. 8, 2021, 4:46 p.m. UTC | #6
On 2/8/21 5:27 PM, Jesper Dangaard Brouer wrote:
> On Mon, 8 Feb 2021 16:41:24 +0100

> Daniel Borkmann <daniel@iogearbox.net> wrote:

>> On 2/8/21 4:20 PM, Jesper Dangaard Brouer wrote:

>>> On Mon, 8 Feb 2021 14:57:13 +0100

>>> Jesper Dangaard Brouer <brouer@redhat.com> wrote:

>>>> On Fri, 5 Feb 2021 01:06:35 +0100

>>>> Daniel Borkmann <daniel@iogearbox.net> wrote:

>>>>> On 2/2/21 5:26 PM, Jesper Dangaard Brouer wrote:

>>>>>> BPF end-user on Cilium slack-channel (Carlo Carraro) wants to use

>>>>>> bpf_fib_lookup for doing MTU-check, but *prior* to extending packet size,

>>>>>> by adjusting fib_params 'tot_len' with the packet length plus the expected

>>>>>> encap size. (Just like the bpf_check_mtu helper supports). He discovered

>>>>>> that for SKB ctx the param->tot_len was not used, instead skb->len was used

>>>>>> (via MTU check in is_skb_forwardable() that checks against netdev MTU).

>>>>>>

>>>>>> Fix this by using fib_params 'tot_len' for MTU check. If not provided (e.g.

>>>>>> zero) then keep existing TC behaviour intact. Notice that 'tot_len' for MTU

>>>>>> check is done like XDP code-path, which checks against FIB-dst MTU.

>> [...]

>>>>>> -	if (!rc) {

>>>>>> -		struct net_device *dev;

>>>>>> -

>>>>>> -		dev = dev_get_by_index_rcu(net, params->ifindex);

>>>>>> +	if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) {

>>>>>> +		/* When tot_len isn't provided by user,

>>>>>> +		 * check skb against net_device MTU

>>>>>> +		 */

>>>>>>     		if (!is_skb_forwardable(dev, skb))

>>>>>>     			rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;

>>>>>

>>>>> ... so using old cached dev from above will result in wrong MTU check &

>>>>> subsequent passing of wrong params->mtu_result = dev->mtu this way.

>>>>

>>>> Yes, you are right, params->ifindex have a chance to change in the calls.

>>>> So, our attempt to save an ifindex lookup (dev_get_by_index_rcu) is not

>>>> correct.

>>>>   

>>>>> So one

>>>>> way to fix is that we would need to pass &dev to bpf_ipv{4,6}_fib_lookup().

>>>>

>>>> Ok, I will try to code it up, and see how ugly it looks, but I'm no

>>>> longer sure that it is worth saving this ifindex lookup, as it will

>>>> only happen if BPF-prog didn't specify params->tot_len.

>>>

>>> I guess we can still do this as an "optimization", but the dev/ifindex

>>> will very likely be another at this point.

>>

>> I would say for sake of progress, lets ship your series w/o this optimization so

>> it can land, and we revisit this later on independent from here.

> 

> I would really really like to make progress for this patchset.  I

> unfortunately finished coding this up (and tested with selftests)

> before I noticed this request (without optimizations).

> 

> I guess, I can revert my recent work by pulling in V12 of the patch.

> I'll do it tomorrow, as I want to have time to run my tests before

> re-sending patchset.


I'm okay either way - it was just a suggestion and not a request, of course.
If you already have it ready in the meantime, that is also not a problem, no
need to revert again.

Thanks,
Daniel
diff mbox series

Patch

diff --git a/net/core/filter.c b/net/core/filter.c
index 5beadd659091..252fbb294001 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5301,22 +5301,18 @@  static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
 #endif
 
 #if IS_ENABLED(CONFIG_INET)
-static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
+static int bpf_ipv4_fib_lookup(struct net *net, struct net_device *dev,
+			       struct bpf_fib_lookup *params,
 			       u32 flags, bool check_mtu)
 {
 	struct fib_nh_common *nhc;
 	struct in_device *in_dev;
 	struct neighbour *neigh;
-	struct net_device *dev;
 	struct fib_result res;
 	struct flowi4 fl4;
 	int err;
 	u32 mtu;
 
-	dev = dev_get_by_index_rcu(net, params->ifindex);
-	if (unlikely(!dev))
-		return -ENODEV;
-
 	/* verify forwarding is enabled on this interface */
 	in_dev = __in_dev_get_rcu(dev);
 	if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
@@ -5418,14 +5414,14 @@  static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 #endif
 
 #if IS_ENABLED(CONFIG_IPV6)
-static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
+static int bpf_ipv6_fib_lookup(struct net *net, struct net_device *dev,
+			       struct bpf_fib_lookup *params,
 			       u32 flags, bool check_mtu)
 {
 	struct in6_addr *src = (struct in6_addr *) params->ipv6_src;
 	struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst;
 	struct fib6_result res = {};
 	struct neighbour *neigh;
-	struct net_device *dev;
 	struct inet6_dev *idev;
 	struct flowi6 fl6;
 	int strict = 0;
@@ -5436,10 +5432,6 @@  static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	if (rt6_need_strict(dst) || rt6_need_strict(src))
 		return BPF_FIB_LKUP_RET_NOT_FWDED;
 
-	dev = dev_get_by_index_rcu(net, params->ifindex);
-	if (unlikely(!dev))
-		return -ENODEV;
-
 	idev = __in6_dev_get_safely(dev);
 	if (unlikely(!idev || !idev->cnf.forwarding))
 		return BPF_FIB_LKUP_RET_FWD_DISABLED;
@@ -5533,22 +5525,27 @@  static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
 	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
 {
+	struct net *net = dev_net(ctx->rxq->dev);
+	struct net_device *dev;
+
 	if (plen < sizeof(*params))
 		return -EINVAL;
 
 	if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
 		return -EINVAL;
 
+	dev = dev_get_by_index_rcu(net, params->ifindex);
+	if (unlikely(!dev))
+		return -ENODEV;
+
 	switch (params->family) {
 #if IS_ENABLED(CONFIG_INET)
 	case AF_INET:
-		return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params,
-					   flags, true);
+		return bpf_ipv4_fib_lookup(net, dev, params, flags, true);
 #endif
 #if IS_ENABLED(CONFIG_IPV6)
 	case AF_INET6:
-		return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params,
-					   flags, true);
+		return bpf_ipv6_fib_lookup(net, dev, params, flags, true);
 #endif
 	}
 	return -EAFNOSUPPORT;
@@ -5568,7 +5565,9 @@  BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
 	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
 {
 	struct net *net = dev_net(skb->dev);
+	struct net_device *dev;
 	int rc = -EAFNOSUPPORT;
+	bool check_mtu = false;
 
 	if (plen < sizeof(*params))
 		return -EINVAL;
@@ -5576,23 +5575,30 @@  BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
 	if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
 		return -EINVAL;
 
+	dev = dev_get_by_index_rcu(net, params->ifindex);
+	if (unlikely(!dev))
+		return -ENODEV;
+
+	if (params->tot_len)
+		check_mtu = true;
+
 	switch (params->family) {
 #if IS_ENABLED(CONFIG_INET)
 	case AF_INET:
-		rc = bpf_ipv4_fib_lookup(net, params, flags, false);
+		rc = bpf_ipv4_fib_lookup(net, dev, params, flags, check_mtu);
 		break;
 #endif
 #if IS_ENABLED(CONFIG_IPV6)
 	case AF_INET6:
-		rc = bpf_ipv6_fib_lookup(net, params, flags, false);
+		rc = bpf_ipv6_fib_lookup(net, dev, params, flags, check_mtu);
 		break;
 #endif
 	}
 
-	if (!rc) {
-		struct net_device *dev;
-
-		dev = dev_get_by_index_rcu(net, params->ifindex);
+	if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) {
+		/* When tot_len isn't provided by user,
+		 * check skb against net_device MTU
+		 */
 		if (!is_skb_forwardable(dev, skb))
 			rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;
 	}