diff mbox series

[net,1/2] devlink: Hold rtnl lock while reading netdev attributes

Message ID 20201122061257.60425-2-parav@nvidia.com
State New
Headers show
Series [net,1/2] devlink: Hold rtnl lock while reading netdev attributes | expand

Commit Message

Parav Pandit Nov. 22, 2020, 6:12 a.m. UTC
A netdevice of a devlink port can be moved to different
net namespace than its parent devlink instance.
This scenario occurs when devlink reload is not used for
maintaining backward compatibility.

When netdevice is undergoing migration to net namespace,
its ifindex and name may change.

In such use case, devlink port query may read stale netdev
attributes.

Fix it by reading them under rtnl lock.

Fixes: bfcd3a466172 ("Introduce devlink infrastructure")
Signed-off-by: Parav Pandit <parav@nvidia.com>
---
 net/core/devlink.c | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

Comments

Jakub Kicinski Nov. 24, 2020, 10:29 p.m. UTC | #1
On Sun, 22 Nov 2020 08:12:56 +0200 Parav Pandit wrote:
> A netdevice of a devlink port can be moved to different

> net namespace than its parent devlink instance.

> This scenario occurs when devlink reload is not used for

> maintaining backward compatibility.

> 

> When netdevice is undergoing migration to net namespace,

> its ifindex and name may change.

> 

> In such use case, devlink port query may read stale netdev

> attributes.

> 

> Fix it by reading them under rtnl lock.

> 

> Fixes: bfcd3a466172 ("Introduce devlink infrastructure")

> Signed-off-by: Parav Pandit <parav@nvidia.com>

> ---

>  net/core/devlink.c | 30 ++++++++++++++++++++++++------

>  1 file changed, 24 insertions(+), 6 deletions(-)

> 

> diff --git a/net/core/devlink.c b/net/core/devlink.c

> index acc29d5157f4..6135ef5972ce 100644

> --- a/net/core/devlink.c

> +++ b/net/core/devlink.c

> @@ -775,6 +775,23 @@ devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *por

>  	return err;

>  }

>  

> +static int devlink_nl_port_netdev_fill(struct sk_buff *msg, struct devlink_port *devlink_port)

> +{

> +	struct net_device *netdev = devlink_port->type_dev;

> +	int err;

> +

> +	ASSERT_RTNL();

> +	if (!netdev)

> +		return 0;

> +

> +	err = nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX, netdev->ifindex);


The line wrapping was correct, please keep it under 80. Please tell
your colleagues at Mellanox.

> +	if (err)

> +		goto done;


	return err;

> +	err = nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME, netdev->name);


	return nla_put_...

> +done:

> +	return err;

> +}

> +

>  static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,

>  				struct devlink_port *devlink_port,

>  				enum devlink_command cmd, u32 portid,

> @@ -792,6 +809,8 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,

>  	if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))

>  		goto nla_put_failure;

>  

> +	/* Hold rtnl lock while accessing port's netdev attributes. */

> +	rtnl_lock();

>  	spin_lock_bh(&devlink_port->type_lock);

>  	if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type))

>  		goto nla_put_failure_type_locked;

> @@ -800,13 +819,10 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,

>  			devlink_port->desired_type))

>  		goto nla_put_failure_type_locked;

>  	if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) {

> -		struct net_device *netdev = devlink_port->type_dev;

> +		int err;


What's the point of this local variable?

> -		if (netdev &&

> -		    (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX,

> -				 netdev->ifindex) ||

> -		     nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME,

> -				    netdev->name)))

> +		err = devlink_nl_port_netdev_fill(msg, devlink_port);

> +		if (err)


just put the call in the if ()

>  			goto nla_put_failure_type_locked;

>  	}

>  	if (devlink_port->type == DEVLINK_PORT_TYPE_IB) {



Honestly this patch is doing too much for a fix.

All you need is the RTNL lock and then add:

+               struct net *net = devlink_net(devlink_port->devlink);
                struct net_device *netdev = devlink_port->type_dev;
 
                if (netdev &&
+                   net_eq(net, dev_net(netdev)) &&
                    (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX,
                                 netdev->ifindex) ||
                     nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME,


You can do refactoring later in net-next. Maybe even add a check that
drivers which support reload set namespace local on their netdevs.
Parav Pandit Nov. 25, 2020, 7:13 a.m. UTC | #2
> From: Jakub Kicinski <kuba@kernel.org>

> Sent: Wednesday, November 25, 2020 3:59 AM

> 

> On Sun, 22 Nov 2020 08:12:56 +0200 Parav Pandit wrote:

> > A netdevice of a devlink port can be moved to different net namespace

> > than its parent devlink instance.

> > This scenario occurs when devlink reload is not used for maintaining

> > backward compatibility.

> >

> > When netdevice is undergoing migration to net namespace, its ifindex

> > and name may change.

> >

> > In such use case, devlink port query may read stale netdev attributes.

> >

> > Fix it by reading them under rtnl lock.

> >

> > Fixes: bfcd3a466172 ("Introduce devlink infrastructure")

> > Signed-off-by: Parav Pandit <parav@nvidia.com>

> > ---

> >  net/core/devlink.c | 30 ++++++++++++++++++++++++------

> >  1 file changed, 24 insertions(+), 6 deletions(-)

> >

> > diff --git a/net/core/devlink.c b/net/core/devlink.c index

> > acc29d5157f4..6135ef5972ce 100644

> > --- a/net/core/devlink.c

> > +++ b/net/core/devlink.c

> > @@ -775,6 +775,23 @@ devlink_nl_port_function_attrs_put(struct sk_buff

> *msg, struct devlink_port *por

> >  	return err;

> >  }

> >

> > +static int devlink_nl_port_netdev_fill(struct sk_buff *msg, struct

> > +devlink_port *devlink_port) {

> > +	struct net_device *netdev = devlink_port->type_dev;

> > +	int err;

> > +

> > +	ASSERT_RTNL();

> > +	if (!netdev)

> > +		return 0;

> > +

> > +	err = nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX,

> > +netdev->ifindex);

> 

> The line wrapping was correct, please keep in under 80. Please tell your colleges

> at Mellanox.

> 

> > +	if (err)

> > +		goto done;

> 

> 	return err;

> 

> > +	err = nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME,

> > +netdev->name);

> 

> 	return nla_put_...

> 

> > +done:

> > +	return err;

> > +}

> > +

> >  static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,

> >  				struct devlink_port *devlink_port,

> >  				enum devlink_command cmd, u32 portid, @@ -

> 792,6 +809,8 @@ static

> > int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,

> >  	if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port-

> >index))

> >  		goto nla_put_failure;

> >

> > +	/* Hold rtnl lock while accessing port's netdev attributes. */

> > +	rtnl_lock();

> >  	spin_lock_bh(&devlink_port->type_lock);

> >  	if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type))

> >  		goto nla_put_failure_type_locked;

> > @@ -800,13 +819,10 @@ static int devlink_nl_port_fill(struct sk_buff *msg,

> struct devlink *devlink,

> >  			devlink_port->desired_type))

> >  		goto nla_put_failure_type_locked;

> >  	if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) {

> > -		struct net_device *netdev = devlink_port->type_dev;

> > +		int err;

> 

> What's the point of this local variable?

> 

I will avoid the refactor for now, so the above comment doesn't need to be addressed.
> > -		if (netdev &&

> > -		    (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX,

> > -				 netdev->ifindex) ||

> > -		     nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME,

> > -				    netdev->name)))

> > +		err = devlink_nl_port_netdev_fill(msg, devlink_port);

> > +		if (err)

> 

> just put the call in the if ()

Ok.
> 

> >  			goto nla_put_failure_type_locked;

> >  	}

> >  	if (devlink_port->type == DEVLINK_PORT_TYPE_IB) {

> 

> 

> Honestly this patch is doing too much for a fix.

> 

> All you need is the RTNL lock and then add:

> 

Ok. I will defer the refactor to a later point.

> +               struct net *net = devlink_net(devlink_port->devlink);

>                 struct net_device *netdev = devlink_port->type_dev;

> 

>                 if (netdev &&

> +                   net_eq(net, dev_net(netdev)) &&

>                     (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX,

>                                  netdev->ifindex) ||

>                      nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME,

> 

> 

> You can do refactoring later in net-next. Maybe even add a check that drivers

> which support reload set namespace local on their netdevs.

This will break backward compatibility, as orchestration for VFs is not using devlink reload, which has been supported only very recently.
But yes, for SFs, which don't have a backward compatibility issue, as soon as the initial series is merged, I will mark it as local, so that orchestration doesn't start on the wrong foot.
Jakub Kicinski Nov. 25, 2020, 4:30 p.m. UTC | #3
On Wed, 25 Nov 2020 07:13:40 +0000 Parav Pandit wrote:
> > Maybe even add a check that drivers

> > which support reload set namespace local on their netdevs.  

> This will break the backward compatibility as orchestration for VFs

> are not using devlink reload, which is supported very recently. But

> yes, for SF who doesn't have backward compatibility issue, as soon as

> initial series is merged, I will mark it as local, so that

> orchestration doesn't start on wrong foot.


Ah, right, that will not work because of the shenanigans you guys play
with the uplink port. If all reprs are NETNS_LOCAL it'd not be an issue.
Parav Pandit Nov. 25, 2020, 5:21 p.m. UTC | #4
> From: Jakub Kicinski <kuba@kernel.org>

> Sent: Wednesday, November 25, 2020 10:00 PM

> 

> On Wed, 25 Nov 2020 07:13:40 +0000 Parav Pandit wrote:

> > > Maybe even add a check that drivers

> > > which support reload set namespace local on their netdevs.

> > This will break the backward compatibility as orchestration for VFs

> > are not using devlink reload, which is supported very recently. But

> > yes, for SF who doesn't have backward compatibility issue, as soon as

> > initial series is merged, I will mark it as local, so that

> > orchestration doesn't start on wrong foot.

> 

> Ah, right, that will not work because of the shenanigans you guys play with

> the uplink port. If all reprs are NETNS_LOCAL it'd not be an issue.

I am not sure what secret you are talking about with uplink.
I am talking about the SF netdevice having NETNS_LOCAL, not the SF rep.
The SF rep has NETNS_LOCAL set anyway.

I do not follow your comment - 'that will not work'. Can you please explain?
Do you mean I should take care that the SF's netdevice has NETNS_LOCAL in the first patchset, or do you mean setting NETNS_LOCAL for the VF's netdev will not work?
If it's the latter, sure, it will break backward compatibility, so I will not do it by default.
But yes, for SF I want to do it subsequently.
Jakub Kicinski Nov. 25, 2020, 5:41 p.m. UTC | #5
On Wed, 25 Nov 2020 17:21:41 +0000 Parav Pandit wrote:
> > From: Jakub Kicinski <kuba@kernel.org>

> > Sent: Wednesday, November 25, 2020 10:00 PM

> > 

> > On Wed, 25 Nov 2020 07:13:40 +0000 Parav Pandit wrote:  

> > > > Maybe even add a check that drivers

> > > > which support reload set namespace local on their netdevs.  

> > > This will break the backward compatibility as orchestration for VFs

> > > are not using devlink reload, which is supported very recently. But

> > > yes, for SF who doesn't have backward compatibility issue, as soon as

> > > initial series is merged, I will mark it as local, so that

> > > orchestration doesn't start on wrong foot.  

> > 

> > Ah, right, that will not work because of the shenanigans you guys play with

> > the uplink port. If all reprs are NETNS_LOCAL it'd not be an issue.  

> I am not sure what secret are you talking about with uplink.


I'm referring to Mellanox conflating PF with uplink. It's not a secret,
we argued about it in the past.

> I am taking about the SF netdevice to have the NETNS_LOCAL not the SF rep.

> SF rep anyway has NETNS_LOCAL set.


All reps built by mlx5e_build_rep_netdev() have NETNS_LOCAL.

> I do not follow your comment - 'that will not work'. Can you please explain?


My half-baked suggestion was to basically add a:

	WARN_ON(ops->reload_down && ops->reload_up &&
		!(netdev->priv & NETIF_F_NETNS_LOCAL));

to devlink_port_type_netdev_checks(), given that if a device has a reload
callback, devlink is the way to change netns. But yeah, we can't break
existing behavior, so your uplink has to be movable and can't have
NETNS_LOCAL. IOW adding the WARN_ON() won't work.

Hope this is crystal clear now.

> Do you mean I should take care for SF's netdevice to have NETNS_LOCAL in first patchset or you mean setting NETNS_LOCAL for VF's Netdev will not work?

> If its later, sure it will break the backward compatibility, so will not do as default.
Parav Pandit Nov. 25, 2020, 6:17 p.m. UTC | #6
> From: Jakub Kicinski <kuba@kernel.org>

> Sent: Wednesday, November 25, 2020 11:12 PM

> 

> On Wed, 25 Nov 2020 17:21:41 +0000 Parav Pandit wrote:

> > > From: Jakub Kicinski <kuba@kernel.org>

> > > Sent: Wednesday, November 25, 2020 10:00 PM

> > >

> > > On Wed, 25 Nov 2020 07:13:40 +0000 Parav Pandit wrote:

> > > > > Maybe even add a check that drivers which support reload set

> > > > > namespace local on their netdevs.

> > > > This will break the backward compatibility as orchestration for

> > > > VFs are not using devlink reload, which is supported very

> > > > recently. But yes, for SF who doesn't have backward compatibility

> > > > issue, as soon as initial series is merged, I will mark it as

> > > > local, so that orchestration doesn't start on wrong foot.

> > >

> > > Ah, right, that will not work because of the shenanigans you guys

> > > play with the uplink port. If all reprs are NETNS_LOCAL it'd not be an

> issue.

> > I am not sure what secret are you talking about with uplink.

> 

> I'm referring to Mellanox conflating PF with uplink. It's not a secret,

Ok.
> we argued about it in the past.

> 

> > I am taking about the SF netdevice to have the NETNS_LOCAL not the SF

> rep.

> > SF rep anyway has NETNS_LOCAL set.

> 

> All reps build by mlx5e_build_rep_netdev() have NETNS_LOCAL.

> 

Yes. This is clear to me and we are good here. 😊

> > I do not follow your comment - 'that will not work'. Can you please explain?

> 

> My half-baked suggestion was to basically add a:

> 

> 	WARN_ON(ops->reload_down && ops->reload_up &&

> 		!(netdev->priv & NETIF_F_NETNS_LOCAL));

> 

> to devlink_port_type_netdev_checks(). Given if device has a reload

> callback devlink is the way to change netns. But yeah, we can't break

> existing behavior so your uplink has to be movable and can't have

> NETNS_LOCAL. IOW adding the WARN_ON() won't work.

> 

Right.

> Hope this is crystal clear now.

Yes, it's clear. Thanks.

I addressed your comments and cut both fixes down to a mere 7-line change. Sent v2.
diff mbox series

Patch

diff --git a/net/core/devlink.c b/net/core/devlink.c
index acc29d5157f4..6135ef5972ce 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -775,6 +775,23 @@  devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *por
 	return err;
 }
 
+static int devlink_nl_port_netdev_fill(struct sk_buff *msg, struct devlink_port *devlink_port)
+{
+	struct net_device *netdev = devlink_port->type_dev;
+	int err;
+
+	ASSERT_RTNL();
+	if (!netdev)
+		return 0;
+
+	err = nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX, netdev->ifindex);
+	if (err)
+		goto done;
+	err = nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME, netdev->name);
+done:
+	return err;
+}
+
 static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
 				struct devlink_port *devlink_port,
 				enum devlink_command cmd, u32 portid,
@@ -792,6 +809,8 @@  static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
 	if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
 		goto nla_put_failure;
 
+	/* Hold rtnl lock while accessing port's netdev attributes. */
+	rtnl_lock();
 	spin_lock_bh(&devlink_port->type_lock);
 	if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type))
 		goto nla_put_failure_type_locked;
@@ -800,13 +819,10 @@  static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
 			devlink_port->desired_type))
 		goto nla_put_failure_type_locked;
 	if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) {
-		struct net_device *netdev = devlink_port->type_dev;
+		int err;
 
-		if (netdev &&
-		    (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX,
-				 netdev->ifindex) ||
-		     nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME,
-				    netdev->name)))
+		err = devlink_nl_port_netdev_fill(msg, devlink_port);
+		if (err)
 			goto nla_put_failure_type_locked;
 	}
 	if (devlink_port->type == DEVLINK_PORT_TYPE_IB) {
@@ -818,6 +834,7 @@  static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
 			goto nla_put_failure_type_locked;
 	}
 	spin_unlock_bh(&devlink_port->type_lock);
+	rtnl_unlock();
 	if (devlink_nl_port_attrs_put(msg, devlink_port))
 		goto nla_put_failure;
 	if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack))
@@ -828,6 +845,7 @@  static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
 
 nla_put_failure_type_locked:
 	spin_unlock_bh(&devlink_port->type_lock);
+	rtnl_unlock();
 nla_put_failure:
 	genlmsg_cancel(msg, hdr);
 	return -EMSGSIZE;