diff mbox series

[RFC,1/3] veth: implement support for set_channel ethtool op

Message ID 681c32be3a9172e9468893a89fb928b46c5c5ee6.1625823139.git.pabeni@redhat.com
State New
Headers show
Series veth: more flexible channels number configuration | expand

Commit Message

Paolo Abeni July 9, 2021, 9:39 a.m. UTC
This change implements the set_channel() ethtool operation,
preserving the current defaults values and allowing up set
the number of queues in the range set ad device creation
time.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/veth.c | 62 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 58 insertions(+), 4 deletions(-)

Comments

Paolo Abeni July 9, 2021, 10:49 a.m. UTC | #1
Hello,

On Fri, 2021-07-09 at 12:15 +0200, Toke Høiland-Jørgensen wrote:
> > @@ -224,10 +226,49 @@ static void veth_get_channels(struct net_device *dev,
> >  {
> >  	channels->tx_count = dev->real_num_tx_queues;
> >  	channels->rx_count = dev->real_num_rx_queues;
> > -	channels->max_tx = dev->real_num_tx_queues;
> > -	channels->max_rx = dev->real_num_rx_queues;
> > +	channels->max_tx = dev->num_tx_queues;
> > +	channels->max_rx = dev->num_rx_queues;
> >  	channels->combined_count = min(dev->real_num_rx_queues, dev->real_num_tx_queues);
> > -	channels->max_combined = min(dev->real_num_rx_queues, dev->real_num_tx_queues);
> > +	channels->max_combined = min(dev->num_rx_queues, dev->num_tx_queues);
> > +}
> > +
> > +static int veth_close(struct net_device *dev);
> > +static int veth_open(struct net_device *dev);
> > +
> > +static int veth_set_channels(struct net_device *dev,
> > +			     struct ethtool_channels *ch)
> > +{
> > +	struct veth_priv *priv = netdev_priv(dev);
> > +	struct veth_priv *peer_priv;
> > +
> > +	/* accept changes only on rx/tx */
> > +	if (ch->combined_count != min(dev->real_num_rx_queues, dev->real_num_tx_queues))
> > +		return -EINVAL;
> > +
> > +	/* respect contraint posed at device creation time */
> > +	if (ch->rx_count > dev->num_rx_queues || ch->tx_count > dev->num_tx_queues)
> > +		return -EINVAL;
> > +
> > +	if (!ch->rx_count || !ch->tx_count)
> > +		return -EINVAL;
> > +
> > +	/* avoid braking XDP, if that is enabled */
> > +	if (priv->_xdp_prog && ch->rx_count < priv->peer->real_num_tx_queues)
> > +		return -EINVAL;
> > +
> > +	peer_priv = netdev_priv(priv->peer);
> > +	if (peer_priv->_xdp_prog && ch->tx_count > priv->peer->real_num_rx_queues)
> > +		return -EINVAL;
> 
> An XDP program could be loaded later, so I think it's better to enforce
> this constraint unconditionally.

The relevant check is already present in veth_xdp_set(), the load will
be rejected with an appropriate extack message.

I enforcing the above contraint uncontiditionally will make impossible
changing the number of real queues at runtime, as we must update dev
and peer in different moments.

> (What happens on the regular xmit path if the number of TX queues is out
> of sync with the peer RX queues, BTW?)

if tx < rx, the higly number of rx queue will not be used, if rx < tx,
XDP/gro can't take place: the normal veth path is used.

> > +	if (netif_running(dev))
> > +		veth_close(dev);
> > +
> > +	priv->num_tx_queues = ch->tx_count;
> > +	priv->num_rx_queues = ch->rx_count;
> 
> Why can't just you use netif_set_real_num_*_queues() here directly (and
> get rid of the priv members as above)?

Uhm... I haven't thought about it. Let me try ;)

Thanks!

/P
Toke Høiland-Jørgensen July 9, 2021, 3:23 p.m. UTC | #2
Paolo Abeni <pabeni@redhat.com> writes:

> On Fri, 2021-07-09 at 12:49 +0200, Paolo Abeni wrote:
>> On Fri, 2021-07-09 at 12:15 +0200, Toke Høiland-Jørgensen wrote:
>> > > +	if (netif_running(dev))
>> > > +		veth_close(dev);
>> > > +
>> > > +	priv->num_tx_queues = ch->tx_count;
>> > > +	priv->num_rx_queues = ch->rx_count;
>> > 
>> > Why can't just you use netif_set_real_num_*_queues() here directly (and
>> > get rid of the priv members as above)?
>> 
>> Uhm... I haven't thought about it. Let me try ;)
>
> Here there is a possible problem: if the latter
> netif_set_real_num_*_queues() fails, we should not change the current
> configuration, so we should revert the
> first netif_set_real_num_*_queues() change.
>
> Even that additional revert operation could fail. If/when that happen
> set_channel() will leave the device in a different state from both the
> old one and the new one, possibly with an XDP-incompatible number of
> queues.
>
> Keeping the  netif_set_real_num_*_queues() calls in veth_open() avoid
> the above issue: if the queue creation is problematic, the device will
> stay down.
>
> I think the additional fields are worthy, WDYT?

Hmm, wouldn't the right thing to do be to back out the change and return
an error to userspace? Something like:

+	if (netif_running(dev))
+		veth_close(dev);
+
+	old_rx_queues = dev->real_num_rx_queues;
+	err = netif_set_real_num_rx_queues(dev, ch->rx_count);
+	if (err)
+		return err;
+
+	err = netif_set_real_num_tx_queues(dev, ch->tx_count);
+	if (err) {
+		netif_set_real_num_rx_queues(dev, old_rx_queues);
+		return err;
+	}
+
+	if (netif_running(dev))
+		veth_open(dev);
+	return 0;


(also, shouldn't the result of veth_open() be returned? bit weird if you
don't get an error but the device stays down...)

-Toke
Paolo Abeni July 9, 2021, 4:35 p.m. UTC | #3
On Fri, 2021-07-09 at 17:23 +0200, Toke Høiland-Jørgensen wrote:
> Paolo Abeni <pabeni@redhat.com> writes:
> > On Fri, 2021-07-09 at 12:49 +0200, Paolo Abeni wrote:
> > > On Fri, 2021-07-09 at 12:15 +0200, Toke Høiland-Jørgensen wrote:
> > > > > +	if (netif_running(dev))
> > > > > +		veth_close(dev);
> > > > > +
> > > > > +	priv->num_tx_queues = ch->tx_count;
> > > > > +	priv->num_rx_queues = ch->rx_count;
> > > > 
> > > > Why can't just you use netif_set_real_num_*_queues() here directly (and
> > > > get rid of the priv members as above)?
> > > 
> > > Uhm... I haven't thought about it. Let me try ;)
> > 
> > Here there is a possible problem: if the latter
> > netif_set_real_num_*_queues() fails, we should not change the current
> > configuration, so we should revert the
> > first netif_set_real_num_*_queues() change.
> > 
> > Even that additional revert operation could fail. If/when that happen
> > set_channel() will leave the device in a different state from both the
> > old one and the new one, possibly with an XDP-incompatible number of
> > queues.
> > 
> > Keeping the  netif_set_real_num_*_queues() calls in veth_open() avoid
> > the above issue: if the queue creation is problematic, the device will
> > stay down.
> > 
> > I think the additional fields are worthy, WDYT?
> 
> Hmm, wouldn't the right thing to do be to back out the change and return
> an error to userspace? Something like:
> 
> +	if (netif_running(dev))
> +		veth_close(dev);
> +
> +	old_rx_queues = dev->real_num_rx_queues;
> +	err = netif_set_real_num_rx_queues(dev, ch->rx_count);
> +	if (err)
> +		return err;
> +
> +	err = netif_set_real_num_tx_queues(dev, ch->tx_count);
> +	if (err) {
> +		netif_set_real_num_rx_queues(dev, old_rx_queues);

I'm sorry, I was not clear enough. I mean: even the
above netif_set_real_num_rx_queues() can fail. When that happen we will
leave the device in an inconsistent state, possibly even with an
"unsupported" queue setting.

> +		return err;
> +	}
> +
> +	if (netif_running(dev))
> +		veth_open(dev);
> +	return 0;
> 
> 
> (also, shouldn't the result of veth_open() be returned? bit weird if you
> don't get an error but the device stays down...)

Agreed.

Thanks!

Paolo
Jakub Kicinski July 9, 2021, 7:54 p.m. UTC | #4
On Fri,  9 Jul 2021 11:39:48 +0200 Paolo Abeni wrote:
> +	/* accept changes only on rx/tx */
> +	if (ch->combined_count != min(dev->real_num_rx_queues, dev->real_num_tx_queues))
> +		return -EINVAL;

Ah damn, I must have missed the get_channels being added. I believe the
correct interpretation of the params is rx means NAPI with just Rx
queue(s), tx NAPI with just Tx queue(s) and combined has both.
IOW combined != min(rx, tx).
Instead real_rx = combined + rx; real_tx = combined + tx.
Can we still change this?

> +	/* respect contraint posed at device creation time */
> +	if (ch->rx_count > dev->num_rx_queues || ch->tx_count > dev->num_tx_queues)
> +		return -EINVAL;

Could you lift this check into ethtool core?

> +	if (!ch->rx_count || !ch->tx_count)
> +		return -EINVAL;

You wouldn't need this with the right interpretation of combined :(

> +	/* avoid braking XDP, if that is enabled */
> +	if (priv->_xdp_prog && ch->rx_count < priv->peer->real_num_tx_queues)
> +		return -EINVAL;
> +
> +	peer_priv = netdev_priv(priv->peer);
> +	if (peer_priv->_xdp_prog && ch->tx_count > priv->peer->real_num_rx_queues)
> +		return -EINVAL;
> +
> +	if (netif_running(dev))
> +		veth_close(dev);
> +
> +	priv->num_tx_queues = ch->tx_count;
> +	priv->num_rx_queues = ch->rx_count;
> +
> +	if (netif_running(dev))
> +		veth_open(dev);
David Ahern July 12, 2021, 1:44 a.m. UTC | #5
On 7/9/21 1:54 PM, Jakub Kicinski wrote:
> On Fri,  9 Jul 2021 11:39:48 +0200 Paolo Abeni wrote:

>> +	/* accept changes only on rx/tx */

>> +	if (ch->combined_count != min(dev->real_num_rx_queues, dev->real_num_tx_queues))

>> +		return -EINVAL;

> 

> Ah damn, I must have missed the get_channels being added. I believe the

> correct interpretation of the params is rx means NAPI with just Rx

> queue(s), tx NAPI with just Tx queue(s) and combined has both.

> IOW combined != min(rx, tx).

> Instead real_rx = combined + rx; real_tx = combined + tx.

> Can we still change this?


Is it not an 'either' / 'or' situation? ie., you can either control the
number of Rx and Tx queues or you control the combined value but not
both. That is what I recall from nics (e.g., ConnectX).
Paolo Abeni July 12, 2021, 10:45 a.m. UTC | #6
Hello,

On Sun, 2021-07-11 at 19:44 -0600, David Ahern wrote:
> On 7/9/21 1:54 PM, Jakub Kicinski wrote:

> > On Fri,  9 Jul 2021 11:39:48 +0200 Paolo Abeni wrote:

> > > +	/* accept changes only on rx/tx */

> > > +	if (ch->combined_count != min(dev->real_num_rx_queues, dev->real_num_tx_queues))

> > > +		return -EINVAL;

> > 

> > Ah damn, I must have missed the get_channels being added. I believe the

> > correct interpretation of the params is rx means NAPI with just Rx

> > queue(s), tx NAPI with just Tx queue(s) and combined has both.

> > IOW combined != min(rx, tx).

> > Instead real_rx = combined + rx; real_tx = combined + tx.

> > Can we still change this?

> 

> Is it not an 'either' / 'or' situation? ie., you can either control the

> number of Rx and Tx queues or you control the combined value but not

> both. That is what I recall from nics (e.g., ConnectX).


Thanks for the feedback. My understanding was quite alike what David
stated - and indeed that is what ConnectX enforces AFAICS. Anyhow the
core ethtool code allows for what Jackub said, so I guess I need to
deal with that.

@Jakub: if we are still on time about changing the veth_get_channel()
exposed behaviour, what about just showing nr combined == 0 and
enforcing comined_max == 0? that would both describe more closely the
veth architecture and will make the code simpler - beyond fixing the
current uncorrect nr channels report.

Thanks!

Paolo
Jakub Kicinski July 12, 2021, 3:23 p.m. UTC | #7
On Mon, 12 Jul 2021 12:45:13 +0200 Paolo Abeni wrote:
> On Sun, 2021-07-11 at 19:44 -0600, David Ahern wrote:

> > On 7/9/21 1:54 PM, Jakub Kicinski wrote:  

> > > Ah damn, I must have missed the get_channels being added. I believe the

> > > correct interpretation of the params is rx means NAPI with just Rx

> > > queue(s), tx NAPI with just Tx queue(s) and combined has both.

> > > IOW combined != min(rx, tx).

> > > Instead real_rx = combined + rx; real_tx = combined + tx.

> > > Can we still change this?  

> > 

> > Is it not an 'either' / 'or' situation? ie., you can either control the

> > number of Rx and Tx queues or you control the combined value but not

> > both. That is what I recall from nics (e.g., ConnectX).  

> 

> Thanks for the feedback. My understanding was quite alike what David

> stated - and indeed that is what ConnectX enforces AFAICS. Anyhow the

> core ethtool code allows for what Jackub said, so I guess I need to

> deal with that.


I thought Mellanox was doing something funny to reuse the rx as the
number of AF_XDP queues. Normal rings are not reported twice, they're
only reported as combined.

ethtool man page is relatively clear, unfortunately the kernel code 
is not and few read the man page. A channel is approximately an IRQ, 
not a queue, and IRQ can't be dedicated and combined simultaneously:

 "A channel is an IRQ and the set of queues that can trigger that IRQ."

 " rx N   Changes the number of channels with only receive queues."

> @Jakub: if we are still on time about changing the veth_get_channel()

> exposed behaviour, what about just showing nr combined == 0 and

> enforcing comined_max == 0? that would both describe more closely the

> veth architecture and will make the code simpler - beyond fixing the

> current uncorrect nr channels report.


SGTM.
diff mbox series

Patch

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index bdb7ce3cb054..10360228a06a 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -72,6 +72,8 @@  struct veth_priv {
 	atomic64_t		dropped;
 	struct bpf_prog		*_xdp_prog;
 	struct veth_rq		*rq;
+	unsigned int		num_tx_queues;
+	unsigned int		num_rx_queues;
 	unsigned int		requested_headroom;
 };
 
@@ -224,10 +226,49 @@  static void veth_get_channels(struct net_device *dev,
 {
 	channels->tx_count = dev->real_num_tx_queues;
 	channels->rx_count = dev->real_num_rx_queues;
-	channels->max_tx = dev->real_num_tx_queues;
-	channels->max_rx = dev->real_num_rx_queues;
+	channels->max_tx = dev->num_tx_queues;
+	channels->max_rx = dev->num_rx_queues;
 	channels->combined_count = min(dev->real_num_rx_queues, dev->real_num_tx_queues);
-	channels->max_combined = min(dev->real_num_rx_queues, dev->real_num_tx_queues);
+	channels->max_combined = min(dev->num_rx_queues, dev->num_tx_queues);
+}
+
+static int veth_close(struct net_device *dev);
+static int veth_open(struct net_device *dev);
+
+static int veth_set_channels(struct net_device *dev,
+			     struct ethtool_channels *ch)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	struct veth_priv *peer_priv;
+
+	/* accept changes only on rx/tx */
+	if (ch->combined_count != min(dev->real_num_rx_queues, dev->real_num_tx_queues))
+		return -EINVAL;
+
+	/* respect contraint posed at device creation time */
+	if (ch->rx_count > dev->num_rx_queues || ch->tx_count > dev->num_tx_queues)
+		return -EINVAL;
+
+	if (!ch->rx_count || !ch->tx_count)
+		return -EINVAL;
+
+	/* avoid braking XDP, if that is enabled */
+	if (priv->_xdp_prog && ch->rx_count < priv->peer->real_num_tx_queues)
+		return -EINVAL;
+
+	peer_priv = netdev_priv(priv->peer);
+	if (peer_priv->_xdp_prog && ch->tx_count > priv->peer->real_num_rx_queues)
+		return -EINVAL;
+
+	if (netif_running(dev))
+		veth_close(dev);
+
+	priv->num_tx_queues = ch->tx_count;
+	priv->num_rx_queues = ch->rx_count;
+
+	if (netif_running(dev))
+		veth_open(dev);
+	return 0;
 }
 
 static const struct ethtool_ops veth_ethtool_ops = {
@@ -239,6 +280,7 @@  static const struct ethtool_ops veth_ethtool_ops = {
 	.get_link_ksettings	= veth_get_link_ksettings,
 	.get_ts_info		= ethtool_op_get_ts_info,
 	.get_channels		= veth_get_channels,
+	.set_channels		= veth_set_channels,
 };
 
 /* general routines */
@@ -1104,6 +1146,14 @@  static int veth_open(struct net_device *dev)
 	if (!peer)
 		return -ENOTCONN;
 
+	err = netif_set_real_num_rx_queues(dev, priv->num_rx_queues);
+	if (err)
+		return err;
+
+	err = netif_set_real_num_tx_queues(dev, priv->num_tx_queues);
+	if (err)
+		return err;
+
 	if (priv->_xdp_prog) {
 		err = veth_enable_xdp(dev);
 		if (err)
@@ -1551,14 +1601,18 @@  static int veth_newlink(struct net *src_net, struct net_device *dev,
 	netif_carrier_off(dev);
 
 	/*
-	 * tie the deviced together
+	 * tie the deviced together and init the default queue nr
 	 */
 
 	priv = netdev_priv(dev);
 	rcu_assign_pointer(priv->peer, peer);
+	priv->num_tx_queues = dev->num_tx_queues;
+	priv->num_rx_queues = dev->num_rx_queues;
 
 	priv = netdev_priv(peer);
 	rcu_assign_pointer(priv->peer, dev);
+	priv->num_tx_queues = peer->num_tx_queues;
+	priv->num_rx_queues = peer->num_rx_queues;
 
 	veth_disable_gro(dev);
 	return 0;