diff mbox series

[RFC,net-next,6/9] net: dsa: Forward offloading

Message ID 20210426170411.1789186-7-tobias@waldekranz.com
State New
Headers show
Series net: bridge: Forward offloading | expand

Commit Message

Tobias Waldekranz April 26, 2021, 5:04 p.m. UTC
Allow DSA drivers to support forward offloading from a bridge by:

- Passing calls to .ndo_dfwd_{add,del}_station to the drivers.

- Recording the subordinate device of offloaded skbs in the control
  buffer so that the tagger can take the appropriate action.

Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com>
---
 include/net/dsa.h |  7 +++++++
 net/dsa/slave.c   | 36 ++++++++++++++++++++++++++++++++++--
 2 files changed, 41 insertions(+), 2 deletions(-)

Comments

Vladimir Oltean April 27, 2021, 10:17 a.m. UTC | #1
On Mon, Apr 26, 2021 at 07:04:08PM +0200, Tobias Waldekranz wrote:
> Allow DSA drivers to support forward offloading from a bridge by:

> 

> - Passing calls to .ndo_dfwd_{add,del}_station to the drivers.

> 

> - Recording the subordinate device of offloaded skbs in the control

>   buffer so that the tagger can take the appropriate action.

> 

> Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com>

> ---

>  include/net/dsa.h |  7 +++++++

>  net/dsa/slave.c   | 36 ++++++++++++++++++++++++++++++++++--

>  2 files changed, 41 insertions(+), 2 deletions(-)

> 

> diff --git a/include/net/dsa.h b/include/net/dsa.h

> index 1f9ba9889034..77d4df819299 100644

> --- a/include/net/dsa.h

> +++ b/include/net/dsa.h

> @@ -119,6 +119,7 @@ struct dsa_netdevice_ops {

>  

>  struct dsa_skb_cb {

>  	struct sk_buff *clone;

> +	struct net_device *sb_dev;

>  };

>  

>  struct __dsa_skb_cb {

> @@ -828,6 +829,12 @@ struct dsa_switch_ops {

>  					  const struct switchdev_obj_ring_role_mrp *mrp);

>  	int	(*port_mrp_del_ring_role)(struct dsa_switch *ds, int port,

>  					  const struct switchdev_obj_ring_role_mrp *mrp);

> +

> +	/* L2 forward offloading */

> +	void *	(*dfwd_add_station)(struct dsa_switch *ds, int port,

> +				    struct net_device *sb_dev);

> +	void	(*dfwd_del_station)(struct dsa_switch *ds, int port,

> +				    struct net_device *sb_dev);

>  };

>  

>  #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes)		\

> diff --git a/net/dsa/slave.c b/net/dsa/slave.c

> index 77b33bd161b8..3689ffa2dbb8 100644

> --- a/net/dsa/slave.c

> +++ b/net/dsa/slave.c

> @@ -657,6 +657,13 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)

>  	return dsa_enqueue_skb(nskb, dev);

>  }

>  

> +static u16 dsa_slave_select_queue(struct net_device *dev, struct sk_buff *skb,

> +				  struct net_device *sb_dev)

> +{

> +	DSA_SKB_CB(skb)->sb_dev = sb_dev;

> +	return netdev_pick_tx(dev, skb, sb_dev);

> +}

> +


DSA_SKB_CB is going away:
https://patchwork.kernel.org/project/netdevbpf/patch/20210427042203.26258-5-yangbo.lu@nxp.com/

Let's either negotiate with Yangbo on keeping it, or make
.ndo_select_queue a bypass towards the tagger, where it can use its own
SKB_CB structure and be more flexible in general (I think I'm leaning
towards the latter).
Tobias Waldekranz May 4, 2021, 2:44 p.m. UTC | #2
On Tue, Apr 27, 2021 at 13:17, Vladimir Oltean <olteanv@gmail.com> wrote:
> On Mon, Apr 26, 2021 at 07:04:08PM +0200, Tobias Waldekranz wrote:

>> Allow DSA drivers to support forward offloading from a bridge by:

>> 

>> - Passing calls to .ndo_dfwd_{add,del}_station to the drivers.

>> 

>> - Recording the subordinate device of offloaded skbs in the control

>>   buffer so that the tagger can take the appropriate action.

>> 

>> Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com>

>> ---

>>  include/net/dsa.h |  7 +++++++

>>  net/dsa/slave.c   | 36 ++++++++++++++++++++++++++++++++++--

>>  2 files changed, 41 insertions(+), 2 deletions(-)

>> 

>> diff --git a/include/net/dsa.h b/include/net/dsa.h

>> index 1f9ba9889034..77d4df819299 100644

>> --- a/include/net/dsa.h

>> +++ b/include/net/dsa.h

>> @@ -119,6 +119,7 @@ struct dsa_netdevice_ops {

>>  

>>  struct dsa_skb_cb {

>>  	struct sk_buff *clone;

>> +	struct net_device *sb_dev;

>>  };

>>  

>>  struct __dsa_skb_cb {

>> @@ -828,6 +829,12 @@ struct dsa_switch_ops {

>>  					  const struct switchdev_obj_ring_role_mrp *mrp);

>>  	int	(*port_mrp_del_ring_role)(struct dsa_switch *ds, int port,

>>  					  const struct switchdev_obj_ring_role_mrp *mrp);

>> +

>> +	/* L2 forward offloading */

>> +	void *	(*dfwd_add_station)(struct dsa_switch *ds, int port,

>> +				    struct net_device *sb_dev);

>> +	void	(*dfwd_del_station)(struct dsa_switch *ds, int port,

>> +				    struct net_device *sb_dev);

>>  };

>>  

>>  #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes)		\

>> diff --git a/net/dsa/slave.c b/net/dsa/slave.c

>> index 77b33bd161b8..3689ffa2dbb8 100644

>> --- a/net/dsa/slave.c

>> +++ b/net/dsa/slave.c

>> @@ -657,6 +657,13 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)

>>  	return dsa_enqueue_skb(nskb, dev);

>>  }

>>  

>> +static u16 dsa_slave_select_queue(struct net_device *dev, struct sk_buff *skb,

>> +				  struct net_device *sb_dev)

>> +{

>> +	DSA_SKB_CB(skb)->sb_dev = sb_dev;

>> +	return netdev_pick_tx(dev, skb, sb_dev);

>> +}

>> +

>

> DSA_SKB_CB is going away:

> https://patchwork.kernel.org/project/netdevbpf/patch/20210427042203.26258-5-yangbo.lu@nxp.com/

>

> Let's either negotiate with Yangbo on keeping it, or make

> .ndo_select_queue a bypass towards the tagger, where it can use its own

> SKB_CB structure and be more flexible in general (I think I'm leaning

> towards the latter).


Thus far, Yangbo is a tough negotiator, giving me the silent treatment:

https://lore.kernel.org/netdev/87y2d2noe5.fsf@waldekranz.com/

:)

That memset is giving me a hard time. I have just disabled it on my
branch at the moment. Any ideas on how to get rid of it without breaking
timestamping?
Vladimir Oltean May 4, 2021, 3:21 p.m. UTC | #3
On Tue, May 04, 2021 at 04:44:31PM +0200, Tobias Waldekranz wrote:
> On Tue, Apr 27, 2021 at 13:17, Vladimir Oltean <olteanv@gmail.com> wrote:

> > On Mon, Apr 26, 2021 at 07:04:08PM +0200, Tobias Waldekranz wrote:

> >> Allow DSA drivers to support forward offloading from a bridge by:

> >> 

> >> - Passing calls to .ndo_dfwd_{add,del}_station to the drivers.

> >> 

> >> - Recording the subordinate device of offloaded skbs in the control

> >>   buffer so that the tagger can take the appropriate action.

> >> 

> >> Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com>

> >> ---

> >>  include/net/dsa.h |  7 +++++++

> >>  net/dsa/slave.c   | 36 ++++++++++++++++++++++++++++++++++--

> >>  2 files changed, 41 insertions(+), 2 deletions(-)

> >> 

> >> diff --git a/include/net/dsa.h b/include/net/dsa.h

> >> index 1f9ba9889034..77d4df819299 100644

> >> --- a/include/net/dsa.h

> >> +++ b/include/net/dsa.h

> >> @@ -119,6 +119,7 @@ struct dsa_netdevice_ops {

> >>  

> >>  struct dsa_skb_cb {

> >>  	struct sk_buff *clone;

> >> +	struct net_device *sb_dev;

> >>  };

> >>  

> >>  struct __dsa_skb_cb {

> >> @@ -828,6 +829,12 @@ struct dsa_switch_ops {

> >>  					  const struct switchdev_obj_ring_role_mrp *mrp);

> >>  	int	(*port_mrp_del_ring_role)(struct dsa_switch *ds, int port,

> >>  					  const struct switchdev_obj_ring_role_mrp *mrp);

> >> +

> >> +	/* L2 forward offloading */

> >> +	void *	(*dfwd_add_station)(struct dsa_switch *ds, int port,

> >> +				    struct net_device *sb_dev);

> >> +	void	(*dfwd_del_station)(struct dsa_switch *ds, int port,

> >> +				    struct net_device *sb_dev);

> >>  };

> >>  

> >>  #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes)		\

> >> diff --git a/net/dsa/slave.c b/net/dsa/slave.c

> >> index 77b33bd161b8..3689ffa2dbb8 100644

> >> --- a/net/dsa/slave.c

> >> +++ b/net/dsa/slave.c

> >> @@ -657,6 +657,13 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)

> >>  	return dsa_enqueue_skb(nskb, dev);

> >>  }

> >>  

> >> +static u16 dsa_slave_select_queue(struct net_device *dev, struct sk_buff *skb,

> >> +				  struct net_device *sb_dev)

> >> +{

> >> +	DSA_SKB_CB(skb)->sb_dev = sb_dev;

> >> +	return netdev_pick_tx(dev, skb, sb_dev);

> >> +}

> >> +

> >

> > DSA_SKB_CB is going away:

> > https://patchwork.kernel.org/project/netdevbpf/patch/20210427042203.26258-5-yangbo.lu@nxp.com/

> >

> > Let's either negotiate with Yangbo on keeping it, or make

> > .ndo_select_queue a bypass towards the tagger, where it can use its own

> > SKB_CB structure and be more flexible in general (I think I'm leaning

> > towards the latter).

> 

> Thus far, Yangbo is a tough negotiator, giving me the silent treatment:

> 

> https://lore.kernel.org/netdev/87y2d2noe5.fsf@waldekranz.com/

> 

> :)

> 

> That memset is giving me a hard time. I have just disabled it on my

> branch at the moment. Any ideas on how to get rid of it without breaking

> timestamping?


:)

Is there any guarantee written somewhere that the ownership of skb->cb
belongs to the NIC driver at the time of the ndo_select_queue call?

If there is, then the trivial solution is to just move the memset into
ndo_select_queue.

If there isn't, then we've got bigger issues (such as, for example, the
qdisc layer being able to overwrite your DSA_SKB_CB(skb)->sb_dev).
Tobias Waldekranz May 4, 2021, 8:07 p.m. UTC | #4
On Tue, May 04, 2021 at 18:21, Vladimir Oltean <olteanv@gmail.com> wrote:
> On Tue, May 04, 2021 at 04:44:31PM +0200, Tobias Waldekranz wrote:

>> On Tue, Apr 27, 2021 at 13:17, Vladimir Oltean <olteanv@gmail.com> wrote:

>> > On Mon, Apr 26, 2021 at 07:04:08PM +0200, Tobias Waldekranz wrote:

>> >> Allow DSA drivers to support forward offloading from a bridge by:

>> >> 

>> >> - Passing calls to .ndo_dfwd_{add,del}_station to the drivers.

>> >> 

>> >> - Recording the subordinate device of offloaded skbs in the control

>> >>   buffer so that the tagger can take the appropriate action.

>> >> 

>> >> Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com>

>> >> ---

>> >>  include/net/dsa.h |  7 +++++++

>> >>  net/dsa/slave.c   | 36 ++++++++++++++++++++++++++++++++++--

>> >>  2 files changed, 41 insertions(+), 2 deletions(-)

>> >> 

>> >> diff --git a/include/net/dsa.h b/include/net/dsa.h

>> >> index 1f9ba9889034..77d4df819299 100644

>> >> --- a/include/net/dsa.h

>> >> +++ b/include/net/dsa.h

>> >> @@ -119,6 +119,7 @@ struct dsa_netdevice_ops {

>> >>  

>> >>  struct dsa_skb_cb {

>> >>  	struct sk_buff *clone;

>> >> +	struct net_device *sb_dev;

>> >>  };

>> >>  

>> >>  struct __dsa_skb_cb {

>> >> @@ -828,6 +829,12 @@ struct dsa_switch_ops {

>> >>  					  const struct switchdev_obj_ring_role_mrp *mrp);

>> >>  	int	(*port_mrp_del_ring_role)(struct dsa_switch *ds, int port,

>> >>  					  const struct switchdev_obj_ring_role_mrp *mrp);

>> >> +

>> >> +	/* L2 forward offloading */

>> >> +	void *	(*dfwd_add_station)(struct dsa_switch *ds, int port,

>> >> +				    struct net_device *sb_dev);

>> >> +	void	(*dfwd_del_station)(struct dsa_switch *ds, int port,

>> >> +				    struct net_device *sb_dev);

>> >>  };

>> >>  

>> >>  #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes)		\

>> >> diff --git a/net/dsa/slave.c b/net/dsa/slave.c

>> >> index 77b33bd161b8..3689ffa2dbb8 100644

>> >> --- a/net/dsa/slave.c

>> >> +++ b/net/dsa/slave.c

>> >> @@ -657,6 +657,13 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)

>> >>  	return dsa_enqueue_skb(nskb, dev);

>> >>  }

>> >>  

>> >> +static u16 dsa_slave_select_queue(struct net_device *dev, struct sk_buff *skb,

>> >> +				  struct net_device *sb_dev)

>> >> +{

>> >> +	DSA_SKB_CB(skb)->sb_dev = sb_dev;

>> >> +	return netdev_pick_tx(dev, skb, sb_dev);

>> >> +}

>> >> +

>> >

>> > DSA_SKB_CB is going away:

>> > https://patchwork.kernel.org/project/netdevbpf/patch/20210427042203.26258-5-yangbo.lu@nxp.com/

>> >

>> > Let's either negotiate with Yangbo on keeping it, or make

>> > .ndo_select_queue a bypass towards the tagger, where it can use its own

>> > SKB_CB structure and be more flexible in general (I think I'm leaning

>> > towards the latter).

>> 

>> Thus far, Yangbo is a tough negotiator, giving me the silent treatment:

>> 

>> https://lore.kernel.org/netdev/87y2d2noe5.fsf@waldekranz.com/

>> 

>> :)

>> 

>> That memset is giving me a hard time. I have just disabled it on my

>> branch at the moment. Any ideas on how to get rid of it without breaking

>> timestamping?

>

> :)

>

> Is there any guarantee written somewhere that the ownership of skb->cb

> belongs to the NIC driver at the time of the ndo_select_queue call?

>

> If there is, then the trivial solution is to just move the memset in

> ndo_select_queue.

>

> If there isn't, then we've got bigger issues (such as, for example, the

> qdisc layer being able to overwrite your DSA_SKB_CB(skb)->sb_dev).


The comment says:

   "This is owned by whoever has the skb queued ATM."

But qdisc_skb_cb is a thing as it turns out - so I think I can kiss the
idea of stashing the pointer in the CB goodbye.

Looking at some of the other users of .ndo_select_queue, I get the
feeling that we should really:

- Pre-generate a FROM_CPU tag template and store it under "TxQ 0"
- Pre-generate a FORWARD tag template and store it under "TxQ 1"
- Redefine tag_dsa's .ndo_select_queue to be: `return sb_dev ? 1 : 0;`
- Fetch the template using skb_queue_mapping, fill in the VID, and send
  it.

There is really no need to recompute the static parts of the tags on
each skb. It would mean moving some knowledge of the tagging format to
the driver. But that boundary is pretty artificial for
mv88e6xxx. tag_dsa has no use outside of mv88e6xxx, and mv88e6xxx does
not work with any other tagger. I suppose you could even move the whole
tagger to drivers/net/dsa/mv88e6xxx/?

What do you think?

Andrew?
Andrew Lunn May 4, 2021, 8:33 p.m. UTC | #5
> There is really no need to recompute the static parts of the tags on

> each skb. It would mean moving some knowledge of the tagging format to

> the driver. But that boundary is pretty artificial for

> mv88e6xxx. tag_dsa has no use outside of mv88e6xxx, and mv88e6xxx does

> not work with any other tagger. I suppose you could even move the whole

> tagger to drivers/net/dsa/mv88e6xxx/?

> 

> What do you think?

> 

> Andrew?


We have resisted this before.

What information do you actually need to share between the tagger and
the driver? Both tag_lan9303.c and tag_ocelot_8021q.c do reference
their switch driver data structures, so some sharing is allowed. But
please try to keep the surface areas down.

       Andrew
Vladimir Oltean May 4, 2021, 8:58 p.m. UTC | #6
On Tue, May 04, 2021 at 10:07:14PM +0200, Tobias Waldekranz wrote:
> On Tue, May 04, 2021 at 18:21, Vladimir Oltean <olteanv@gmail.com> wrote:

> > On Tue, May 04, 2021 at 04:44:31PM +0200, Tobias Waldekranz wrote:

> >> On Tue, Apr 27, 2021 at 13:17, Vladimir Oltean <olteanv@gmail.com> wrote:

> >> > On Mon, Apr 26, 2021 at 07:04:08PM +0200, Tobias Waldekranz wrote:

> >> >> Allow DSA drivers to support forward offloading from a bridge by:

> >> >> 

> >> >> - Passing calls to .ndo_dfwd_{add,del}_station to the drivers.

> >> >> 

> >> >> - Recording the subordinate device of offloaded skbs in the control

> >> >>   buffer so that the tagger can take the appropriate action.

> >> >> 

> >> >> Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com>

> >> >> ---

> >> >>  include/net/dsa.h |  7 +++++++

> >> >>  net/dsa/slave.c   | 36 ++++++++++++++++++++++++++++++++++--

> >> >>  2 files changed, 41 insertions(+), 2 deletions(-)

> >> >> 

> >> >> diff --git a/include/net/dsa.h b/include/net/dsa.h

> >> >> index 1f9ba9889034..77d4df819299 100644

> >> >> --- a/include/net/dsa.h

> >> >> +++ b/include/net/dsa.h

> >> >> @@ -119,6 +119,7 @@ struct dsa_netdevice_ops {

> >> >>  

> >> >>  struct dsa_skb_cb {

> >> >>  	struct sk_buff *clone;

> >> >> +	struct net_device *sb_dev;

> >> >>  };

> >> >>  

> >> >>  struct __dsa_skb_cb {

> >> >> @@ -828,6 +829,12 @@ struct dsa_switch_ops {

> >> >>  					  const struct switchdev_obj_ring_role_mrp *mrp);

> >> >>  	int	(*port_mrp_del_ring_role)(struct dsa_switch *ds, int port,

> >> >>  					  const struct switchdev_obj_ring_role_mrp *mrp);

> >> >> +

> >> >> +	/* L2 forward offloading */

> >> >> +	void *	(*dfwd_add_station)(struct dsa_switch *ds, int port,

> >> >> +				    struct net_device *sb_dev);

> >> >> +	void	(*dfwd_del_station)(struct dsa_switch *ds, int port,

> >> >> +				    struct net_device *sb_dev);

> >> >>  };

> >> >>  

> >> >>  #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes)		\

> >> >> diff --git a/net/dsa/slave.c b/net/dsa/slave.c

> >> >> index 77b33bd161b8..3689ffa2dbb8 100644

> >> >> --- a/net/dsa/slave.c

> >> >> +++ b/net/dsa/slave.c

> >> >> @@ -657,6 +657,13 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)

> >> >>  	return dsa_enqueue_skb(nskb, dev);

> >> >>  }

> >> >>  

> >> >> +static u16 dsa_slave_select_queue(struct net_device *dev, struct sk_buff *skb,

> >> >> +				  struct net_device *sb_dev)

> >> >> +{

> >> >> +	DSA_SKB_CB(skb)->sb_dev = sb_dev;

> >> >> +	return netdev_pick_tx(dev, skb, sb_dev);

> >> >> +}

> >> >> +

> >> >

> >> > DSA_SKB_CB is going away:

> >> > https://patchwork.kernel.org/project/netdevbpf/patch/20210427042203.26258-5-yangbo.lu@nxp.com/

> >> >

> >> > Let's either negotiate with Yangbo on keeping it, or make

> >> > .ndo_select_queue a bypass towards the tagger, where it can use its own

> >> > SKB_CB structure and be more flexible in general (I think I'm leaning

> >> > towards the latter).

> >> 

> >> Thus far, Yangbo is a tough negotiator, giving me the silent treatment:

> >> 

> >> https://lore.kernel.org/netdev/87y2d2noe5.fsf@waldekranz.com/

> >> 

> >> :)

> >> 

> >> That memset is giving me a hard time. I have just disabled it on my

> >> branch at the moment. Any ideas on how to get rid of it without breaking

> >> timestamping?

> >

> > :)

> >

> > Is there any guarantee written somewhere that the ownership of skb->cb

> > belongs to the NIC driver at the time of the ndo_select_queue call?

> >

> > If there is, then the trivial solution is to just move the memset in

> > ndo_select_queue.

> >

> > If there isn't, then we've got bigger issues (such as, for example, the

> > qdisc layer being able to overwrite your DSA_SKB_CB(skb)->sb_dev).

> 

> The comment says:

> 

>    "This is owned by whoever has the skb queued ATM."

> 

> But qdisc_skb_cb is a thing as it turns out - so I think I can kiss the

> idea of stashing the pointer in the CB goodbye.

> 

> Looking at some of the other users of .ndo_select_queue, I get the

> feeling that we should really:

> 

> - Pre-generate a FROM_CPU tag template and store it under "TxQ 0"

> - Pre-generate a FORWARD tag template and store it under "TxQ 1"

> - Redefine tag_dsa's .ndo_select_queue to be: `return sb_dev ? 1 : 0;`

> - Fetch the template using skb_queue_mapping, fill in the VID, and send

>   it.


Different drivers use TX queues in different ways. For example, for the
switches with TSN offloads, we set ds->num_tx_queues to a value equal to
the number of hardware traffic classes, so that the CPU can inject
packets with a specific QOS_CLASS field in the DSA header (think VLAN PCP).
This is really visible with tc-taprio where some traffic classes can be
completely turned off, so you can easily tell which TC was a packet
enqueued to. Other switches use TX queues in other ways. Some Broadcom
tagging protocols use the skb queue_mapping to direct the packets to one
of multiple TX queues of the DSA master, in order to apply backpressure
in case there is congestion on the front port.

Selecting a TX queue based on which upper netdev the packet is coming
from sounds to me like the oddest of the bunch. It really adds one more
dimension to the existing uses, I am not sure that this is how it was
intended to be done [ and why, for example, if the sb_dev was propagated
so deeply into dev_queue_xmit, why was it not propagated all the way to
.ndo_start_xmit ], but on the other hand, you have more working
experience with the dev_queue_xmit_accel API than the zero I have.

By the way (to show how little I know) what does "d" in "dfwd" stand for?
It almost sounds to me like a typo that was carried along from
NETIF_F_HW_L2FW_DOFFLOAD_BIT.

We might need to ask for the input of some people from Intel who worked
on this offload framework. For example, I just added Alexander Duyck
hoping he can provide some suggestions. We just want the sb_dev in
ndo_start_xmit, and abusing ndo_select_queue seems like a huge hack just
to obtain that.

> There is really no need to recompute the static parts of the tags on

> each skb. It would mean moving some knowledge of the tagging format to

> the driver. But that boundary is pretty artificial for

> mv88e6xxx. tag_dsa has no use outside of mv88e6xxx, and mv88e6xxx does

> not work with any other tagger. I suppose you could even move the whole

> tagger to drivers/net/dsa/mv88e6xxx/?

> 

> What do you think?

> 

> Andrew?


[ not Andrew, but ]

I made that mistake so that you don't have to. You don't actually gain
as much as you think (performance is about the same, what you win in
instruction count and conditionals you lose in the memcpy), and you
create a dependency between the tagger and the switch driver which was
supposed by design to not exist. For my drivers I tried to remove this
dependency - see commit 7c4bb540e917 ("net: dsa: tag_ocelot: create
separate tagger for Seville"). Also, in the case of Ocelot switches,
a template was used to mask out handling differences between switch
generations, and present them to user space as "the same tagger".
Another bad idea. In general, if a tagging protocol is testable with
dsa_loop this is a plus. People at NXP wanted to see how their drivers
perform with Marvell switches (what are their options for balancing with
RFS/RSS) and this is what they did, changed DSA_TAG_PROTO_NONE from what
dsa_loop advertises. If they need the actual switch driver to initialize
the tagger's template, suddenly it's not so fun anymore.

If it ever becomes important enough, I suppose dsa_loop could even gain
support for the new .change_tag_protocol API to advertise the
feasibility of the idea in general, although given how DIY dsa_loop is
in general, maybe changing the tag protocol at runtime isn't so
important.
Tobias Waldekranz May 4, 2021, 9:24 p.m. UTC | #7
On Tue, May 04, 2021 at 22:33, Andrew Lunn <andrew@lunn.ch> wrote:
>> There is really no need to recompute the static parts of the tags on

>> each skb. It would mean moving some knowledge of the tagging format to

>> the driver. But that boundary is pretty artificial for

>> mv88e6xxx. tag_dsa has no use outside of mv88e6xxx, and mv88e6xxx does

>> not work with any other tagger. I suppose you could even move the whole

>> tagger to drivers/net/dsa/mv88e6xxx/?

>> 

>> What do you think?

>> 

>> Andrew?

>

> We have resisted this before.

>

> What information do you actually need to share between the tagger and

> the driver?


So far:

- Trunk/LAG ID to netdev mappings (this is stored on the dst now, but I
  think I have seen the light and agree with Vladimir that it really has
  no business there).

- DSA dev/port to bridge netdev mappings for the forwarding offloading
  in this RFC (or preferably the actual tag templates to use on egress
  since that would probably give you better performance)

In the future:

- Completions for in-flight remote management operations.

- FlowID to TC rule mappings (from the "Switch Egress header" when we
  enable that)

- In-band signaling between firmware running on the IMP and the driver
  for things like MRP and CFM offloading.

> Both tag_lan9303.c and tag_ocelot_8021q.c do reference

> their switch driver data structures, so some sharing is allowed. But

> please try to keep the surface areas down.


If you have a surface area keep it small, yes, agreed. I guess my
question is more why we should have any surface area at all? What do we
gain by the tagger/driver separation in the case of mv88e6xxx?

>        Andrew
Tobias Waldekranz May 4, 2021, 10:12 p.m. UTC | #8
On Tue, May 04, 2021 at 23:58, Vladimir Oltean <olteanv@gmail.com> wrote:
> On Tue, May 04, 2021 at 10:07:14PM +0200, Tobias Waldekranz wrote:

>> On Tue, May 04, 2021 at 18:21, Vladimir Oltean <olteanv@gmail.com> wrote:

>> > On Tue, May 04, 2021 at 04:44:31PM +0200, Tobias Waldekranz wrote:

>> >> On Tue, Apr 27, 2021 at 13:17, Vladimir Oltean <olteanv@gmail.com> wrote:

>> >> > On Mon, Apr 26, 2021 at 07:04:08PM +0200, Tobias Waldekranz wrote:

>> >> >> Allow DSA drivers to support forward offloading from a bridge by:

>> >> >> 

>> >> >> - Passing calls to .ndo_dfwd_{add,del}_station to the drivers.

>> >> >> 

>> >> >> - Recording the subordinate device of offloaded skbs in the control

>> >> >>   buffer so that the tagger can take the appropriate action.

>> >> >> 

>> >> >> Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com>

>> >> >> ---

>> >> >>  include/net/dsa.h |  7 +++++++

>> >> >>  net/dsa/slave.c   | 36 ++++++++++++++++++++++++++++++++++--

>> >> >>  2 files changed, 41 insertions(+), 2 deletions(-)

>> >> >> 

>> >> >> diff --git a/include/net/dsa.h b/include/net/dsa.h

>> >> >> index 1f9ba9889034..77d4df819299 100644

>> >> >> --- a/include/net/dsa.h

>> >> >> +++ b/include/net/dsa.h

>> >> >> @@ -119,6 +119,7 @@ struct dsa_netdevice_ops {

>> >> >>  

>> >> >>  struct dsa_skb_cb {

>> >> >>  	struct sk_buff *clone;

>> >> >> +	struct net_device *sb_dev;

>> >> >>  };

>> >> >>  

>> >> >>  struct __dsa_skb_cb {

>> >> >> @@ -828,6 +829,12 @@ struct dsa_switch_ops {

>> >> >>  					  const struct switchdev_obj_ring_role_mrp *mrp);

>> >> >>  	int	(*port_mrp_del_ring_role)(struct dsa_switch *ds, int port,

>> >> >>  					  const struct switchdev_obj_ring_role_mrp *mrp);

>> >> >> +

>> >> >> +	/* L2 forward offloading */

>> >> >> +	void *	(*dfwd_add_station)(struct dsa_switch *ds, int port,

>> >> >> +				    struct net_device *sb_dev);

>> >> >> +	void	(*dfwd_del_station)(struct dsa_switch *ds, int port,

>> >> >> +				    struct net_device *sb_dev);

>> >> >>  };

>> >> >>  

>> >> >>  #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes)		\

>> >> >> diff --git a/net/dsa/slave.c b/net/dsa/slave.c

>> >> >> index 77b33bd161b8..3689ffa2dbb8 100644

>> >> >> --- a/net/dsa/slave.c

>> >> >> +++ b/net/dsa/slave.c

>> >> >> @@ -657,6 +657,13 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)

>> >> >>  	return dsa_enqueue_skb(nskb, dev);

>> >> >>  }

>> >> >>  

>> >> >> +static u16 dsa_slave_select_queue(struct net_device *dev, struct sk_buff *skb,

>> >> >> +				  struct net_device *sb_dev)

>> >> >> +{

>> >> >> +	DSA_SKB_CB(skb)->sb_dev = sb_dev;

>> >> >> +	return netdev_pick_tx(dev, skb, sb_dev);

>> >> >> +}

>> >> >> +

>> >> >

>> >> > DSA_SKB_CB is going away:

>> >> > https://patchwork.kernel.org/project/netdevbpf/patch/20210427042203.26258-5-yangbo.lu@nxp.com/

>> >> >

>> >> > Let's either negotiate with Yangbo on keeping it, or make

>> >> > .ndo_select_queue a bypass towards the tagger, where it can use its own

>> >> > SKB_CB structure and be more flexible in general (I think I'm leaning

>> >> > towards the latter).

>> >> 

>> >> Thus far, Yangbo is a tough negotiator, giving me the silent treatment:

>> >> 

>> >> https://lore.kernel.org/netdev/87y2d2noe5.fsf@waldekranz.com/

>> >> 

>> >> :)

>> >> 

>> >> That memset is giving me a hard time. I have just disabled it on my

>> >> branch at the moment. Any ideas on how to get rid of it without breaking

>> >> timestamping?

>> >

>> > :)

>> >

>> > Is there any guarantee written somewhere that the ownership of skb->cb

>> > belongs to the NIC driver at the time of the ndo_select_queue call?

>> >

>> > If there is, then the trivial solution is to just move the memset in

>> > ndo_select_queue.

>> >

>> > If there isn't, then we've got bigger issues (such as, for example, the

>> > qdisc layer being able to overwrite your DSA_SKB_CB(skb)->sb_dev).

>> 

>> The comment says:

>> 

>>    "This is owned by whoever has the skb queued ATM."

>> 

>> But qdisc_skb_cb is a thing as it turns out - so I think I can kiss the

>> idea of stashing the pointer in the CB goodbye.

>> 

>> Looking at some of the other users of .ndo_select_queue, I get the

>> feeling that we should really:

>> 

>> - Pre-generate a FROM_CPU tag template and store it under "TxQ 0"

>> - Pre-generate a FORWARD tag template and store it under "TxQ 1"

>> - Redefine tag_dsa's .ndo_select_queue to be: `return sb_dev ? 1 : 0;`

>> - Fetch the template using skb_queue_mapping, fill in the VID, and send

>>   it.

>

> Different drivers use TX queues in different ways. For example, for the

> switches with TSN offloads, we set ds->num_tx_queues to a value equal to

> the number of hardware traffic classes, so that the CPU can inject

> packets with a specific QOS_CLASS field in the DSA header (think VLAN PCP).

> This is really visible with tc-taprio where some traffic classes can be

> completely turned off, so you can easily tell which TC was a packet

> enqueued to. Other switches use TX queues in other ways. Some Broadcom

> tagging protocols use the skb queue_mapping to direct the packets to one

> of multiple TX queues of the DSA master, in order to apply backpressure

> in case there is congestion on the front port.

>

> Selecting a TX queue based on which upper netdev the packet is coming

> from sounds to me like the oddest of the bunch. It really adds one more

> dimension to the existing uses, I am not sure that this is how it was

> intended to be done [ and why, for example, if the sb_dev was propagated

> so deeply into dev_queue_xmit, why was it not propagated all the way to

> .ndo_start_xmit ], but on the other hand, you have more working

> experience with the dev_queue_xmit_accel API than the zero I have.


Yeah it does not feel right. I expect mv88e6xxx will also want to expose
the real number of queues in the future. Some of the newer devices have
support for time aware shapers for example.

As for why sb_dev is not propagated to .ndo_start_xmit: I chalked it up
to the existing users managing the macvlan offloads by directing those
flows to a particular TxQ. I.e. they simply had no need for it.

Or perhaps they did not have the nerve to send the commit that changed
the signature of _every_ driver's .ndo_start_xmit :)

> By the way (to show how little I know) what does "d" in "dfwd" stand for?

> It almost sounds to me like a typo that was carried along from

> NETIF_F_HW_L2FW_DOFFLOAD_BIT.


That has been bugging me as well! I have no idea.

> We might need to ask for the input of some people from Intel who worked

> on this offload framework. For example, I just added Alexander Duyck

> hoping he can provide some suggestions. We just want the sb_dev in

> ndo_start_xmit, and abusing ndo_select_queue seems like a huge hack just

> to obtain that.


I think you are right.

>> There is really no need to recompute the static parts of the tags on

>> each skb. It would mean moving some knowledge of the tagging format to

>> the driver. But that boundary is pretty artificial for

>> mv88e6xxx. tag_dsa has no use outside of mv88e6xxx, and mv88e6xxx does

>> not work with any other tagger. I suppose you could even move the whole

>> tagger to drivers/net/dsa/mv88e6xxx/?

>> 

>> What do you think?

>> 

>> Andrew?

>

> [ not Andrew, but ]

>

> I made that mistake so that you don't have to. You don't actually gain

> as much as you think (performance is about the same, what you win in

> instruction count and conditionals you lose in the memcpy),


That is valuable info, thank you. But I think the most important
improvement I see would be the ability to couple the tagger tighter to
the driver when we add more complicated features.

> and you

> create a dependency between the tagger and the switch driver which was

> supposed by design to not exist.


Sure, but _why_ should it not exist? Many fields in the tag can only be
correctly generated/interpreted in combination with knowledge of the
current configuration, which is the driver's domain. The dependency is
already there, etched in silicon.

> For my drivers I tried to remove this

> dependency - see commit 7c4bb540e917 ("net: dsa: tag_ocelot: create

> separate tagger for Seville"). Also, in the case of Ocelot switches,

> a template was used to mask out handling differences between switch

> generations, and present them to user space as "the same tagger".

> Another bad idea. In general, if a tagging protocol is testable with

> dsa_loop this is a plus. People at NXP wanted to see how their drivers

> perform with Marvell switches (what are their options for balancing with

> RFS/RSS) and this is what they did, changed DSA_TAG_PROTO_NONE from what

> dsa_loop advertises. If they need the actual switch driver to initialize

> the tagger's template, suddenly it's not so fun anymore.


I shall have to look more closely at dsa_loop, so far I have just seen
the name float by on a few occasions.

> If it ever becomes important enough, I suppose dsa_loop could even gain

> support for the new .change_tag_protocol API to advertise the

> feasibility of the idea in general, although given how DIY dsa_loop is

> in general, maybe changing the tag protocol at runtime isn't so

> important.
Vladimir Oltean May 4, 2021, 11:04 p.m. UTC | #9
On Wed, May 05, 2021 at 12:12:15AM +0200, Tobias Waldekranz wrote:
> > and you create a dependency between the tagger and the switch driver

> > which was supposed by design to not exist.

> 

> Sure, but _why_ should it not exist? Many fields in the tag can only be

> correctly generated/interpreted in combination with knowledge of the

> current configuration, which is the driver's domain. The dependency is

> already there, etched in silicon.


I'm a bit more of a pragmatic person, it's not so much that I think that
Lennert Buytenhek's original DSA design from 2008 was the holy grail and
that we should do everything we can to preserve it intact. Far from it.
But I actually like having the option to inject a DSA-tagged packet
using Spirent TestCenter and measure IP forwarding between dsa_loop
"switch" ports (actually a one-armed router is what it is). I also like,
as a reviewer, to be able to test, if I want to, how a tail tagger
behaves even if I don't own a switch with tail tagging. And this
separation between the switch driver and the tag protocol driver makes
that possible, just see it as a nice perk which we don't want to lose.

As for more advanced features, like "the hardware requires me to invent
a unique number based on a rolling counter, call it a TX timestamp ID,
put it in the DSA header, then when transmission is done, an IRQ will be
raised, and I need to match that TX timestamp that just became available
to me, which is identifiable via the timestamp ID that I put in the DSA
header, with the original skb", of course you can't do that without
communication between the tagger and the driver itself, unless you make
the tagger handle interrupts (and then there's the whole issue that the
tagging protocol driver needs to be instantiated per switch, if it's
going to be stateful), or the switch driver send packets. As a general
rule of thumb, just don't break dsa_loop and we should be fine. For
example, yes, PTP requires driver <-> tagger communication, but PTP
timestamping is also not enabled by default, and guarded by an ioctl
which dsa_loop doesn't implement. So the tagger can never trigger faulty
code, dereferencing a ds->priv pointer which it thinks is "struct
mv88e6xxx_chip" but is actually "struct dsa_loop_priv".
Tobias Waldekranz May 5, 2021, 9:01 a.m. UTC | #10
On Wed, May 05, 2021 at 02:04, Vladimir Oltean <olteanv@gmail.com> wrote:
> On Wed, May 05, 2021 at 12:12:15AM +0200, Tobias Waldekranz wrote:

>> > and you create a dependency between the tagger and the switch driver

>> > which was supposed by design to not exist.

>> 

>> Sure, but _why_ should it not exist? Many fields in the tag can only be

>> correctly generated/interpreted in combination with knowledge of the

>> current configuration, which is the driver's domain. The dependency is

>> already there, etched in silicon.

>

> I'm a bit more of a pragmatic person,


Excuse me sir, I believe you left your dagger IN MY HEART :)

> it's not so much that I think that

> Lennert Buytenhek's original DSA design from 2008 was the holy grail and

> that we should do everything we can to preserve it intact. Far from it.

> But I actually like having the option to inject a DSA-tagged packet

> using Spirent TestCenter and measure IP forwarding between dsa_loop

> "switch" ports (actually a one-armed router is what it is). I also like,

> as a reviewer, to be able to test, if I want to, how a tail tagger

> behaves even if I don't own a switch with tail tagging. And this

> separation between the switch driver and the tag protocol driver makes

> that possible, just see it as a nice perk which we don't want to lose.


Completely understandable. I was trying to extrapolate where we will end
up with this separation as we add more and more features and couple the
tagger closer to the driver, and see if the current architecture was
still the optimal one. Trying to be ...pragmatic, if you will.

> As for more advanced features, like "the hardware requires me to invent

> a unique number based on a rolling counter, call it a TX timestamp ID,

> put it in the DSA header, then when transmission is done, an IRQ will be

> raised, and I need to match that TX timestamp that just became available

> to me, which is identifiable via the timestamp ID that I put in the DSA

> header, with the original skb", of course you can't do that without

> communication between the tagger and the driver itself, unless you make

> the tagger handle interrupts (and then there's the whole issue that the

> tagging protocol driver needs to be instantiated per switch, if it's

> going to be stateful), or the switch driver send packets. As a general

> rule of thumb, just don't break dsa_loop and we should be fine. For

> example, yes, PTP requires driver <-> tagger communication, but PTP

> timestamping is also not enabled by default, and guarded by an ioctl

> which dsa_loop doesn't implement. So the tagger can never trigger faulty

> code, dereferencing a ds->priv pointer which it thinks is "struct

> mv88e6xxx_chip" but is actually "struct dsa_loop_priv".


This should also hold for forward offloading, since dsa_loop would not
implement .ndo_dfwd_{add,del}_station.

Alright, include/linux/dsa/mv88e6xxx.h here I come!
Vladimir Oltean May 5, 2021, 4:12 p.m. UTC | #11
On Wed, May 05, 2021 at 11:01:09AM +0200, Tobias Waldekranz wrote:
> On Wed, May 05, 2021 at 02:04, Vladimir Oltean <olteanv@gmail.com> wrote:

> > On Wed, May 05, 2021 at 12:12:15AM +0200, Tobias Waldekranz wrote:

> >> > and you create a dependency between the tagger and the switch driver

> >> > which was supposed by design to not exist.

> >> 

> >> Sure, but _why_ should it not exist? Many fields in the tag can only be

> >> correctly generated/interpreted in combination with knowledge of the

> >> current configuration, which is the driver's domain. The dependency is

> >> already there, etched in silicon.

> >

> > I'm a bit more of a pragmatic person,

> 

> Excuse me sir, I believe you left your dagger IN MY HEART :)


You might have misinterpreted my words, I did not mean to say "look what
a good quality I have and you don't", in fact I don't view pragmatism as
much of a desirable quality at all. What I meant to say in the context
is that, even though in general I value functionality more than how it
is implemented, I would still like to keep the separation between
taggers and switch drivers at least at the most basic RX/TX level, for
the reasons explained.
diff mbox series

Patch

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 1f9ba9889034..77d4df819299 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -119,6 +119,7 @@  struct dsa_netdevice_ops {
 
 struct dsa_skb_cb {
 	struct sk_buff *clone;
+	struct net_device *sb_dev;
 };
 
 struct __dsa_skb_cb {
@@ -828,6 +829,12 @@  struct dsa_switch_ops {
 					  const struct switchdev_obj_ring_role_mrp *mrp);
 	int	(*port_mrp_del_ring_role)(struct dsa_switch *ds, int port,
 					  const struct switchdev_obj_ring_role_mrp *mrp);
+
+	/* L2 forward offloading */
+	void *	(*dfwd_add_station)(struct dsa_switch *ds, int port,
+				    struct net_device *sb_dev);
+	void	(*dfwd_del_station)(struct dsa_switch *ds, int port,
+				    struct net_device *sb_dev);
 };
 
 #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes)		\
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 77b33bd161b8..3689ffa2dbb8 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -657,6 +657,13 @@  static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
 	return dsa_enqueue_skb(nskb, dev);
 }
 
+static u16 dsa_slave_select_queue(struct net_device *dev, struct sk_buff *skb,
+				  struct net_device *sb_dev)
+{
+	DSA_SKB_CB(skb)->sb_dev = sb_dev;
+	return netdev_pick_tx(dev, skb, sb_dev);
+}
+
 /* ethtool operations *******************************************************/
 
 static void dsa_slave_get_drvinfo(struct net_device *dev,
@@ -1708,10 +1715,33 @@  static int dsa_slave_fill_forward_path(struct net_device_path_ctx *ctx,
 	return 0;
 }
 
+static void *dsa_slave_dfwd_add_station(struct net_device *dev,
+					struct net_device *sb_dev)
+{
+	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->dfwd_add_station)
+		return ds->ops->dfwd_add_station(ds, dp->index, sb_dev);
+
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static void dsa_slave_dfwd_del_station(struct net_device *dev,
+				       void *sb_dev)
+{
+	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->dfwd_del_station)
+		ds->ops->dfwd_del_station(ds, dp->index, sb_dev);
+}
+
 static const struct net_device_ops dsa_slave_netdev_ops = {
 	.ndo_open	 	= dsa_slave_open,
 	.ndo_stop		= dsa_slave_close,
 	.ndo_start_xmit		= dsa_slave_xmit,
+	.ndo_select_queue	= dsa_slave_select_queue,
 	.ndo_change_rx_flags	= dsa_slave_change_rx_flags,
 	.ndo_set_rx_mode	= dsa_slave_set_rx_mode,
 	.ndo_set_mac_address	= dsa_slave_set_mac_address,
@@ -1734,6 +1764,8 @@  static const struct net_device_ops dsa_slave_netdev_ops = {
 	.ndo_get_devlink_port	= dsa_slave_get_devlink_port,
 	.ndo_change_mtu		= dsa_slave_change_mtu,
 	.ndo_fill_forward_path	= dsa_slave_fill_forward_path,
+	.ndo_dfwd_add_station	= dsa_slave_dfwd_add_station,
+	.ndo_dfwd_del_station	= dsa_slave_dfwd_del_station,
 };
 
 static struct device_type dsa_type = {
@@ -1914,8 +1946,8 @@  int dsa_slave_create(struct dsa_port *port)
 	slave_dev->features = master->vlan_features | NETIF_F_HW_TC;
 	if (ds->ops->port_vlan_add && ds->ops->port_vlan_del)
 		slave_dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
-	slave_dev->hw_features |= NETIF_F_HW_TC;
-	slave_dev->features |= NETIF_F_LLTX;
+	slave_dev->hw_features |= NETIF_F_HW_TC | NETIF_F_HW_L2FW_DOFFLOAD;
+	slave_dev->features |= NETIF_F_LLTX | NETIF_F_HW_L2FW_DOFFLOAD;
 	slave_dev->ethtool_ops = &dsa_slave_ethtool_ops;
 	if (!is_zero_ether_addr(port->mac))
 		ether_addr_copy(slave_dev->dev_addr, port->mac);