
[RFC,6/9] staging: dpaa2-switch: add .ndo_start_xmit() callback

Message ID 20201104165720.2566399-7-ciorneiioana@gmail.com
State New
Series [RFC,1/9] staging: dpaa2-switch: get control interface attributes

Commit Message

Ioana Ciornei Nov. 4, 2020, 4:57 p.m. UTC
From: Ioana Ciornei <ioana.ciornei@nxp.com>

Implement the .ndo_start_xmit() callback for the switch port interfaces.
For each of the switch ports, gather the corresponding queue
destination ID (QDID) necessary for Tx enqueueing.

We'll reserve 64 bytes for software annotations, where we keep a skb
backpointer used on the Tx confirmation side for releasing the allocated
memory. At the moment, we only support linear skbs.

Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
---
 drivers/staging/fsl-dpaa2/ethsw/dpsw-cmd.h |  24 ++++
 drivers/staging/fsl-dpaa2/ethsw/dpsw.c     |  41 ++++++
 drivers/staging/fsl-dpaa2/ethsw/dpsw.h     |  28 ++++
 drivers/staging/fsl-dpaa2/ethsw/ethsw.c    | 146 ++++++++++++++++++---
 drivers/staging/fsl-dpaa2/ethsw/ethsw.h    |  14 ++
 5 files changed, 238 insertions(+), 15 deletions(-)

Comments

Vladimir Oltean Nov. 4, 2020, 9:27 p.m. UTC | #1
On Wed, Nov 04, 2020 at 06:57:17PM +0200, Ioana Ciornei wrote:
> From: Ioana Ciornei <ioana.ciornei@nxp.com>
> 
> Implement the .ndo_start_xmit() callback for the switch port interfaces.
> For each of the switch ports, gather the corresponding queue
> destination ID (QDID) necessary for Tx enqueueing.
> 
> We'll reserve 64 bytes for software annotations, where we keep a skb
> backpointer used on the Tx confirmation side for releasing the allocated
> memory. At the moment, we only support linear skbs.
> 
> Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
> ---
> @@ -554,8 +560,10 @@ static int dpaa2_switch_port_open(struct net_device *netdev)
>  	struct ethsw_core *ethsw = port_priv->ethsw_data;
>  	int err;
>  
> -	/* No need to allow Tx as control interface is disabled */
> -	netif_tx_stop_all_queues(netdev);
> +	if (!dpaa2_switch_has_ctrl_if(port_priv->ethsw_data)) {
> +		/* No need to allow Tx as control interface is disabled */
> +		netif_tx_stop_all_queues(netdev);

Personal taste probably, but you could remove the braces here.

> +	}
>  
>  	/* Explicitly set carrier off, otherwise
>  	 * netif_carrier_ok() will return true and cause 'ip link show'
> @@ -610,15 +618,6 @@ static int dpaa2_switch_port_stop(struct net_device *netdev)
>  	return 0;
>  }
>  
> +static netdev_tx_t dpaa2_switch_port_tx(struct sk_buff *skb,
> +					struct net_device *net_dev)
> +{
> +	struct ethsw_port_priv *port_priv = netdev_priv(net_dev);
> +	struct ethsw_core *ethsw = port_priv->ethsw_data;
> +	int retries = DPAA2_SWITCH_SWP_BUSY_RETRIES;
> +	struct dpaa2_fd fd;
> +	int err;
> +
> +	if (!dpaa2_switch_has_ctrl_if(ethsw))
> +		goto err_free_skb;
> +
> +	if (unlikely(skb_headroom(skb) < DPAA2_SWITCH_NEEDED_HEADROOM)) {
> +		struct sk_buff *ns;
> +
> +		ns = skb_realloc_headroom(skb, DPAA2_SWITCH_NEEDED_HEADROOM);

Looks like this passion for skb_realloc_headroom runs in the company?
Few other drivers use it, and Claudiu just had a bug with that in gianfar.
Luckily what saves you from the same bug is the skb_unshare from right below.
Maybe you could use skb_cow_head and simplify this a bit?

> +		if (unlikely(!ns)) {
> +			netdev_err(net_dev, "Error reallocating skb headroom\n");
> +			goto err_free_skb;
> +		}
> +		dev_kfree_skb(skb);

Please use dev_consume_skb_any here, as it's not error path. Or, if you
use skb_cow_head, only the skb data will be reallocated, not the skb
structure itself, so there will be no consume_skb in that case at all,
another reason to simplify.
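
For reference, a minimal sketch of the simplification being suggested
(hypothetical, not part of the posted patch): skb_cow_head() reallocates
the skb data in place when headroom is short or the header is cloned, and
on failure the skb is still owned by the caller, so the existing
err_free_skb label would still apply.

	if (unlikely(skb_cow_head(skb, DPAA2_SWITCH_NEEDED_HEADROOM)))
		goto err_free_skb;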

> +		skb = ns;
> +	}
> +
> +	/* We'll be holding a back-reference to the skb until Tx confirmation */
> +	skb = skb_unshare(skb, GFP_ATOMIC);
> +	if (unlikely(!skb)) {
> +		/* skb_unshare() has already freed the skb */
> +		netdev_err(net_dev, "Error copying the socket buffer\n");
> +		goto err_exit;
> +	}
> +
> +	if (skb_is_nonlinear(skb)) {
> +		netdev_err(net_dev, "No support for non-linear SKBs!\n");

Rate-limit maybe?
And what is the reason for no non-linear skb's? Too much code to copy
from dpaa2-eth?
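
As a sketch of the rate-limiting idea (hypothetical, using the stock
net_ratelimit() helper):

	if (skb_is_nonlinear(skb)) {
		if (net_ratelimit())
			netdev_err(net_dev, "No support for non-linear SKBs!\n");
		goto err_free_skb;
	}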

> +		goto err_free_skb;
> +	}
> +
> +	err = dpaa2_switch_build_single_fd(ethsw, skb, &fd);
> +	if (unlikely(err)) {
> +		netdev_err(net_dev, "ethsw_build_*_fd() %d\n", err);
> +		goto err_free_skb;
> +	}
> +
> +	do {
> +		err = dpaa2_io_service_enqueue_qd(NULL,
> +						  port_priv->tx_qdid,
> +						  8, 0, &fd);
> +		retries--;
> +	} while (err == -EBUSY && retries);
> +
> +	if (unlikely(err < 0)) {
> +		dpaa2_switch_free_fd(ethsw, &fd);
> +		goto err_exit;
> +	}
> +
> +	return NETDEV_TX_OK;
> +
> +err_free_skb:
> +	dev_kfree_skb(skb);
> +err_exit:
> +	return NETDEV_TX_OK;
> +}
> +
> diff --git a/drivers/staging/fsl-dpaa2/ethsw/ethsw.h b/drivers/staging/fsl-dpaa2/ethsw/ethsw.h
> index bd24be2c6308..b267c04e2008 100644
> --- a/drivers/staging/fsl-dpaa2/ethsw/ethsw.h
> +++ b/drivers/staging/fsl-dpaa2/ethsw/ethsw.h
> @@ -66,6 +66,19 @@
>   */
>  #define DPAA2_SWITCH_SWP_BUSY_RETRIES		1000
>  
> +/* Hardware annotation buffer size */
> +#define DPAA2_SWITCH_HWA_SIZE		64
> +/* Software annotation buffer size */
> +#define DPAA2_SWITCH_SWA_SIZE		64
> +
> +#define DPAA2_SWITCH_TX_BUF_ALIGN	64

Could you align all of these to the "1000" from DPAA2_SWITCH_SWP_BUSY_RETRIES?

> +
> +#define DPAA2_SWITCH_TX_DATA_OFFSET \
> +	(DPAA2_SWITCH_HWA_SIZE + DPAA2_SWITCH_SWA_SIZE)
> +
> +#define DPAA2_SWITCH_NEEDED_HEADROOM \
> +	(DPAA2_SWITCH_TX_DATA_OFFSET + DPAA2_SWITCH_TX_BUF_ALIGN)
> +

Ironically, you create a definition for NEEDED_HEADROOM but you do not
set dev->needed_headroom.
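
A minimal sketch of that missing piece (hypothetical; "port_netdev" stands
for whatever the port setup path names the net_device), so the stack
reserves the headroom up front and the realloc path above becomes the
exception:

	port_netdev->needed_headroom = DPAA2_SWITCH_NEEDED_HEADROOM;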
Andrew Lunn Nov. 5, 2020, 1:04 a.m. UTC | #2
> +static int dpaa2_switch_build_single_fd(struct ethsw_core *ethsw,
> +					struct sk_buff *skb,
> +					struct dpaa2_fd *fd)
> +{
> +	struct device *dev = ethsw->dev;
> +	struct sk_buff **skbh;
> +	dma_addr_t addr;
> +	u8 *buff_start;
> +	void *hwa;
> +
> +	buff_start = PTR_ALIGN(skb->data - DPAA2_SWITCH_TX_DATA_OFFSET -
> +			       DPAA2_SWITCH_TX_BUF_ALIGN,
> +			       DPAA2_SWITCH_TX_BUF_ALIGN);
> +
> +	/* Clear FAS to have consistent values for TX confirmation. It is
> +	 * located in the first 8 bytes of the buffer's hardware annotation
> +	 * area
> +	 */
> +	hwa = buff_start + DPAA2_SWITCH_SWA_SIZE;
> +	memset(hwa, 0, 8);
> +
> +	/* Store a backpointer to the skb at the beginning of the buffer
> +	 * (in the private data area) such that we can release it
> +	 * on Tx confirm
> +	 */
> +	skbh = (struct sk_buff **)buff_start;
> +	*skbh = skb;

Where is the TX confirm which uses this stored pointer. I don't see it
in this file.

It can be expensive to store pointer like this in buffers used for
DMA. It has to be flushed out of the cache here as part of the
send. Then the TX complete needs to invalidate and then read it back
into the cache. Or you use coherent memory which is just slow.

It can be cheaper to keep a parallel ring in cacheable memory which
never gets flushed.

      Andrew
Ioana Ciornei Nov. 5, 2020, 8:11 a.m. UTC | #3
On Wed, Nov 04, 2020 at 11:27:00PM +0200, Vladimir Oltean wrote:
> On Wed, Nov 04, 2020 at 06:57:17PM +0200, Ioana Ciornei wrote:
> > From: Ioana Ciornei <ioana.ciornei@nxp.com>
> > 
> > Implement the .ndo_start_xmit() callback for the switch port interfaces.
> > For each of the switch ports, gather the corresponding queue
> > destination ID (QDID) necessary for Tx enqueueing.
> > 
> > We'll reserve 64 bytes for software annotations, where we keep a skb
> > backpointer used on the Tx confirmation side for releasing the allocated
> > memory. At the moment, we only support linear skbs.
> > 
> > Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
> > ---
> > @@ -554,8 +560,10 @@ static int dpaa2_switch_port_open(struct net_device *netdev)
> >  	struct ethsw_core *ethsw = port_priv->ethsw_data;
> >  	int err;
> >  
> > -	/* No need to allow Tx as control interface is disabled */
> > -	netif_tx_stop_all_queues(netdev);
> > +	if (!dpaa2_switch_has_ctrl_if(port_priv->ethsw_data)) {
> > +		/* No need to allow Tx as control interface is disabled */
> > +		netif_tx_stop_all_queues(netdev);
> 
> Personal taste probably, but you could remove the braces here.

Usually checkpatch complains about this kind of thing but not this time.
Maybe it takes into account the comment as well..

I'll remove the braces.

> 
> > +	}
> >  
> >  	/* Explicitly set carrier off, otherwise
> >  	 * netif_carrier_ok() will return true and cause 'ip link show'
> > @@ -610,15 +618,6 @@ static int dpaa2_switch_port_stop(struct net_device *netdev)
> >  	return 0;
> >  }
> >  
> > +static netdev_tx_t dpaa2_switch_port_tx(struct sk_buff *skb,
> > +					struct net_device *net_dev)
> > +{
> > +	struct ethsw_port_priv *port_priv = netdev_priv(net_dev);
> > +	struct ethsw_core *ethsw = port_priv->ethsw_data;
> > +	int retries = DPAA2_SWITCH_SWP_BUSY_RETRIES;
> > +	struct dpaa2_fd fd;
> > +	int err;
> > +
> > +	if (!dpaa2_switch_has_ctrl_if(ethsw))
> > +		goto err_free_skb;
> > +
> > +	if (unlikely(skb_headroom(skb) < DPAA2_SWITCH_NEEDED_HEADROOM)) {
> > +		struct sk_buff *ns;
> > +
> > +		ns = skb_realloc_headroom(skb, DPAA2_SWITCH_NEEDED_HEADROOM);
> 
> Looks like this passion for skb_realloc_headroom runs in the company?

Not really, ocelot and sja1105 are safe :)

> Few other drivers use it, and Claudiu just had a bug with that in gianfar.
> Luckily what saves you from the same bug is the skb_unshare from right below.
> Maybe you could use skb_cow_head and simplify this a bit?
> 
> > +		if (unlikely(!ns)) {
> > +			netdev_err(net_dev, "Error reallocating skb headroom\n");
> > +			goto err_free_skb;
> > +		}
> > +		dev_kfree_skb(skb);
> 
> Please use dev_consume_skb_any here, as it's not error path. Or, if you
> use skb_cow_head, only the skb data will be reallocated, not the skb
> structure itself, so there will be no consume_skb in that case at all,
> another reason to simplify.

Ok, I can try that.

How dpaa2-eth deals now with this is to just create a S/G FD when the
headroom is less than what's necessary, so no skb_realloc_headroom() or
skb_cow_head(). But I agree that it's best to make it as simple as
possible.

> 
> > +		skb = ns;
> > +	}
> > +
> > +	/* We'll be holding a back-reference to the skb until Tx confirmation */
> > +	skb = skb_unshare(skb, GFP_ATOMIC);
> > +	if (unlikely(!skb)) {
> > +		/* skb_unshare() has already freed the skb */
> > +		netdev_err(net_dev, "Error copying the socket buffer\n");
> > +		goto err_exit;
> > +	}
> > +
> > +	if (skb_is_nonlinear(skb)) {
> > +		netdev_err(net_dev, "No support for non-linear SKBs!\n");
> 
> Rate-limit maybe?

Yep, that probably should be rate-limited.

> And what is the reason for no non-linear skb's? Too much code to copy
> from dpaa2-eth?

Once this is out of staging, dpaa2-eth and dpaa2-switch could share
the Tx/Rx code path so, as you said, I just didn't want to duplicate
everything if it's not specifically needed.

> > diff --git a/drivers/staging/fsl-dpaa2/ethsw/ethsw.h b/drivers/staging/fsl-dpaa2/ethsw/ethsw.h
> > index bd24be2c6308..b267c04e2008 100644
> > --- a/drivers/staging/fsl-dpaa2/ethsw/ethsw.h
> > +++ b/drivers/staging/fsl-dpaa2/ethsw/ethsw.h
> > @@ -66,6 +66,19 @@
> >   */
> >  #define DPAA2_SWITCH_SWP_BUSY_RETRIES		1000
> >  
> > +/* Hardware annotation buffer size */
> > +#define DPAA2_SWITCH_HWA_SIZE		64
> > +/* Software annotation buffer size */
> > +#define DPAA2_SWITCH_SWA_SIZE		64
> > +
> > +#define DPAA2_SWITCH_TX_BUF_ALIGN	64
> 
> Could you align all of these to the "1000" from DPAA2_SWITCH_SWP_BUSY_RETRIES?
> 

Sure.

> > +
> > +#define DPAA2_SWITCH_TX_DATA_OFFSET \
> > +	(DPAA2_SWITCH_HWA_SIZE + DPAA2_SWITCH_SWA_SIZE)
> > +
> > +#define DPAA2_SWITCH_NEEDED_HEADROOM \
> > +	(DPAA2_SWITCH_TX_DATA_OFFSET + DPAA2_SWITCH_TX_BUF_ALIGN)
> > +
> 
> Ironically, you create a definition for NEEDED_HEADROOM but you do not
> set dev->needed_headroom.
Ioana Ciornei Nov. 5, 2020, 8:25 a.m. UTC | #4
On Thu, Nov 05, 2020 at 02:04:39AM +0100, Andrew Lunn wrote:
> > +static int dpaa2_switch_build_single_fd(struct ethsw_core *ethsw,
> > +					struct sk_buff *skb,
> > +					struct dpaa2_fd *fd)
> > +{
> > +	struct device *dev = ethsw->dev;
> > +	struct sk_buff **skbh;
> > +	dma_addr_t addr;
> > +	u8 *buff_start;
> > +	void *hwa;
> > +
> > +	buff_start = PTR_ALIGN(skb->data - DPAA2_SWITCH_TX_DATA_OFFSET -
> > +			       DPAA2_SWITCH_TX_BUF_ALIGN,
> > +			       DPAA2_SWITCH_TX_BUF_ALIGN);
> > +
> > +	/* Clear FAS to have consistent values for TX confirmation. It is
> > +	 * located in the first 8 bytes of the buffer's hardware annotation
> > +	 * area
> > +	 */
> > +	hwa = buff_start + DPAA2_SWITCH_SWA_SIZE;
> > +	memset(hwa, 0, 8);
> > +
> > +	/* Store a backpointer to the skb at the beginning of the buffer
> > +	 * (in the private data area) such that we can release it
> > +	 * on Tx confirm
> > +	 */
> > +	skbh = (struct sk_buff **)buff_start;
> > +	*skbh = skb;
> 
> Where is the TX confirm which uses this stored pointer. I don't see it
> in this file.
> 

The Tx confirm - dpaa2_switch_tx_conf() - is added in patch 5/9.

> It can be expensive to store pointer like this in buffers used for
> DMA.

Yes, it is. But the hardware does not give us any other indication that
a packet was actually sent so that we can move ahead with consuming the
initial skb.

> It has to be flushed out of the cache here as part of the
> send. Then the TX complete needs to invalidate and then read it back
> into the cache. Or you use coherent memory which is just slow.
> 
> It can be cheaper to keep a parallel ring in cacheable memory which
> never gets flushed.

I'm afraid I don't really understand your suggestion. In this parallel
ring I would keep the skb pointers of all frames which are in-flight?
Then, when a packet is received on the Tx confirmation queue I would
have to loop over the parallel ring and determine somehow which skb was
this packet initially associated to. Isn't this even more expensive?

Ioana
Andrew Lunn Nov. 5, 2020, 1:45 p.m. UTC | #5
> > Where is the TX confirm which uses this stored pointer. I don't see it
> > in this file.
> > 
> 
> The Tx confirm - dpaa2_switch_tx_conf() - is added in patch 5/9.

Not so obvious. Could it be moved here?

> > It can be expensive to store pointer like this in buffers used for
> > DMA.
> 
> Yes, it is. But the hardware does not give us any other indication that
> a packet was actually sent so that we can move ahead with consuming the
> initial skb.
> 
> > It has to be flushed out of the cache here as part of the
> > send. Then the TX complete needs to invalidate and then read it back
> > into the cache. Or you use coherent memory which is just slow.
> > 
> > It can be cheaper to keep a parallel ring in cacheable memory which
> > never gets flushed.
> 
> I'm afraid I don't really understand your suggestion. In this parallel
> ring I would keep the skb pointers of all frames which are in-flight?
> Then, when a packet is received on the Tx confirmation queue I would
> have to loop over the parallel ring and determine somehow which skb was
> this packet initially associated to. Isn't this even more expensive?

I don't know this particular hardware, so i will talk in general
terms. Generally, you have a transmit ring. You add new frames to be
sent to the beginning of the ring, and you take off completed frames
from the end of the ring. This is kept in 'expensive' memory, in that
either it is coherent, or you need to do flushed/invalidates.

It is expected that the hardware keeps to ring order. It does not pick
and choose which frames it sends, it does them in order. That means
completion also happens in ring order. So the driver can keep a simple
linear array the size of the ring, in cachable memory, with pointers
to the skbuf. And it just needs a counting index to know which one
just completed.

Now, your hardware is more complex. You have one queue feeding
multiple switch ports. Maybe it does not keep to ring order? If you
have one port running at 10M/Half, and another at 10G/Full, does it
leave frames for the 10/Half port in the ring when its egress queue it
full? That is probably a bad idea, since the 10G/Full port could then
starve for lack of free slots in the ring? So my guess would be, the
frames get dropped. And so ring order is maintained.

If you are paranoid it could get out of sync, keep an array of tuples,
address of the frame descriptor and the skbuf. If the fd address does
not match what you expect, then do the linear search of the fd
address, and increment a counter that something odd has happened.
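
In rough code terms, the idea might look like this (every name and the
power-of-two ring size below are assumptions, nothing is taken from the
driver):

	struct tx_shadow_entry {
		dma_addr_t fd_addr;	/* cross-checked against the confirmed FD */
		struct sk_buff *skb;
	};

	/* enqueue side: record the in-flight frame in cacheable memory */
	shadow[prod & (TX_SHADOW_RING_SIZE - 1)].fd_addr = dpaa2_fd_get_addr(&fd);
	shadow[prod & (TX_SHADOW_RING_SIZE - 1)].skb = skb;
	prod++;

	/* Tx confirmation side: completions are expected in ring order */
	entry = &shadow[cons & (TX_SHADOW_RING_SIZE - 1)];
	WARN_ON_ONCE(entry->fd_addr != dpaa2_fd_get_addr(fd));
	napi_consume_skb(entry->skb, budget);
	cons++;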

	 Andrew
Ioana Ciornei Nov. 5, 2020, 3:51 p.m. UTC | #6
On Thu, Nov 05, 2020 at 02:45:12PM +0100, Andrew Lunn wrote:
> > > Where is the TX confirm which uses this stored pointer. I don't see it
> > > in this file.
> > > 
> > 
> > The Tx confirm - dpaa2_switch_tx_conf() - is added in patch 5/9.
> 
> Not so obvious. Could it be moved here?
> 

Sure, I'll move it here so that we have both Tx and Tx confirmation in
the same patch.

> > > It can be expensive to store pointer like this in buffers used for
> > > DMA.
> > 
> > Yes, it is. But the hardware does not give us any other indication that
> > a packet was actually sent so that we can move ahead with consuming the
> > initial skb.
> > 
> > > It has to be flushed out of the cache here as part of the
> > > send. Then the TX complete needs to invalidate and then read it back
> > > into the cache. Or you use coherent memory which is just slow.
> > > 
> > > It can be cheaper to keep a parallel ring in cacheable memory which
> > > never gets flushed.
> > 
> > I'm afraid I don't really understand your suggestion. In this parallel
> > ring I would keep the skb pointers of all frames which are in-flight?
> > Then, when a packet is received on the Tx confirmation queue I would
> > have to loop over the parallel ring and determine somehow which skb was
> > this packet initially associated to. Isn't this even more expensive?
> 
> I don't know this particular hardware, so i will talk in general
> terms. Generally, you have a transmit ring. You add new frames to be
> sent to the beginning of the ring, and you take off completed frames
> from the end of the ring. This is kept in 'expensive' memory, in that
> either it is coherent, or you need to do flushed/invalidates.
> 
> It is expected that the hardware keeps to ring order. It does not pick
> and choose which frames it sends, it does them in order. That means
> completion also happens in ring order. So the driver can keep a simple
> linear array the size of the ring, in cachable memory, with pointers
> to the skbuf. And it just needs a counting index to know which one
> just completed.

I agree with all of the above in a general sense.

> 
> Now, your hardware is more complex. You have one queue feeding
> multiple switch ports.

Not really. I have one Tx queue for each switch port and just one Tx
confirmation queue for all of them.

> Maybe it does not keep to ring order?

If the driver enqueues frames #1, #2, #3 in this exact order on a switch
port then the frames will arrive in the same order on the Tx
confirmation queue irrespective of any other traffic sent on other
switch ports.

> If you
> have one port running at 10M/Half, and another at 10G/Full, does it
> leave frames for the 10/Half port in the ring when its egress queue it
> full? That is probably a bad idea, since the 10G/Full port could then
> starve for lack of free slots in the ring? So my guess would be, the
> frames get dropped. And so ring order is maintained.
> 
> If you are paranoid it could get out of sync, keep an array of tuples,
> address of the frame descriptor and the skbuf. If the fd address does
> not match what you expect, then do the linear search of the fd
> address, and increment a counter that something odd has happened.
> 

The problem with this would be, I think, with two TX softirqs on two
different cores which want to send a frame on the same switch port. In
order to update the shadow ring, there should be some kind of locking
mechanism on the access to the shadow ring, which might invalidate
any attempt to make this more efficient.

This might not be a problem for the dpaa2-switch since it does not
enable NETIF_F_LLTX but it might be for dpaa2-eth.

Also, as the architecture is defined now, the driver does not really see
the Tx queues as being fixed-size so that it can infer the size for the
shadow copy.

I will have to dig a little bit more in this area to understand exactly
why the decision to use skb backpointers was made in the first place (I
am not really talking about the dpaa2-switch here, dpaa2-eth has the
same exact behavior and has been around for some time now).

Ioana

Patch

diff --git a/drivers/staging/fsl-dpaa2/ethsw/dpsw-cmd.h b/drivers/staging/fsl-dpaa2/ethsw/dpsw-cmd.h
index 9caff864ae3e..e239747f8a54 100644
--- a/drivers/staging/fsl-dpaa2/ethsw/dpsw-cmd.h
+++ b/drivers/staging/fsl-dpaa2/ethsw/dpsw-cmd.h
@@ -45,6 +45,8 @@ 
 #define DPSW_CMDID_IF_ENABLE                DPSW_CMD_ID(0x03D)
 #define DPSW_CMDID_IF_DISABLE               DPSW_CMD_ID(0x03E)
 
+#define DPSW_CMDID_IF_GET_ATTR              DPSW_CMD_ID(0x042)
+
 #define DPSW_CMDID_IF_SET_MAX_FRAME_LENGTH  DPSW_CMD_ID(0x044)
 
 #define DPSW_CMDID_IF_GET_LINK_STATE        DPSW_CMD_ID(0x046)
@@ -258,6 +260,28 @@  struct dpsw_cmd_if {
 	__le16 if_id;
 };
 
+#define DPSW_ADMIT_UNTAGGED_SHIFT	0
+#define DPSW_ADMIT_UNTAGGED_SIZE	4
+#define DPSW_ENABLED_SHIFT		5
+#define DPSW_ENABLED_SIZE		1
+#define DPSW_ACCEPT_ALL_VLAN_SHIFT	6
+#define DPSW_ACCEPT_ALL_VLAN_SIZE	1
+
+struct dpsw_rsp_if_get_attr {
+	/* cmd word 0 */
+	/* from LSB: admit_untagged:4 enabled:1 accept_all_vlan:1 */
+	u8 conf;
+	u8 pad1;
+	u8 num_tcs;
+	u8 pad2;
+	__le16 qdid;
+	/* cmd word 1 */
+	__le32 options;
+	__le32 pad3;
+	/* cmd word 2 */
+	__le32 rate;
+};
+
 struct dpsw_cmd_if_set_max_frame_length {
 	__le16 if_id;
 	__le16 frame_length;
diff --git a/drivers/staging/fsl-dpaa2/ethsw/dpsw.c b/drivers/staging/fsl-dpaa2/ethsw/dpsw.c
index 785140d4652c..2a1967754913 100644
--- a/drivers/staging/fsl-dpaa2/ethsw/dpsw.c
+++ b/drivers/staging/fsl-dpaa2/ethsw/dpsw.c
@@ -704,6 +704,47 @@  int dpsw_if_disable(struct fsl_mc_io *mc_io,
 	return mc_send_command(mc_io, &cmd);
 }
 
+/**
+ * dpsw_if_get_attributes() - Function obtains attributes of interface
+ * @mc_io:	Pointer to MC portal's I/O object
+ * @cmd_flags:	Command flags; one or more of 'MC_CMD_FLAG_'
+ * @token:	Token of DPSW object
+ * @if_id:	Interface Identifier
+ * @attr:	Returned interface attributes
+ *
+ * Return:	Completion status. '0' on Success; Error code otherwise.
+ */
+int dpsw_if_get_attributes(struct fsl_mc_io *mc_io, u32 cmd_flags, u16 token,
+			   u16 if_id, struct dpsw_if_attr *attr)
+{
+	struct dpsw_rsp_if_get_attr *rsp_params;
+	struct fsl_mc_command cmd = { 0 };
+	struct dpsw_cmd_if *cmd_params;
+	int err;
+
+	cmd.header = mc_encode_cmd_header(DPSW_CMDID_IF_GET_ATTR, cmd_flags,
+					  token);
+	cmd_params = (struct dpsw_cmd_if *)cmd.params;
+	cmd_params->if_id = cpu_to_le16(if_id);
+
+	err = mc_send_command(mc_io, &cmd);
+	if (err)
+		return err;
+
+	rsp_params = (struct dpsw_rsp_if_get_attr *)cmd.params;
+	attr->num_tcs = rsp_params->num_tcs;
+	attr->rate = le32_to_cpu(rsp_params->rate);
+	attr->options = le32_to_cpu(rsp_params->options);
+	attr->qdid = le16_to_cpu(rsp_params->qdid);
+	attr->enabled = dpsw_get_field(rsp_params->conf, ENABLED);
+	attr->accept_all_vlan = dpsw_get_field(rsp_params->conf,
+					       ACCEPT_ALL_VLAN);
+	attr->admit_untagged = dpsw_get_field(rsp_params->conf,
+					      ADMIT_UNTAGGED);
+
+	return 0;
+}
+
 /**
  * dpsw_if_set_max_frame_length() - Set Maximum Receive frame length.
  * @mc_io:		Pointer to MC portal's I/O object
diff --git a/drivers/staging/fsl-dpaa2/ethsw/dpsw.h b/drivers/staging/fsl-dpaa2/ethsw/dpsw.h
index 718242ea7d87..a6ed6860946c 100644
--- a/drivers/staging/fsl-dpaa2/ethsw/dpsw.h
+++ b/drivers/staging/fsl-dpaa2/ethsw/dpsw.h
@@ -440,6 +440,34 @@  int dpsw_if_disable(struct fsl_mc_io *mc_io,
 		    u16 token,
 		    u16 if_id);
 
+/**
+ * struct dpsw_if_attr - Structure representing DPSW interface attributes
+ * @num_tcs: Number of traffic classes
+ * @rate: Transmit rate in bits per second
+ * @options: Interface configuration options (bitmap)
+ * @enabled: Indicates if interface is enabled
+ * @accept_all_vlan: The device discards/accepts incoming frames
+ *		for VLANs that do not include this interface
+ * @admit_untagged: When set to 'DPSW_ADMIT_ONLY_VLAN_TAGGED', the device
+ *		discards untagged frames or priority-tagged frames received on
+ *		this interface;
+ *		When set to 'DPSW_ADMIT_ALL', untagged frames or priority-
+ *		tagged frames received on this interface are accepted
+ * @qdid: control frames transmit qdid
+ */
+struct dpsw_if_attr {
+	u8 num_tcs;
+	u32 rate;
+	u32 options;
+	int enabled;
+	int accept_all_vlan;
+	enum dpsw_accepted_frames admit_untagged;
+	u16 qdid;
+};
+
+int dpsw_if_get_attributes(struct fsl_mc_io *mc_io, u32 cmd_flags, u16 token,
+			   u16 if_id, struct dpsw_if_attr *attr);
+
 int dpsw_if_set_max_frame_length(struct fsl_mc_io *mc_io,
 				 u32 cmd_flags,
 				 u16 token,
diff --git a/drivers/staging/fsl-dpaa2/ethsw/ethsw.c b/drivers/staging/fsl-dpaa2/ethsw/ethsw.c
index d09aa4a5126a..7805f0e58ec2 100644
--- a/drivers/staging/fsl-dpaa2/ethsw/ethsw.c
+++ b/drivers/staging/fsl-dpaa2/ethsw/ethsw.c
@@ -477,6 +477,7 @@  static int dpaa2_switch_port_change_mtu(struct net_device *netdev, int mtu)
 static int dpaa2_switch_port_carrier_state_sync(struct net_device *netdev)
 {
 	struct ethsw_port_priv *port_priv = netdev_priv(netdev);
+	struct ethsw_core *ethsw = port_priv->ethsw_data;
 	struct dpsw_link_state state;
 	int err;
 
@@ -497,10 +498,15 @@  static int dpaa2_switch_port_carrier_state_sync(struct net_device *netdev)
 	WARN_ONCE(state.up > 1, "Garbage read into link_state");
 
 	if (state.up != port_priv->link_state) {
-		if (state.up)
+		if (state.up) {
 			netif_carrier_on(netdev);
-		else
+			if (dpaa2_switch_has_ctrl_if(ethsw))
+				netif_tx_start_all_queues(netdev);
+		} else {
 			netif_carrier_off(netdev);
+			if (dpaa2_switch_has_ctrl_if(ethsw))
+				netif_tx_stop_all_queues(netdev);
+		}
 		port_priv->link_state = state.up;
 	}
 
@@ -554,8 +560,10 @@  static int dpaa2_switch_port_open(struct net_device *netdev)
 	struct ethsw_core *ethsw = port_priv->ethsw_data;
 	int err;
 
-	/* No need to allow Tx as control interface is disabled */
-	netif_tx_stop_all_queues(netdev);
+	if (!dpaa2_switch_has_ctrl_if(port_priv->ethsw_data)) {
+		/* No need to allow Tx as control interface is disabled */
+		netif_tx_stop_all_queues(netdev);
+	}
 
 	/* Explicitly set carrier off, otherwise
 	 * netif_carrier_ok() will return true and cause 'ip link show'
@@ -610,15 +618,6 @@  static int dpaa2_switch_port_stop(struct net_device *netdev)
 	return 0;
 }
 
-static netdev_tx_t dpaa2_switch_port_dropframe(struct sk_buff *skb,
-					       struct net_device *netdev)
-{
-	/* we don't support I/O for now, drop the frame */
-	dev_kfree_skb_any(skb);
-
-	return NETDEV_TX_OK;
-}
-
 static int dpaa2_switch_port_parent_id(struct net_device *dev,
 				       struct netdev_phys_item_id *ppid)
 {
@@ -835,6 +834,114 @@  static void dpaa2_switch_free_fd(const struct ethsw_core *ethsw,
 	dev_kfree_skb(skb);
 }
 
+static int dpaa2_switch_build_single_fd(struct ethsw_core *ethsw,
+					struct sk_buff *skb,
+					struct dpaa2_fd *fd)
+{
+	struct device *dev = ethsw->dev;
+	struct sk_buff **skbh;
+	dma_addr_t addr;
+	u8 *buff_start;
+	void *hwa;
+
+	buff_start = PTR_ALIGN(skb->data - DPAA2_SWITCH_TX_DATA_OFFSET -
+			       DPAA2_SWITCH_TX_BUF_ALIGN,
+			       DPAA2_SWITCH_TX_BUF_ALIGN);
+
+	/* Clear FAS to have consistent values for TX confirmation. It is
+	 * located in the first 8 bytes of the buffer's hardware annotation
+	 * area
+	 */
+	hwa = buff_start + DPAA2_SWITCH_SWA_SIZE;
+	memset(hwa, 0, 8);
+
+	/* Store a backpointer to the skb at the beginning of the buffer
+	 * (in the private data area) such that we can release it
+	 * on Tx confirm
+	 */
+	skbh = (struct sk_buff **)buff_start;
+	*skbh = skb;
+
+	addr = dma_map_single(dev, buff_start,
+			      skb_tail_pointer(skb) - buff_start,
+			      DMA_TO_DEVICE);
+	if (unlikely(dma_mapping_error(dev, addr)))
+		return -ENOMEM;
+
+	/* Setup the FD fields */
+	memset(fd, 0, sizeof(*fd));
+
+	dpaa2_fd_set_addr(fd, addr);
+	dpaa2_fd_set_offset(fd, (u16)(skb->data - buff_start));
+	dpaa2_fd_set_len(fd, skb->len);
+	dpaa2_fd_set_format(fd, dpaa2_fd_single);
+
+	return 0;
+}
+
+static netdev_tx_t dpaa2_switch_port_tx(struct sk_buff *skb,
+					struct net_device *net_dev)
+{
+	struct ethsw_port_priv *port_priv = netdev_priv(net_dev);
+	struct ethsw_core *ethsw = port_priv->ethsw_data;
+	int retries = DPAA2_SWITCH_SWP_BUSY_RETRIES;
+	struct dpaa2_fd fd;
+	int err;
+
+	if (!dpaa2_switch_has_ctrl_if(ethsw))
+		goto err_free_skb;
+
+	if (unlikely(skb_headroom(skb) < DPAA2_SWITCH_NEEDED_HEADROOM)) {
+		struct sk_buff *ns;
+
+		ns = skb_realloc_headroom(skb, DPAA2_SWITCH_NEEDED_HEADROOM);
+		if (unlikely(!ns)) {
+			netdev_err(net_dev, "Error reallocating skb headroom\n");
+			goto err_free_skb;
+		}
+		dev_kfree_skb(skb);
+		skb = ns;
+	}
+
+	/* We'll be holding a back-reference to the skb until Tx confirmation */
+	skb = skb_unshare(skb, GFP_ATOMIC);
+	if (unlikely(!skb)) {
+		/* skb_unshare() has already freed the skb */
+		netdev_err(net_dev, "Error copying the socket buffer\n");
+		goto err_exit;
+	}
+
+	if (skb_is_nonlinear(skb)) {
+		netdev_err(net_dev, "No support for non-linear SKBs!\n");
+		goto err_free_skb;
+	}
+
+	err = dpaa2_switch_build_single_fd(ethsw, skb, &fd);
+	if (unlikely(err)) {
+		netdev_err(net_dev, "ethsw_build_*_fd() %d\n", err);
+		goto err_free_skb;
+	}
+
+	do {
+		err = dpaa2_io_service_enqueue_qd(NULL,
+						  port_priv->tx_qdid,
+						  8, 0, &fd);
+		retries--;
+	} while (err == -EBUSY && retries);
+
+	if (unlikely(err < 0)) {
+		dpaa2_switch_free_fd(ethsw, &fd);
+		goto err_exit;
+	}
+
+	return NETDEV_TX_OK;
+
+err_free_skb:
+	dev_kfree_skb(skb);
+err_exit:
+	return NETDEV_TX_OK;
+}
+
 static const struct net_device_ops dpaa2_switch_port_ops = {
 	.ndo_open		= dpaa2_switch_port_open,
 	.ndo_stop		= dpaa2_switch_port_stop,
@@ -848,7 +955,7 @@  static const struct net_device_ops dpaa2_switch_port_ops = {
 	.ndo_fdb_del		= dpaa2_switch_port_fdb_del,
 	.ndo_fdb_dump		= dpaa2_switch_port_fdb_dump,
 
-	.ndo_start_xmit		= dpaa2_switch_port_dropframe,
+	.ndo_start_xmit		= dpaa2_switch_port_tx,
 	.ndo_get_port_parent_id	= dpaa2_switch_port_parent_id,
 	.ndo_get_phys_port_name = dpaa2_switch_port_get_phys_name,
 };
@@ -2237,6 +2344,7 @@  static int dpaa2_switch_port_init(struct ethsw_port_priv *port_priv, u16 port)
 {
 	struct net_device *netdev = port_priv->netdev;
 	struct ethsw_core *ethsw = port_priv->ethsw_data;
+	struct dpsw_if_attr dpsw_if_attr;
 	struct dpsw_vlan_if_cfg vcfg;
 	int err;
 
@@ -2263,7 +2371,15 @@  static int dpaa2_switch_port_init(struct ethsw_port_priv *port_priv, u16 port)
 	if (err)
 		netdev_err(netdev, "dpsw_vlan_remove_if err %d\n", err);
 
-	return err;
+	err = dpsw_if_get_attributes(ethsw->mc_io, 0, ethsw->dpsw_handle,
+				     port_priv->idx, &dpsw_if_attr);
+	if (err) {
+		netdev_err(netdev, "dpsw_if_get_attributes err %d\n", err);
+		return err;
+	}
+	port_priv->tx_qdid = dpsw_if_attr.qdid;
+
+	return 0;
 }
 
 static void dpaa2_switch_unregister_notifier(struct device *dev)
diff --git a/drivers/staging/fsl-dpaa2/ethsw/ethsw.h b/drivers/staging/fsl-dpaa2/ethsw/ethsw.h
index bd24be2c6308..b267c04e2008 100644
--- a/drivers/staging/fsl-dpaa2/ethsw/ethsw.h
+++ b/drivers/staging/fsl-dpaa2/ethsw/ethsw.h
@@ -66,6 +66,19 @@ 
  */
 #define DPAA2_SWITCH_SWP_BUSY_RETRIES		1000
 
+/* Hardware annotation buffer size */
+#define DPAA2_SWITCH_HWA_SIZE		64
+/* Software annotation buffer size */
+#define DPAA2_SWITCH_SWA_SIZE		64
+
+#define DPAA2_SWITCH_TX_BUF_ALIGN	64
+
+#define DPAA2_SWITCH_TX_DATA_OFFSET \
+	(DPAA2_SWITCH_HWA_SIZE + DPAA2_SWITCH_SWA_SIZE)
+
+#define DPAA2_SWITCH_NEEDED_HEADROOM \
+	(DPAA2_SWITCH_TX_DATA_OFFSET + DPAA2_SWITCH_TX_BUF_ALIGN)
+
 extern const struct ethtool_ops dpaa2_switch_port_ethtool_ops;
 
 struct ethsw_core;
@@ -92,6 +105,7 @@  struct ethsw_port_priv {
 	u8			vlans[VLAN_VID_MASK + 1];
 	u16			pvid;
 	struct net_device	*bridge_dev;
+	u16			tx_qdid;
 };
 
 /* Switch data */