diff mbox series

[5/5] examples/l3fwd: add neon support for l3fwd

Message ID 1493709255-8887-5-git-send-email-jianbo.liu@linaro.org
State New
Headers show
Series [1/5] examples/l3fwd: extract arch independent code from multi hash lookup | expand

Commit Message

Jianbo Liu May 2, 2017, 7:14 a.m. UTC
Use ARM NEON intrinsics to accelerate l3 forwarding.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>

---
 examples/l3fwd/l3fwd.h             |   4 -
 examples/l3fwd/l3fwd_em.c          |   4 +-
 examples/l3fwd/l3fwd_em_hlm.h      |   5 +
 examples/l3fwd/l3fwd_em_hlm_neon.h |  74 +++++++++++
 examples/l3fwd/l3fwd_em_single.h   |   4 +
 examples/l3fwd/l3fwd_lpm.c         |   4 +-
 examples/l3fwd/l3fwd_lpm_neon.h    | 157 ++++++++++++++++++++++
 examples/l3fwd/l3fwd_neon.h        | 259 +++++++++++++++++++++++++++++++++++++
 8 files changed, 504 insertions(+), 7 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h
 create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h
 create mode 100644 examples/l3fwd/l3fwd_neon.h

-- 
1.8.3.1

Comments

Sekhar, Ashwin May 2, 2017, 11:20 a.m. UTC | #1
Hi,

Please find comments inline.

On Tue, 2017-05-02 at 15:14 +0800, Jianbo Liu wrote:
> Use ARM NEON intrinsics to accelerate l3 fowarding.

> 

> Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>

> ---

>  examples/l3fwd/l3fwd.h             |   4 -

>  examples/l3fwd/l3fwd_em.c          |   4 +-

>  examples/l3fwd/l3fwd_em_hlm.h      |   5 +

>  examples/l3fwd/l3fwd_em_hlm_neon.h |  74 +++++++++++

>  examples/l3fwd/l3fwd_em_single.h   |   4 +

>  examples/l3fwd/l3fwd_lpm.c         |   4 +-

>  examples/l3fwd/l3fwd_lpm_neon.h    | 157 ++++++++++++++++++++++

>  examples/l3fwd/l3fwd_neon.h        | 259

> +++++++++++++++++++++++++++++++++++++

>  8 files changed, 504 insertions(+), 7 deletions(-)

>  create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h

>  create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h

>  create mode 100644 examples/l3fwd/l3fwd_neon.h

> 

> diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h

> index 011ba14..c45589a 100644

> --- a/examples/l3fwd/l3fwd.h

> +++ b/examples/l3fwd/l3fwd.h

> @@ -40,10 +40,6 @@

>  

>  #define RTE_LOGTYPE_L3FWD RTE_LOGTYPE_USER1

>  

> -#if !defined(NO_HASH_MULTI_LOOKUP) &&

> defined(RTE_MACHINE_CPUFLAG_NEON)

> -#define NO_HASH_MULTI_LOOKUP 1

> -#endif

> -

>  #define MAX_PKT_BURST     32

>  #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

>  

> diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c

> index cccf797..ac1e2e0 100644

> --- a/examples/l3fwd/l3fwd_em.c

> +++ b/examples/l3fwd/l3fwd_em.c

> @@ -328,7 +328,7 @@ struct ipv6_l3fwd_em_route {

>  	return (uint8_t)((ret < 0) ? portid :

> ipv6_l3fwd_out_if[ret]);

>  }

>  

> -#if defined(__SSE4_1__)

> +#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)

>  #if defined(NO_HASH_MULTI_LOOKUP)

>  #include "l3fwd_em_single.h"

>  #else

> @@ -709,7 +709,7 @@ struct ipv6_l3fwd_em_route {

>  			if (nb_rx == 0)

>  				continue;

>  

> -#if defined(__SSE4_1__)

> +#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)

>  			l3fwd_em_send_packets(nb_rx, pkts_burst,

>  							portid,

> qconf);

>  #else

> diff --git a/examples/l3fwd/l3fwd_em_hlm.h

> b/examples/l3fwd/l3fwd_em_hlm.h

> index 636dea4..3329c1a 100644

> --- a/examples/l3fwd/l3fwd_em_hlm.h

> +++ b/examples/l3fwd/l3fwd_em_hlm.h

> @@ -35,8 +35,13 @@

>  #ifndef __L3FWD_EM_HLM_H__

>  #define __L3FWD_EM_HLM_H__

>  

> +#if defined(__SSE4_1__)

>  #include "l3fwd_sse.h"

>  #include "l3fwd_em_hlm_sse.h"

> +#elif defined(RTE_MACHINE_CPUFLAG_NEON)

> +#include "l3fwd_neon.h"

> +#include "l3fwd_em_hlm_neon.h"

> +#endif

>  

>  static inline __attribute__((always_inline)) void

>  em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf

> *m[8],

> diff --git a/examples/l3fwd/l3fwd_em_hlm_neon.h

> b/examples/l3fwd/l3fwd_em_hlm_neon.h

> new file mode 100644

> index 0000000..dae1acf

> --- /dev/null

> +++ b/examples/l3fwd/l3fwd_em_hlm_neon.h

> @@ -0,0 +1,74 @@

> +/*-

> + *   BSD LICENSE

> + *

> + *   Copyright(c) 2016 Intel Corporation. All rights reserved.

> + *   Copyright(c) 2017, Linaro Limited

> + *   All rights reserved.

> + *

> + *   Redistribution and use in source and binary forms, with or

> without

> + *   modification, are permitted provided that the following

> conditions

> + *   are met:

> + *

> + *     * Redistributions of source code must retain the above

> copyright

> + *       notice, this list of conditions and the following

> disclaimer.

> + *     * Redistributions in binary form must reproduce the above

> copyright

> + *       notice, this list of conditions and the following

> disclaimer in

> + *       the documentation and/or other materials provided with the

> + *       distribution.

> + *     * Neither the name of Intel Corporation nor the names of its

> + *       contributors may be used to endorse or promote products

> derived

> + *       from this software without specific prior written

> permission.

> + *

> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND

> CONTRIBUTORS

> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT

> NOT

> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND

> FITNESS FOR

> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE

> COPYRIGHT

> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,

> INCIDENTAL,

> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT

> NOT

> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS

> OF USE,

> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

> ON ANY

> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR

> TORT

> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF

> THE USE

> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH

> DAMAGE.

> + */

> +

> +#ifndef __L3FWD_EM_HLM_NEON_H__

> +#define __L3FWD_EM_HLM_NEON_H__

> +

> +#include <arm_neon.h>

> +

> +static inline void

> +get_ipv4_5tuple(struct rte_mbuf *m0, int32x4_t mask0,

> +		union ipv4_5tuple_host *key)

> +{

> +	int32x4_t tmpdata0 = vld1q_s32(rte_pktmbuf_mtod_offset(m0,

> int32_t *,

> +				sizeof(struct ether_hdr) +

> +				offsetof(struct ipv4_hdr,

> time_to_live)));

> +

> +	key->xmm = vandq_s32(tmpdata0, mask0);

> +}

> +

> +static inline void

> +get_ipv6_5tuple(struct rte_mbuf *m0, int32x4_t mask0,

> +		int32x4_t mask1, union ipv6_5tuple_host *key)

> +{

> +	int32x4_t tmpdata0 = vld1q_s32(

> +			rte_pktmbuf_mtod_offset(m0, int *,

> +				sizeof(struct ether_hdr) +

> +				offsetof(struct ipv6_hdr,

> payload_len)));

> +

> +	int32x4_t tmpdata1 = vld1q_s32(

> +			rte_pktmbuf_mtod_offset(m0, int *,

> +				sizeof(struct ether_hdr) +

> +				offsetof(struct ipv6_hdr,

> payload_len) + 8));

> +

> +	int32x4_t tmpdata2 = vld1q_s32(

> +			rte_pktmbuf_mtod_offset(m0, int *,

> +				sizeof(struct ether_hdr) +

> +				offsetof(struct ipv6_hdr,

> payload_len) + 16));

> +

> +	key->xmm[0] = vandq_s32(tmpdata0, mask0);

> +	key->xmm[1] = tmpdata1;

> +	key->xmm[2] = vandq_s32(tmpdata2, mask1);

> +}

> +#endif /* __L3FWD_EM_HLM_NEON_H__ */

> diff --git a/examples/l3fwd/l3fwd_em_single.h

> b/examples/l3fwd/l3fwd_em_single.h

> index c0a9725..8604571 100644

> --- a/examples/l3fwd/l3fwd_em_single.h

> +++ b/examples/l3fwd/l3fwd_em_single.h

> @@ -43,7 +43,11 @@

>   * compilation time.

>   */

>  

> +#if defined(__SSE4_1__)

>  #include "l3fwd_sse.h"

> +#elif defined(RTE_MACHINE_CPUFLAG_NEON)

> +#include "l3fwd_neon.h"

> +#endif

>  

>  static inline __attribute__((always_inline)) uint16_t

>  em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf

> *pkt,

> diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c

> index fc554fc..ddef250 100644

> --- a/examples/l3fwd/l3fwd_lpm.c

> +++ b/examples/l3fwd/l3fwd_lpm.c

> @@ -189,6 +189,8 @@ static inline __attribute__((always_inline))

> uint16_t

>  

>  #if defined(__SSE4_1__)

>  #include "l3fwd_lpm_sse.h"

> +#elif defined(RTE_MACHINE_CPUFLAG_NEON)

> +#include "l3fwd_lpm_neon.h"

>  #else

>  #include "l3fwd_lpm.h"

>  #endif

> @@ -261,7 +263,7 @@ static inline __attribute__((always_inline))

> uint16_t

>  			if (nb_rx == 0)

>  				continue;

>  

> -#if defined(__SSE4_1__)

> +#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)

>  			l3fwd_lpm_send_packets(nb_rx, pkts_burst,

>  						portid, qconf);

>  #else

> diff --git a/examples/l3fwd/l3fwd_lpm_neon.h

> b/examples/l3fwd/l3fwd_lpm_neon.h

> new file mode 100644

> index 0000000..772e54b

> --- /dev/null

> +++ b/examples/l3fwd/l3fwd_lpm_neon.h

> @@ -0,0 +1,157 @@

> +/*-

> + *   BSD LICENSE

> + *

> + *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.

> + *   Copyright(c) 2017, Linaro Limited

> + *   All rights reserved.

> + *

> + *   Redistribution and use in source and binary forms, with or

> without

> + *   modification, are permitted provided that the following

> conditions

> + *   are met:

> + *

> + *     * Redistributions of source code must retain the above

> copyright

> + *       notice, this list of conditions and the following

> disclaimer.

> + *     * Redistributions in binary form must reproduce the above

> copyright

> + *       notice, this list of conditions and the following

> disclaimer in

> + *       the documentation and/or other materials provided with the

> + *       distribution.

> + *     * Neither the name of Intel Corporation nor the names of its

> + *       contributors may be used to endorse or promote products

> derived

> + *       from this software without specific prior written

> permission.

> + *

> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND

> CONTRIBUTORS

> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT

> NOT

> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND

> FITNESS FOR

> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE

> COPYRIGHT

> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,

> INCIDENTAL,

> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT

> NOT

> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS

> OF USE,

> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

> ON ANY

> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR

> TORT

> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF

> THE USE

> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH

> DAMAGE.

> + */

> +

> +#ifndef __L3FWD_LPM_NEON_H__

> +#define __L3FWD_LPM_NEON_H__

> +

> +#include <arm_neon.h>

> +

> +#include "l3fwd_neon.h"

> +

> +/*

> + * Read packet_type and destination IPV4 addresses from 4 mbufs.

> + */

> +static inline void

> +processx4_step1(struct rte_mbuf *pkt[FWDSTEP],

> +		int32x4_t *dip,

> +		uint32_t *ipv4_flag)

> +{

> +	struct ipv4_hdr *ipv4_hdr;

> +	struct ether_hdr *eth_hdr;

> +	int32_t dst[FWDSTEP];

> +

> +	eth_hdr = rte_pktmbuf_mtod(pkt[0], struct ether_hdr *);

> +	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);

> +	dst[0] = ipv4_hdr->dst_addr;

> +	ipv4_flag[0] = pkt[0]->packet_type & RTE_PTYPE_L3_IPV4;

> +

> +	eth_hdr = rte_pktmbuf_mtod(pkt[1], struct ether_hdr *);

> +	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);

> +	dst[1] = ipv4_hdr->dst_addr;

> +	ipv4_flag[0] &= pkt[1]->packet_type;

> +

> +	eth_hdr = rte_pktmbuf_mtod(pkt[2], struct ether_hdr *);

> +	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);

> +	dst[2] = ipv4_hdr->dst_addr;

> +	ipv4_flag[0] &= pkt[2]->packet_type;

> +

> +	eth_hdr = rte_pktmbuf_mtod(pkt[3], struct ether_hdr *);

> +	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);

> +	dst[3] = ipv4_hdr->dst_addr;

> +	ipv4_flag[0] &= pkt[3]->packet_type;

> +

> +	dip[0] = vld1q_s32(dst);

> +}

> +

> +/*

> + * Lookup into LPM for destination port.

> + * If lookup fails, use incoming port (portid) as destination port.

> + */

> +static inline void

> +processx4_step2(const struct lcore_conf *qconf,

> +		int32x4_t dip,

> +		uint32_t ipv4_flag,

> +		uint8_t portid,

> +		struct rte_mbuf *pkt[FWDSTEP],

> +		uint16_t dprt[FWDSTEP])

> +{

> +	rte_xmm_t dst;

> +	uint8x16_t bswap_mask = {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9,

> 8,

> +				 15, 14, 13, 12};

> +

> +	/* Byte swap 4 IPV4 addresses. */

> +	dip =

> vreinterpretq_s32_u8(vqtbl1q_u8(vreinterpretq_u8_s32(dip),

> +					      bswap_mask));

> +

This can be done more simply with vrev32q_u8, which avoids the need for
bswap_mask. Also, the TBL instruction has higher latency than the
rev32 instruction on thunderx, thunderx2t99 and cortexa57.

> +	/* if all 4 packets are IPV4. */

> +	if (likely(ipv4_flag)) {

> +		rte_lpm_lookupx4(qconf->ipv4_lookup_struct, dip,

> dst.u32,

> +			portid);

> +		/* get rid of unused upper 16 bit for each dport. */

> +		vst1_s16((int16_t *)dprt, vqmovn_s32(dst.x));

> +	} else {

> +		dst.x = dip;

> +		dprt[0] = lpm_get_dst_port_with_ipv4(qconf, pkt[0],

> +						     dst.u32[0],

> portid);

> +		dprt[1] = lpm_get_dst_port_with_ipv4(qconf, pkt[1],

> +						     dst.u32[1],

> portid);

> +		dprt[2] = lpm_get_dst_port_with_ipv4(qconf, pkt[2],

> +						     dst.u32[2],

> portid);

> +		dprt[3] = lpm_get_dst_port_with_ipv4(qconf, pkt[3],

> +						     dst.u32[3],

> portid);

> +	}

> +}

> +

> +/*

> + * Buffer optimized handling of packets, invoked

> + * from main_loop.

> + */

> +static inline void

> +l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,

> +			uint8_t portid, struct lcore_conf *qconf)

> +{

> +	int32_t j;

> +	uint16_t dst_port[MAX_PKT_BURST];

> +	int32x4_t dip[MAX_PKT_BURST / FWDSTEP];

> +	uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];

> +	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);

> +

> +	for (j = 0; j != k; j += FWDSTEP)

> +		processx4_step1(&pkts_burst[j], &dip[j / FWDSTEP],

> +				&ipv4_flag[j / FWDSTEP]);

> +

> +	for (j = 0; j != k; j += FWDSTEP)

> +		processx4_step2(qconf, dip[j / FWDSTEP],

> +				ipv4_flag[j / FWDSTEP], portid,

> &pkts_burst[j],

> +				&dst_port[j]);

> +

> +	/* Classify last up to 3 packets one by one */

> +	switch (nb_rx % FWDSTEP) {

> +	case 3:

> +		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],

> portid);

> +		j++;

> +		/* fallthrough */

> +	case 2:

> +		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],

> portid);

> +		j++;

> +		/* fallthrough */

> +	case 1:

> +		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],

> portid);

> +		j++;

> +	}

> +

> +	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);

> +}

> +

> +#endif /* __L3FWD_LPM_NEON_H__ */

> diff --git a/examples/l3fwd/l3fwd_neon.h

> b/examples/l3fwd/l3fwd_neon.h

> new file mode 100644

> index 0000000..75c8976

> --- /dev/null

> +++ b/examples/l3fwd/l3fwd_neon.h

> @@ -0,0 +1,259 @@

> +/*-

> + *   BSD LICENSE

> + *

> + *   Copyright(c) 2016 Intel Corporation. All rights reserved.

> + *   Copyright(c) 2017, Linaro Limited

> + *   All rights reserved.

> + *

> + *   Redistribution and use in source and binary forms, with or

> without

> + *   modification, are permitted provided that the following

> conditions

> + *   are met:

> + *

> + *     * Redistributions of source code must retain the above

> copyright

> + *       notice, this list of conditions and the following

> disclaimer.

> + *     * Redistributions in binary form must reproduce the above

> copyright

> + *       notice, this list of conditions and the following

> disclaimer in

> + *       the documentation and/or other materials provided with the

> + *       distribution.

> + *     * Neither the name of Intel Corporation nor the names of its

> + *       contributors may be used to endorse or promote products

> derived

> + *       from this software without specific prior written

> permission.

> + *

> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND

> CONTRIBUTORS

> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT

> NOT

> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND

> FITNESS FOR

> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE

> COPYRIGHT

> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,

> INCIDENTAL,

> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT

> NOT

> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS

> OF USE,

> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

> ON ANY

> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR

> TORT

> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF

> THE USE

> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH

> DAMAGE.

> + */

> +

> +

> +#ifndef _L3FWD_NEON_H_

> +#define _L3FWD_NEON_H_

> +

> +#include "l3fwd.h"

> +#include "l3fwd_common.h"

> +

> +/*

> + * Update source and destination MAC addresses in the ethernet

> header.

> + * Perform RFC1812 checks and updates for IPV4 packets.

> + */

> +static inline void

> +processx4_step3(struct rte_mbuf *pkt[FWDSTEP], uint16_t

> dst_port[FWDSTEP])

> +{

> +	uint32x4_t te[FWDSTEP];

> +	uint32x4_t ve[FWDSTEP];

> +	uint32_t *p[FWDSTEP];

> +

> +	p[0] = rte_pktmbuf_mtod(pkt[0], uint32_t *);

> +	p[1] = rte_pktmbuf_mtod(pkt[1], uint32_t *);

> +	p[2] = rte_pktmbuf_mtod(pkt[2], uint32_t *);

> +	p[3] = rte_pktmbuf_mtod(pkt[3], uint32_t *);

> +

> +	ve[0] = vreinterpretq_u32_s32(val_eth[dst_port[0]]);

> +	te[0] = vld1q_u32(p[0]);

> +

> +	ve[1] = vreinterpretq_u32_s32(val_eth[dst_port[1]]);

> +	te[1] = vld1q_u32(p[1]);

> +

> +	ve[2] = vreinterpretq_u32_s32(val_eth[dst_port[2]]);

> +	te[2] = vld1q_u32(p[2]);

> +

> +	ve[3] = vreinterpretq_u32_s32(val_eth[dst_port[3]]);

> +	te[3] = vld1q_u32(p[3]);

> +

> +	/* Update last 4 bytes */

> +	ve[0] = vsetq_lane_u32(vgetq_lane_u32(te[0], 3), ve[0], 3);

> +	ve[1] = vsetq_lane_u32(vgetq_lane_u32(te[1], 3), ve[1], 3);

> +	ve[2] = vsetq_lane_u32(vgetq_lane_u32(te[2], 3), ve[2], 3);

> +	ve[3] = vsetq_lane_u32(vgetq_lane_u32(te[3], 3), ve[3], 3);

> +

> +	vst1q_u32(p[0], ve[0]);

> +	vst1q_u32(p[1], ve[1]);

> +	vst1q_u32(p[2], ve[2]);

> +	vst1q_u32(p[3], ve[3]);

> +

> +	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[0]

> + 1),

> +		&dst_port[0], pkt[0]->packet_type);

> +	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[1]

> + 1),

> +		&dst_port[1], pkt[1]->packet_type);

> +	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[2]

> + 1),

> +		&dst_port[2], pkt[2]->packet_type);

> +	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[3]

> + 1),

> +		&dst_port[3], pkt[3]->packet_type);

> +}

> +

> +/*

> + * Group consecutive packets with the same destination port in

> bursts of 4.

> + * Suppose we have array of destionation ports:

> + * dst_port[] = {a, b, c, d,, e, ... }

> + * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.

> + * We doing 4 comparisions at once and the result is 4 bit mask.

> + * This mask is used as an index into prebuild array of pnum values.

> + */

> +static inline uint16_t *

> +port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,

> +	     uint16x8_t dp2)

> +{

> +	union {

> +		uint16_t u16[FWDSTEP + 1];

> +		uint64_t u64;

> +	} *pnum = (void *)pn;

> +

> +	int32_t v;

> +	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};

> +

> +	dp1 = vceqq_u16(dp1, dp2);

> +	dp1 = vandq_u16(dp1, mask);

> +	v = vaddvq_u16(dp1);

> +

> +	/* update last port counter. */

> +	lp[0] += gptbl[v].lpv;

> +

> +	/* if dest port value has changed. */

> +	if (v != GRPMSK) {

> +		pnum->u64 = gptbl[v].pnum;

> +		pnum->u16[FWDSTEP] = 1;

> +		lp = pnum->u16 + gptbl[v].idx;

> +	}

> +

> +	return lp;

> +}

> +

> +/**

> + * Process one packet:

> + * Update source and destination MAC addresses in the ethernet

> header.

> + * Perform RFC1812 checks and updates for IPV4 packets.

> + */

> +static inline void

> +process_packet(struct rte_mbuf *pkt, uint16_t *dst_port)

> +{

> +	struct ether_hdr *eth_hdr;

> +	uint32x4_t te, ve;

> +

> +	eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);

> +

> +	te = vld1q_u32((uint32_t *)eth_hdr);

> +	ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);

> +

> +

> +	rfc1812_process((struct ipv4_hdr *)(eth_hdr + 1), dst_port,

> +			pkt->packet_type);

> +

> +	ve = vsetq_lane_u32(vgetq_lane_u32(te, 3), ve, 3);

> +	vst1q_u32((uint32_t *)eth_hdr, ve);

> +}

> +

> +/**

> + * Send packets burst from pkts_burst to the ports in dst_port array

> + */

> +static inline __attribute__((always_inline)) void

> +send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf

> **pkts_burst,

> +		uint16_t dst_port[MAX_PKT_BURST], int nb_rx)

> +{

> +	int32_t k;

> +	int j = 0;

> +	uint16_t dlp;

> +	uint16_t *lp;

> +	uint16_t pnum[MAX_PKT_BURST + 1];

> +

> +	/*

> +	 * Finish packet processing and group consecutive

> +	 * packets with the same destination port.

> +	 */

> +	k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);

> +	if (k != 0) {

> +		uint16x8_t dp1, dp2;

> +

> +		lp = pnum;

> +		lp[0] = 1;

> +

> +		processx4_step3(pkts_burst, dst_port);

> +

> +		/* dp1: <d[0], d[1], d[2], d[3], ... > */

> +		dp1 = vld1q_u16(dst_port);

> +

> +		for (j = FWDSTEP; j != k; j += FWDSTEP) {

> +			processx4_step3(&pkts_burst[j],

> &dst_port[j]);

> +

> +			/*

> +			 * dp2:

> +			 * <d[j-3], d[j-2], d[j-1], d[j], ... >

> +			 */

> +			dp2 = vld1q_u16(&dst_port[j - FWDSTEP + 1]);

> +			lp  = port_groupx4(&pnum[j - FWDSTEP], lp,

> dp1, dp2);

> +

> +			/*

> +			 * dp1:

> +			 * <d[j], d[j+1], d[j+2], d[j+3], ... >

> +			 */

> +			dp1 = vextq_u16(dp1, dp1, FWDSTEP - 1);

> +		}

> +

> +		/*

> +		 * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >

> +		 */

> +		dp2 = vextq_u16(dp1, dp1, 1);

> +		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2,

> 3);

> +		lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1,

> dp2);

> +

> +		/*

> +		 * remove values added by the last repeated

> +		 * dst port.

> +		 */

> +		lp[0]--;

> +		dlp = dst_port[j - 1];

> +	} else {

> +		/* set dlp and lp to the never used values. */

> +		dlp = BAD_PORT - 1;

> +		lp = pnum + MAX_PKT_BURST;

> +	}

> +

> +	/* Process up to last 3 packets one by one. */

> +	switch (nb_rx % FWDSTEP) {

> +	case 3:

> +		process_packet(pkts_burst[j], dst_port + j);

> +		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);

> +		j++;

> +		/* fallthrough */

> +	case 2:

> +		process_packet(pkts_burst[j], dst_port + j);

> +		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);

> +		j++;

> +		/* fallthrough */

> +	case 1:

> +		process_packet(pkts_burst[j], dst_port + j);

> +		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);

> +		j++;

> +	}

> +

> +	/*

> +	 * Send packets out, through destination port.

> +	 * Consecutive packets with the same destination port

> +	 * are already grouped together.

> +	 * If destination port for the packet equals BAD_PORT,

> +	 * then free the packet without sending it out.

> +	 */

> +	for (j = 0; j < nb_rx; j += k) {

> +

> +		int32_t m;

> +		uint16_t pn;

> +

> +		pn = dst_port[j];

> +		k = pnum[j];

> +

> +		if (likely(pn != BAD_PORT))

> +			send_packetsx4(qconf, pn, pkts_burst + j,

> k);

> +		else

> +			for (m = j; m != j + k; m++)

> +				rte_pktmbuf_free(pkts_burst[m]);

> +

> +	}

> +}

> +

> +#endif /* _L3FWD_NEON_H_ */


Thanks and Regards
Ashwin
Sekhar, Ashwin May 2, 2017, 11:47 a.m. UTC | #2
Hi Jianbo,

I tested your neon changes on thunderx. I am seeing a performance
regression of ~10% for the LPM case and ~20% for the EM case with your
changes. Did you see an improvement on any arm64 platform with these
changes? If yes, how much was the improvement?

FYI, I had also tried vectorizing the l3fwd app with neon. Here are a few
optimizations that helped in my case.

* Packet data prefetch is missing in the x86 sse version compared to
the scalar version (l3fwd_lpm_send_packets vs
l3fwd_lpm_no_opt_send_packets). I couldn't understand why this was not
done in x86, but adding the prefetch improved performance on
thunderx.

* Offsets to some packet elements like eth_hdr, ip header, packet type
etc. are recalculated in different functions. Calculating them once,
caching them and passing them directly to the different functions
improved performance.

* There are 3 different loops in l3fwd_lpm_send_packets where we
iterate over the packets: one each for processx4_step1 and
processx4_step2, and one in send_packets_multi. Unifying these loops
also helped.

Thanks and Regards
Ashwin
Jianbo Liu May 3, 2017, 5:24 a.m. UTC | #3
Hi Ashwin,

On 2 May 2017 at 19:47, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com> wrote:
> Hi Jianbo,

>

> I tested your neon changes on thunderx. I am seeing a performance

> regression of ~10% for LPM case and ~20% for EM case with your changes.

> Did you see improvement on any arm64 platform with these changes. If

> yes, how much was the improvement?


Thanks for reviewing and testing.
For some reason, I have not done much performance testing.
I'll send a new version later after tuning the performance.

Thanks!
Jianbo

>

> FYI, I had also tried vectorizing the l3fwd app with neon. Few of the

> optimizations that I can suggest that helped in my case.

>

> * Packet data prefetch is missing in the x86 sse version compared to

> the scalar version (l3fwd_lpm_send_packets vs

> l3fwd_lpm_no_opt_send_packets) . I couldn't understand why this was not

> done in x86. But adding the prefetch was improving performance for

> thunderx.

>

> * Offsets to some packet elements like eth_hdr, ip header, packet type

> etc. are recalculated in different functions. Calculating them once,

> caching them and passing them directly to different functions was

> improving performance.

>

> * There are 3 different loops in l3fwd_lpm_send_packets where we

> iterate over the packets. One each for processx4_step1 and

> processx4_step2 and one in send_packets_multi. Unifying these loops

> were also helping.

>

> Thanks and Regards

> Ashwin

>
Jianbo Liu May 4, 2017, 8:42 a.m. UTC | #4
Hi Ashwin,

On 3 May 2017 at 13:24, Jianbo Liu <jianbo.liu@linaro.org> wrote:
> Hi Ashwin,

>

> On 2 May 2017 at 19:47, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com> wrote:

>> Hi Jianbo,

>>

>> I tested your neon changes on thunderx. I am seeing a performance

>> regression of ~10% for LPM case and ~20% for EM case with your changes.

>> Did you see improvement on any arm64 platform with these changes. If

>> yes, how much was the improvement?

>

> Thanks for your reviewing and testing.

> For some reason, I have not done much with the performance testing.

> I'll send a new version later after tuning the performance.

>


Can you tell me how you tested?
My testing shows that the EM case is much better, while LPM is almost the
same as before.

Thanks!
Jianbo
Sekhar, Ashwin May 5, 2017, 4:24 a.m. UTC | #5
On Thu, 2017-05-04 at 16:42 +0800, Jianbo Liu wrote:
> Hi Ashwin,

> 

> On 3 May 2017 at 13:24, Jianbo Liu <jianbo.liu@linaro.org> wrote:

> > 

> > Hi Ashwin,

> > 

> > On 2 May 2017 at 19:47, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com>

> > wrote:

> > > 

> > > Hi Jianbo,

> > > 

> > > I tested your neon changes on thunderx. I am seeing a performance

> > > regression of ~10% for LPM case and ~20% for EM case with your

> > > changes.

> > > Did you see improvement on any arm64 platform with these changes.

> > > If

> > > yes, how much was the improvement?

> > Thanks for your reviewing and testing.

> > For some reason, I have not done much with the performance testing.

> > I'll send a new version later after tuning the performance.

> > 

> Can you tell me how did you test?

Built with following commands.
make config T=arm64-thunderx-linuxapp-gcc
make -j32

Tested LPM with
sudo ./examples/l3fwd/build/l3fwd -l 9,10  --master-lcore 9  -- -p 0x1 --config="(0,0,10)"

Tested EM with
sudo ./examples/l3fwd/build/l3fwd -l 9,10  --master-lcore 9  -- -p 0x1 --config="(0,0,10)" -E

> My testing shows that EM case is much better, while LPM is almost the

> same as before.

Could you please tell me which arm64 processor/platform you tested on?
Also, how much was the percentage increase in performance for EM?

> Thanks!

> Jianbo
Jianbo Liu May 5, 2017, 5:43 a.m. UTC | #6
On 5 May 2017 at 12:24, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com> wrote:
> On Thu, 2017-05-04 at 16:42 +0800, Jianbo Liu wrote:

>> Hi Ashwin,

>>

>> On 3 May 2017 at 13:24, Jianbo Liu <jianbo.liu@linaro.org> wrote:

>> >

>> > Hi Ashwin,

>> >

>> > On 2 May 2017 at 19:47, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com>

>> > wrote:

>> > >

>> > > Hi Jianbo,

>> > >

>> > > I tested your neon changes on thunderx. I am seeing a performance

>> > > regression of ~10% for LPM case and ~20% for EM case with your

>> > > changes.

>> > > Did you see improvement on any arm64 platform with these changes.

>> > > If

>> > > yes, how much was the improvement?

>> > Thanks for your reviewing and testing.

>> > For some reason, I have not done much with the performance testing.

>> > I'll send a new version later after tuning the performance.

>> >

>> Can you tell me how did you test?

> Built with following commands.

> make config T=arm64-thunderx-linuxapp-gcc

> make -j32

>

> Tested LPM with

> sudo ./examples/l3fwd/build/l3fwd -l 9,10  --master-lcore 9  -- -p 0x1 --config="(0,0,10)"

>

> Tested EM with

> sudo ./examples/l3fwd/build/l3fwd -l 9,10  --master-lcore 9  -- -p 0x1 --config="(0,0,10)" -E

>


Only one port? What's the network topology, and lpm/em rules? How did
you stress traffic...?

>> My testing shows that EM case is much better, while LPM is almost the

>> same as before.

> Could you please tell on which arm64 processor/platform you tested.

> Also how much was the percentage increase in performance for EM ?

>


I'm sorry, I can't tell you which arm64 platform I tested on. But I
can get a ThunderX and replicate your testing environment if you can
tell me more...

Thanks!
Jianbo
Sekhar, Ashwin May 9, 2017, 8:10 a.m. UTC | #7
On Fri, 2017-05-05 at 13:43 +0800, Jianbo Liu wrote:
> On 5 May 2017 at 12:24, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com>

> wrote:

> > 

> > On Thu, 2017-05-04 at 16:42 +0800, Jianbo Liu wrote:

> > > 

> > > Hi Ashwin,

> > > 

> > > On 3 May 2017 at 13:24, Jianbo Liu <jianbo.liu@linaro.org> wrote:

> > > > 

> > > > 

> > > > Hi Ashwin,

> > > > 

> > > > On 2 May 2017 at 19:47, Sekhar, Ashwin <Ashwin.Sekhar@cavium.co

> > > > m>

> > > > wrote:

> > > > > 

> > > > > 

> > > > > Hi Jianbo,

> > > > > 

> > > > > I tested your neon changes on thunderx. I am seeing a

> > > > > performance

> > > > > regression of ~10% for LPM case and ~20% for EM case with

> > > > > your

> > > > > changes.

> > > > > Did you see improvement on any arm64 platform with these

> > > > > changes.

> > > > > If

> > > > > yes, how much was the improvement?

> > > > Thanks for your reviewing and testing.

> > > > For some reason, I have not done much with the performance

> > > > testing.

> > > > I'll send a new version later after tuning the performance.

> > > > 

> > > Can you tell me how did you test?

> > Built with following commands.

> > make config T=arm64-thunderx-linuxapp-gcc

> > make -j32

> > 

> > Tested LPM with

> > sudo ./examples/l3fwd/build/l3fwd -l 9,10  --master-lcore 9  -- -p

> > 0x1 --config="(0,0,10)"

> > 

> > Tested EM with

> > sudo ./examples/l3fwd/build/l3fwd -l 9,10  --master-lcore 9  -- -p

> > 0x1 --config="(0,0,10)" -E

> > 

> Only one port? What's the network topology, and lpm/em rules? How did

> you stress traffic...?

port - 1 topology: DUT connected back to back to traffic generator.

We are using the default rules in the C code. flow generation is:
src.ip.min 192.168.18.1
src.ip.max 192.168.18.90
src.ip.inc 1

Also, please let us know the topology that you are using.
> 

> > 

> > > 

> > > My testing shows that EM case is much better, while LPM is almost

> > > the

> > > same as before.

> > Could you please tell on which arm64 processor/platform you tested.

> > Also how much was the percentage increase in performance for EM ?

> > 

> I'm sorry I can't tell you what's arm64 platform I tested on. But I

> can get a ThunderX, and replicate your testing environment if you can

> tell me more...

Thanks.
> 

> Thanks!

> Jianbo
Jianbo Liu May 10, 2017, 2:39 a.m. UTC | #8
Hi Ashwin,

On 9 May 2017 at 16:10, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com> wrote:
> On Fri, 2017-05-05 at 13:43 +0800, Jianbo Liu wrote:

>> On 5 May 2017 at 12:24, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com>

>> wrote:

>> >

>> > On Thu, 2017-05-04 at 16:42 +0800, Jianbo Liu wrote:

>> > >

>> > > Hi Ashwin,

>> > >

>> > > On 3 May 2017 at 13:24, Jianbo Liu <jianbo.liu@linaro.org> wrote:

>> > > >

>> > > >

>> > > > Hi Ashwin,

>> > > >

>> > > > On 2 May 2017 at 19:47, Sekhar, Ashwin <Ashwin.Sekhar@cavium.co

>> > > > m>

>> > > > wrote:

>> > > > >

>> > > > >

>> > > > > Hi Jianbo,

>> > > > >

>> > > > > I tested your neon changes on thunderx. I am seeing a

>> > > > > performance

>> > > > > regression of ~10% for LPM case and ~20% for EM case with

>> > > > > your

>> > > > > changes.

>> > > > > Did you see improvement on any arm64 platform with these

>> > > > > changes.

>> > > > > If

>> > > > > yes, how much was the improvement?

>> > > > Thanks for your reviewing and testing.

>> > > > For some reason, I have not done much with the performance

>> > > > testing.

>> > > > I'll send a new version later after tuning the performance.

>> > > >

>> > > Can you tell me how did you test?

>> > Built with following commands.

>> > make config T=arm64-thunderx-linuxapp-gcc

>> > make -j32

>> >

>> > Tested LPM with

>> > sudo ./examples/l3fwd/build/l3fwd -l 9,10  --master-lcore 9  -- -p

>> > 0x1 --config="(0,0,10)"

>> >

>> > Tested EM with

>> > sudo ./examples/l3fwd/build/l3fwd -l 9,10  --master-lcore 9  -- -p

>> > 0x1 --config="(0,0,10)" -E

>> >

>> Only one port? What's the network topology, and lpm/em rules? How did

>> you stress traffic...?

> port - 1 topology: DUT connected back to back to traffic generator.

>

> We are using the default rules in the C code. flow generation is:

> src.ip.min 192.168.18.1

> src.ip.max 192.168.18.90

> src.ip.inc 1

>

> Also, Please let us know the topology that you are using.


I used two ports with one rule to forward packets from one to the other.
Sent v2, please try this new version.

Thanks!
Jianbo
diff mbox series

Patch

diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h
index 011ba14..c45589a 100644
--- a/examples/l3fwd/l3fwd.h
+++ b/examples/l3fwd/l3fwd.h
@@ -40,10 +40,6 @@ 
 
 #define RTE_LOGTYPE_L3FWD RTE_LOGTYPE_USER1
 
-#if !defined(NO_HASH_MULTI_LOOKUP) && defined(RTE_MACHINE_CPUFLAG_NEON)
-#define NO_HASH_MULTI_LOOKUP 1
-#endif
-
 #define MAX_PKT_BURST     32
 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
 
diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index cccf797..ac1e2e0 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -328,7 +328,7 @@  struct ipv6_l3fwd_em_route {
 	return (uint8_t)((ret < 0) ? portid : ipv6_l3fwd_out_if[ret]);
 }
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
 #if defined(NO_HASH_MULTI_LOOKUP)
 #include "l3fwd_em_single.h"
 #else
@@ -709,7 +709,7 @@  struct ipv6_l3fwd_em_route {
 			if (nb_rx == 0)
 				continue;
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
 			l3fwd_em_send_packets(nb_rx, pkts_burst,
 							portid, qconf);
 #else
diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index 636dea4..3329c1a 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -35,8 +35,13 @@ 
 #ifndef __L3FWD_EM_HLM_H__
 #define __L3FWD_EM_HLM_H__
 
+#if defined(__SSE4_1__)
 #include "l3fwd_sse.h"
 #include "l3fwd_em_hlm_sse.h"
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
+#include "l3fwd_neon.h"
+#include "l3fwd_em_hlm_neon.h"
+#endif
 
 static inline __attribute__((always_inline)) void
 em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
diff --git a/examples/l3fwd/l3fwd_em_hlm_neon.h b/examples/l3fwd/l3fwd_em_hlm_neon.h
new file mode 100644
index 0000000..dae1acf
--- /dev/null
+++ b/examples/l3fwd/l3fwd_em_hlm_neon.h
@@ -0,0 +1,74 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __L3FWD_EM_HLM_NEON_H__
+#define __L3FWD_EM_HLM_NEON_H__
+
+#include <arm_neon.h>
+
+/*
+ * Build the IPv4 exact-match key for one packet.
+ * Loads 16 bytes starting at the time_to_live field of the IPv4 header
+ * that follows the Ethernet header, ANDs them with mask0 and stores the
+ * result in key->xmm.
+ */
+static inline void
+get_ipv4_5tuple(struct rte_mbuf *m0, int32x4_t mask0,
+		union ipv4_5tuple_host *key)
+{
+	const size_t ttl_off = sizeof(struct ether_hdr) +
+			       offsetof(struct ipv4_hdr, time_to_live);
+	const int32_t *src = rte_pktmbuf_mtod_offset(m0, int32_t *, ttl_off);
+
+	key->xmm = vandq_s32(vld1q_s32(src), mask0);
+}
+
+/*
+ * Build the IPv6 exact-match key for one packet.
+ * Reads three consecutive 16-byte vectors starting at the payload_len
+ * field of the IPv6 header that follows the Ethernet header. The first
+ * vector is ANDed with mask0, the second is copied unchanged and the
+ * third is ANDed with mask1; the results go to key->xmm[0..2].
+ */
+static inline void
+get_ipv6_5tuple(struct rte_mbuf *m0, int32x4_t mask0,
+		int32x4_t mask1, union ipv6_5tuple_host *key)
+{
+	/* Byte offset of the first key byte inside the packet data. */
+	const size_t base = sizeof(struct ether_hdr) +
+			    offsetof(struct ipv6_hdr, payload_len);
+
+	/*
+	 * rte_pktmbuf_mtod_offset() takes a byte offset, so back-to-back
+	 * vectors are sizeof(int32x4_t) bytes apart. Smaller steps would
+	 * make the three loads overlap.
+	 */
+	int32x4_t tmpdata0 = vld1q_s32(
+			rte_pktmbuf_mtod_offset(m0, int32_t *, base));
+
+	int32x4_t tmpdata1 = vld1q_s32(
+			rte_pktmbuf_mtod_offset(m0, int32_t *,
+				base + sizeof(int32x4_t)));
+
+	int32x4_t tmpdata2 = vld1q_s32(
+			rte_pktmbuf_mtod_offset(m0, int32_t *,
+				base + 2 * sizeof(int32x4_t)));
+
+	key->xmm[0] = vandq_s32(tmpdata0, mask0);
+	key->xmm[1] = tmpdata1;
+	key->xmm[2] = vandq_s32(tmpdata2, mask1);
+}
+#endif /* __L3FWD_EM_HLM_NEON_H__ */
diff --git a/examples/l3fwd/l3fwd_em_single.h b/examples/l3fwd/l3fwd_em_single.h
index c0a9725..8604571 100644
--- a/examples/l3fwd/l3fwd_em_single.h
+++ b/examples/l3fwd/l3fwd_em_single.h
@@ -43,7 +43,11 @@ 
  * compilation time.
  */
 
+#if defined(__SSE4_1__)
 #include "l3fwd_sse.h"
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
+#include "l3fwd_neon.h"
+#endif
 
 static inline __attribute__((always_inline)) uint16_t
 em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index fc554fc..ddef250 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -189,6 +189,8 @@  static inline __attribute__((always_inline)) uint16_t
 
 #if defined(__SSE4_1__)
 #include "l3fwd_lpm_sse.h"
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
+#include "l3fwd_lpm_neon.h"
 #else
 #include "l3fwd_lpm.h"
 #endif
@@ -261,7 +263,7 @@  static inline __attribute__((always_inline)) uint16_t
 			if (nb_rx == 0)
 				continue;
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
 			l3fwd_lpm_send_packets(nb_rx, pkts_burst,
 						portid, qconf);
 #else
diff --git a/examples/l3fwd/l3fwd_lpm_neon.h b/examples/l3fwd/l3fwd_lpm_neon.h
new file mode 100644
index 0000000..772e54b
--- /dev/null
+++ b/examples/l3fwd/l3fwd_lpm_neon.h
@@ -0,0 +1,157 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __L3FWD_LPM_NEON_H__
+#define __L3FWD_LPM_NEON_H__
+
+#include <arm_neon.h>
+
+#include "l3fwd_neon.h"
+
+/*
+ * Collect the IPv4 destination address of each of the 4 mbufs into one
+ * vector (*dip) and AND all their packet types with RTE_PTYPE_L3_IPV4
+ * into *ipv4_flag. The addresses are copied as found in the header,
+ * without byte swapping.
+ */
+static inline void
+processx4_step1(struct rte_mbuf *pkt[FWDSTEP],
+		int32x4_t *dip,
+		uint32_t *ipv4_flag)
+{
+	int32_t addr[FWDSTEP];
+	uint32_t all_v4 = RTE_PTYPE_L3_IPV4;
+	int i;
+
+	for (i = 0; i < FWDSTEP; i++) {
+		const struct ether_hdr *eth =
+			rte_pktmbuf_mtod(pkt[i], const struct ether_hdr *);
+		const struct ipv4_hdr *ip = (const struct ipv4_hdr *)(eth + 1);
+
+		addr[i] = ip->dst_addr;
+		all_v4 &= pkt[i]->packet_type;
+	}
+
+	*ipv4_flag = all_v4;
+	*dip = vld1q_s32(addr);
+}
+
+/*
+ * Lookup into LPM for destination port.
+ * If lookup fails, use incoming port (portid) as destination port.
+ *
+ * dip holds 4 destination addresses as read from the packets; they are
+ * byte swapped here before being used as lookup keys. When ipv4_flag is
+ * non-zero the 4 addresses are resolved with one rte_lpm_lookupx4() call
+ * and the 32-bit results are narrowed to 16 bits into dprt[]. Otherwise
+ * each packet is resolved on its own through
+ * lpm_get_dst_port_with_ipv4().
+ */
+static inline void
+processx4_step2(const struct lcore_conf *qconf,
+		int32x4_t dip,
+		uint32_t ipv4_flag,
+		uint8_t portid,
+		struct rte_mbuf *pkt[FWDSTEP],
+		uint16_t dprt[FWDSTEP])
+{
+	rte_xmm_t dst;
+
+	/*
+	 * Byte swap 4 IPV4 addresses. vrev32q_u8 reverses the bytes inside
+	 * every 32-bit lane, so no shuffle table has to be loaded.
+	 */
+	dip = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(dip)));
+
+	/* if all 4 packets are IPV4. */
+	if (likely(ipv4_flag)) {
+		rte_lpm_lookupx4(qconf->ipv4_lookup_struct, dip, dst.u32,
+			portid);
+		/* get rid of unused upper 16 bit for each dport. */
+		vst1_s16((int16_t *)dprt, vqmovn_s32(dst.x));
+	} else {
+		dst.x = dip;
+		dprt[0] = lpm_get_dst_port_with_ipv4(qconf, pkt[0],
+						     dst.u32[0], portid);
+		dprt[1] = lpm_get_dst_port_with_ipv4(qconf, pkt[1],
+						     dst.u32[1], portid);
+		dprt[2] = lpm_get_dst_port_with_ipv4(qconf, pkt[2],
+						     dst.u32[2], portid);
+		dprt[3] = lpm_get_dst_port_with_ipv4(qconf, pkt[3],
+						     dst.u32[3], portid);
+	}
+}
+
+/*
+ * Burst handler called from main_loop: resolve a destination port for
+ * every received packet, then hand the whole burst to
+ * send_packets_multi(). Packets are resolved in groups of FWDSTEP; the
+ * remaining (at most FWDSTEP - 1) packets are resolved one at a time.
+ */
+static inline void
+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			uint8_t portid, struct lcore_conf *qconf)
+{
+	uint16_t dst_port[MAX_PKT_BURST];
+	int32x4_t dip[MAX_PKT_BURST / FWDSTEP];
+	uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];
+	const int32_t nb_vec = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+	int32_t i, grp;
+
+	/* Pass 1: gather addresses and packet types for every full group. */
+	for (i = 0, grp = 0; i != nb_vec; i += FWDSTEP, grp++)
+		processx4_step1(&pkts_burst[i], &dip[grp], &ipv4_flag[grp]);
+
+	/* Pass 2: look up the destination ports of every full group. */
+	for (i = 0, grp = 0; i != nb_vec; i += FWDSTEP, grp++)
+		processx4_step2(qconf, dip[grp], ipv4_flag[grp], portid,
+				&pkts_burst[i], &dst_port[i]);
+
+	/* Classify last up to 3 packets one by one */
+	for (i = nb_vec; i < nb_rx; i++)
+		dst_port[i] = lpm_get_dst_port(qconf, pkts_burst[i], portid);
+
+	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
+}
+
+#endif /* __L3FWD_LPM_NEON_H__ */
diff --git a/examples/l3fwd/l3fwd_neon.h b/examples/l3fwd/l3fwd_neon.h
new file mode 100644
index 0000000..75c8976
--- /dev/null
+++ b/examples/l3fwd/l3fwd_neon.h
@@ -0,0 +1,259 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _L3FWD_NEON_H_
+#define _L3FWD_NEON_H_
+
+#include "l3fwd.h"
+#include "l3fwd_common.h"
+
+/*
+ * Rewrite the Ethernet header of 4 packets and run the RFC1812 step.
+ * For each packet the first 16 bytes are replaced by
+ * val_eth[dst_port[i]], except that the last 4 of those bytes keep the
+ * value they had in the packet. rfc1812_process() is then called on the
+ * header that follows the Ethernet header and may update dst_port[i].
+ */
+static inline void
+processx4_step3(struct rte_mbuf *pkt[FWDSTEP], uint16_t dst_port[FWDSTEP])
+{
+	uint32_t *hdr[FWDSTEP];
+	uint32x4_t cur[FWDSTEP];
+	uint32x4_t mac[FWDSTEP];
+	int i;
+
+	for (i = 0; i < FWDSTEP; i++)
+		hdr[i] = rte_pktmbuf_mtod(pkt[i], uint32_t *);
+
+	/* Fetch the per-port template and the current header bytes. */
+	for (i = 0; i < FWDSTEP; i++) {
+		mac[i] = vreinterpretq_u32_s32(val_eth[dst_port[i]]);
+		cur[i] = vld1q_u32(hdr[i]);
+	}
+
+	/* Update last 4 bytes */
+	mac[0] = vsetq_lane_u32(vgetq_lane_u32(cur[0], 3), mac[0], 3);
+	mac[1] = vsetq_lane_u32(vgetq_lane_u32(cur[1], 3), mac[1], 3);
+	mac[2] = vsetq_lane_u32(vgetq_lane_u32(cur[2], 3), mac[2], 3);
+	mac[3] = vsetq_lane_u32(vgetq_lane_u32(cur[3], 3), mac[3], 3);
+
+	for (i = 0; i < FWDSTEP; i++)
+		vst1q_u32(hdr[i], mac[i]);
+
+	for (i = 0; i < FWDSTEP; i++) {
+		struct ether_hdr *eth = (struct ether_hdr *)hdr[i];
+
+		rfc1812_process((struct ipv4_hdr *)(eth + 1),
+			&dst_port[i], pkt[i]->packet_type);
+	}
+}
+
+/*
+ * Group consecutive packets with the same destination port in bursts of 4.
+ * Suppose we have an array of destination ports:
+ * dst_port[] = {a, b, c, d, e, ... }
+ * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
+ * We do 4 comparisons at once and the result is a 4-bit mask.
+ * This mask is used as an index into a prebuilt array of pnum values.
+ *
+ * pn:  per-packet group counters for the 4 packets being examined, plus
+ *      one extra slot (pn[FWDSTEP]).
+ * lp:  pointer to the counter of the group that is currently open.
+ * dp1: destination ports of the 4 packets (only lanes 0-3 are used).
+ * dp2: destination ports shifted by one packet (only lanes 0-3 are used).
+ *
+ * Returns the pointer to the counter of the group that is open after
+ * these 4 packets; it equals lp when the mask is GRPMSK.
+ */
+static inline uint16_t *
+port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
+	     uint16x8_t dp2)
+{
+	/* View the first four 16-bit counters of pn as a single 64-bit word. */
+	union {
+		uint16_t u16[FWDSTEP + 1];
+		uint64_t u64;
+	} *pnum = (void *)pn;
+
+	int32_t v;
+	/* One bit per compared lane; the upper 4 lanes are ignored. */
+	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
+
+	/* Equal lanes become all-ones, are reduced to their bit, then summed. */
+	dp1 = vceqq_u16(dp1, dp2);
+	dp1 = vandq_u16(dp1, mask);
+	v = vaddvq_u16(dp1);
+
+	/* update last port counter. */
+	lp[0] += gptbl[v].lpv;
+
+	/* if dest port value has changed. */
+	if (v != GRPMSK) {
+		/* Write the 4 table-provided counters with one 64-bit store. */
+		pnum->u64 = gptbl[v].pnum;
+		pnum->u16[FWDSTEP] = 1;
+		lp = pnum->u16 + gptbl[v].idx;
+	}
+
+	return lp;
+}
+
+/**
+ * Handle a single packet:
+ * replace the first 16 bytes of its Ethernet header with
+ * val_eth[*dst_port] while keeping the last 4 of those bytes as they
+ * were in the packet, and run rfc1812_process() on the header that
+ * follows the Ethernet header (which may update *dst_port).
+ */
+static inline void
+process_packet(struct rte_mbuf *pkt, uint16_t *dst_port)
+{
+	struct ether_hdr *eth = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+	uint32_t *hdr = (uint32_t *)eth;
+	const uint32x4_t cur = vld1q_u32(hdr);
+	uint32x4_t mac = vreinterpretq_u32_s32(val_eth[*dst_port]);
+
+	rfc1812_process((struct ipv4_hdr *)(eth + 1), dst_port,
+			pkt->packet_type);
+
+	/* Carry over the packet's own bytes 12..15 into the new header. */
+	mac = vsetq_lane_u32(vgetq_lane_u32(cur, 3), mac, 3);
+	vst1q_u32(hdr, mac);
+}
+
+/**
+ * Send packets burst from pkts_burst to the ports in dst_port array.
+ *
+ * The Ethernet headers are rewritten and the RFC1812 step is run for
+ * every packet (4 at a time, then the remaining ones singly). While
+ * doing so, runs of consecutive packets with the same destination port
+ * are counted in pnum[], so that each run is sent with one
+ * send_packetsx4() call. Runs whose port is BAD_PORT are freed instead.
+ *
+ * @param qconf       per-lcore configuration passed to send_packetsx4().
+ * @param pkts_burst  the nb_rx received packets.
+ * @param dst_port    destination port of each packet; entries may be
+ *                    updated by the RFC1812 step.
+ * @param nb_rx       number of packets in pkts_burst.
+ */
+static inline __attribute__((always_inline)) void
+send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
+		uint16_t dst_port[MAX_PKT_BURST], int nb_rx)
+{
+	int32_t k;
+	int j = 0;
+	uint16_t dlp;
+	uint16_t *lp;
+	uint16_t pnum[MAX_PKT_BURST + 1];
+
+	/*
+	 * Finish packet processing and group consecutive
+	 * packets with the same destination port.
+	 */
+	k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+	if (k != 0) {
+		uint16x8_t dp1, dp2;
+
+		lp = pnum;
+		lp[0] = 1;
+
+		processx4_step3(pkts_burst, dst_port);
+
+		/* dp1: <d[0], d[1], d[2], d[3], ... > */
+		dp1 = vld1q_u16(dst_port);
+
+		for (j = FWDSTEP; j != k; j += FWDSTEP) {
+			processx4_step3(&pkts_burst[j], &dst_port[j]);
+
+			/*
+			 * dp2:
+			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
+			 *
+			 * NOTE(review): this is an 8-lane load, so it reads
+			 * up to d[j+4]; for the last group of a full
+			 * MAX_PKT_BURST burst that looks like one element
+			 * past dst_port[] - confirm against the callers.
+			 */
+			dp2 = vld1q_u16(&dst_port[j - FWDSTEP + 1]);
+			lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
+
+			/*
+			 * dp1:
+			 * <d[j], d[j+1], d[j+2], d[j+3], ... >
+			 *
+			 * The new dp1 must be taken from dp2, starting at
+			 * its lane FWDSTEP - 1 (d[j]). dp2 was loaded after
+			 * processx4_step3() for this group, so it holds the
+			 * final port values. Rotating the old dp1 instead
+			 * would start at d[j-1] and use values read before
+			 * that step.
+			 */
+			dp1 = vextq_u16(dp2, dp1, FWDSTEP - 1);
+		}
+
+		/*
+		 * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
+		 */
+		dp2 = vextq_u16(dp1, dp1, 1);
+		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
+		lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
+
+		/*
+		 * remove values added by the last repeated
+		 * dst port.
+		 */
+		lp[0]--;
+		dlp = dst_port[j - 1];
+	} else {
+		/* set dlp and lp to the never used values. */
+		dlp = BAD_PORT - 1;
+		lp = pnum + MAX_PKT_BURST;
+	}
+
+	/* Process up to last 3 packets one by one. */
+	switch (nb_rx % FWDSTEP) {
+	case 3:
+		process_packet(pkts_burst[j], dst_port + j);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+		j++;
+		/* fallthrough */
+	case 2:
+		process_packet(pkts_burst[j], dst_port + j);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+		j++;
+		/* fallthrough */
+	case 1:
+		process_packet(pkts_burst[j], dst_port + j);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+		j++;
+	}
+
+	/*
+	 * Send packets out, through destination port.
+	 * Consecutive packets with the same destination port
+	 * are already grouped together.
+	 * If destination port for the packet equals BAD_PORT,
+	 * then free the packet without sending it out.
+	 * k is reused here as the length of the current group.
+	 */
+	for (j = 0; j < nb_rx; j += k) {
+
+		int32_t m;
+		uint16_t pn;
+
+		pn = dst_port[j];
+		k = pnum[j];
+
+		if (likely(pn != BAD_PORT))
+			send_packetsx4(qconf, pn, pkts_burst + j, k);
+		else
+			for (m = j; m != j + k; m++)
+				rte_pktmbuf_free(pkts_burst[m]);
+
+	}
+}
+
+#endif /* _L3FWD_NEON_H_ */