diff mbox series

[PATCHv2,bpf-next] samples/bpf: add xdp program on egress for xdp_redirect_map

Message ID 20201126084325.477470-1-liuhangbin@gmail.com
State New
Headers show
Series [PATCHv2,bpf-next] samples/bpf: add xdp program on egress for xdp_redirect_map | expand

Commit Message

Hangbin Liu Nov. 26, 2020, 8:43 a.m. UTC
Current sample test xdp_redirect_map only count pkts on ingress. But we
can't know whether the pkts are redirected or dropped. So add a counter
on egress interface so we could know how many pkts are redirect in fact.

sample result:

$ ./xdp_redirect_map -X veth1 veth2
input: 5 output: 6
libbpf: elf: skipping unrecognized data section(9) .rodata.str1.16
libbpf: elf: skipping unrecognized data section(23) .eh_frame
libbpf: elf: skipping relo section(24) .rel.eh_frame for section(23) .eh_frame
in ifindex 5:          1 pkt/s, out ifindex 6:          1 pkt/s
in ifindex 5:          1 pkt/s, out ifindex 6:          1 pkt/s
in ifindex 5:          0 pkt/s, out ifindex 6:          0 pkt/s
in ifindex 5:         68 pkt/s, out ifindex 6:         68 pkt/s
in ifindex 5:         91 pkt/s, out ifindex 6:         91 pkt/s
in ifindex 5:         91 pkt/s, out ifindex 6:         91 pkt/s
in ifindex 5:         66 pkt/s, out ifindex 6:         66 pkt/s

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
---
v2:
a) use pkt counter instead of IP ttl modification on egress program
b) make the egress program selectable by option -X

---
 samples/bpf/xdp_redirect_map_kern.c |  26 +++--
 samples/bpf/xdp_redirect_map_user.c | 142 ++++++++++++++++++----------
 2 files changed, 113 insertions(+), 55 deletions(-)

Comments

Yonghong Song Nov. 27, 2020, 6:31 a.m. UTC | #1
On 11/26/20 12:43 AM, Hangbin Liu wrote:
> Current sample test xdp_redirect_map only count pkts on ingress. But we

> can't know whether the pkts are redirected or dropped. So add a counter

> on egress interface so we could know how many pkts are redirect in fact.

> 

> sample result:

> 

> $ ./xdp_redirect_map -X veth1 veth2

> input: 5 output: 6

> libbpf: elf: skipping unrecognized data section(9) .rodata.str1.16

> libbpf: elf: skipping unrecognized data section(23) .eh_frame

> libbpf: elf: skipping relo section(24) .rel.eh_frame for section(23) .eh_frame

> in ifindex 5:          1 pkt/s, out ifindex 6:          1 pkt/s

> in ifindex 5:          1 pkt/s, out ifindex 6:          1 pkt/s

> in ifindex 5:          0 pkt/s, out ifindex 6:          0 pkt/s

> in ifindex 5:         68 pkt/s, out ifindex 6:         68 pkt/s

> in ifindex 5:         91 pkt/s, out ifindex 6:         91 pkt/s

> in ifindex 5:         91 pkt/s, out ifindex 6:         91 pkt/s

> in ifindex 5:         66 pkt/s, out ifindex 6:         66 pkt/s

> 

> Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>

> ---

> v2:

> a) use pkt counter instead of IP ttl modification on egress program

> b) make the egress program selectable by option -X

> 

> ---

>   samples/bpf/xdp_redirect_map_kern.c |  26 +++--

>   samples/bpf/xdp_redirect_map_user.c | 142 ++++++++++++++++++----------

>   2 files changed, 113 insertions(+), 55 deletions(-)

> 

> diff --git a/samples/bpf/xdp_redirect_map_kern.c b/samples/bpf/xdp_redirect_map_kern.c

> index 6489352ab7a4..fd6704a4f7e2 100644

> --- a/samples/bpf/xdp_redirect_map_kern.c

> +++ b/samples/bpf/xdp_redirect_map_kern.c

> @@ -22,19 +22,19 @@

>   struct {

>   	__uint(type, BPF_MAP_TYPE_DEVMAP);

>   	__uint(key_size, sizeof(int));

> -	__uint(value_size, sizeof(int));

> +	__uint(value_size, sizeof(struct bpf_devmap_val));

>   	__uint(max_entries, 100);

>   } tx_port SEC(".maps");

>   

> -/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success

> - * feedback.  Redirect TX errors can be caught via a tracepoint.

> +/* Count RX/TX packets, use key 0 for rx pkt count, key 1 for tx

> + * pkt count.

>    */

>   struct {

>   	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);

>   	__type(key, u32);

>   	__type(value, long);

> -	__uint(max_entries, 1);

> -} rxcnt SEC(".maps");

> +	__uint(max_entries, 2);

> +} pktcnt SEC(".maps");

>   

>   static void swap_src_dst_mac(void *data)

>   {

> @@ -72,7 +72,7 @@ int xdp_redirect_map_prog(struct xdp_md *ctx)

>   	vport = 0;

>   

>   	/* count packet in global counter */

> -	value = bpf_map_lookup_elem(&rxcnt, &key);

> +	value = bpf_map_lookup_elem(&pktcnt, &key);

>   	if (value)

>   		*value += 1;

>   

> @@ -82,6 +82,20 @@ int xdp_redirect_map_prog(struct xdp_md *ctx)

>   	return bpf_redirect_map(&tx_port, vport, 0);

>   }

>   

> +SEC("xdp_devmap/map_prog")

> +int xdp_devmap_prog(struct xdp_md *ctx)

> +{

> +	long *value;

> +	u32 key = 1;

> +

> +	/* count packet in global counter */

> +	value = bpf_map_lookup_elem(&pktcnt, &key);

> +	if (value)

> +		*value += 1;

> +

> +	return XDP_PASS;

> +}

> +

>   /* Redirect require an XDP bpf_prog loaded on the TX device */

>   SEC("xdp_redirect_dummy")

>   int xdp_redirect_dummy_prog(struct xdp_md *ctx)

> diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c

> index 35e16dee613e..8bdec0865e1d 100644

> --- a/samples/bpf/xdp_redirect_map_user.c

> +++ b/samples/bpf/xdp_redirect_map_user.c

> @@ -21,12 +21,13 @@

>   

>   static int ifindex_in;

>   static int ifindex_out;

> -static bool ifindex_out_xdp_dummy_attached = true;

> +static bool ifindex_out_xdp_dummy_attached = false;

> +static bool xdp_prog_attached = false;


Maybe xdp_devmap_prog_attached? Feel xdp_prog_attached
is too generic since actually it controls xdp_devmap program
attachment.

>   static __u32 prog_id;

>   static __u32 dummy_prog_id;

>   

>   static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;

> -static int rxcnt_map_fd;

> +static int pktcnt_map_fd;

>   

>   static void int_exit(int sig)

>   {

> @@ -60,26 +61,46 @@ static void int_exit(int sig)

>   	exit(0);

>   }

>   

> -static void poll_stats(int interval, int ifindex)

> +static void poll_stats(int interval, int if_ingress, int if_egress)

>   {

>   	unsigned int nr_cpus = bpf_num_possible_cpus();

> -	__u64 values[nr_cpus], prev[nr_cpus];

> +	__u64 values[nr_cpus], in_prev[nr_cpus], e_prev[nr_cpus];

> +	__u64 sum;

> +	__u32 key;

> +	int i;

>   

> -	memset(prev, 0, sizeof(prev));

> +	memset(in_prev, 0, sizeof(in_prev));

> +	memset(e_prev, 0, sizeof(e_prev));

>   

>   	while (1) {

> -		__u64 sum = 0;

> -		__u32 key = 0;

> -		int i;

> +		sum = 0;

> +		key = 0;

>   

>   		sleep(interval);

> -		assert(bpf_map_lookup_elem(rxcnt_map_fd, &key, values) == 0);

> -		for (i = 0; i < nr_cpus; i++)

> -			sum += (values[i] - prev[i]);

> -		if (sum)

> -			printf("ifindex %i: %10llu pkt/s\n",

> -			       ifindex, sum / interval);

> -		memcpy(prev, values, sizeof(values));

> +		if (bpf_map_lookup_elem(pktcnt_map_fd, &key, values) == 0) {


When we could have a failure here? If it indeed failed maybe it signals
something wrong and the process should fail?

> +			for (i = 0; i < nr_cpus; i++)

> +				sum += (values[i] - in_prev[i]);

> +			if (sum)

> +				printf("in ifindex %i: %10llu pkt/s",

> +				       if_ingress, sum / interval);

> +			memcpy(in_prev, values, sizeof(values));

> +		}

> +

> +		if (!xdp_prog_attached) {

> +			printf("\n");

> +			continue;

> +		}

> +

> +		sum = 0;

> +		key = 1;

> +		if (bpf_map_lookup_elem(pktcnt_map_fd, &key, values) == 0) {


same as the above, if bpf_map_lookup_elem() failed, maybe we should 
signal a failure?

> +			for (i = 0; i < nr_cpus; i++)

> +				sum += (values[i] - e_prev[i]);

> +			if (sum)

> +				printf(", out ifindex %i: %10llu pkt/s\n",

> +				       if_egress, sum / interval);

> +			memcpy(e_prev, values, sizeof(values));

> +		}

>   	}

>   }

>   

[...]
Hangbin Liu Nov. 30, 2020, 7:51 a.m. UTC | #2
On Thu, Nov 26, 2020 at 10:31:56PM -0800, Yonghong Song wrote:
> > index 35e16dee613e..8bdec0865e1d 100644

> > --- a/samples/bpf/xdp_redirect_map_user.c

> > +++ b/samples/bpf/xdp_redirect_map_user.c

> > @@ -21,12 +21,13 @@

> >   static int ifindex_in;

> >   static int ifindex_out;

> > -static bool ifindex_out_xdp_dummy_attached = true;

> > +static bool ifindex_out_xdp_dummy_attached = false;

> > +static bool xdp_prog_attached = false;

> 

> Maybe xdp_devmap_prog_attached? Feel xdp_prog_attached

> is too generic since actually it controls xdp_devmap program

> attachment.


Hi Yonghong,

Thanks for your comments. As Jesper replied, The 2nd xdp_prog on egress
doesn't tell us if the redirect was successful. So the number is meaningless.

I plan to write a example about vlan header modification based on egress
index. I will post the patch later.

Thanks
Hangbin
Jesper Dangaard Brouer Nov. 30, 2020, 9:32 a.m. UTC | #3
On Mon, 30 Nov 2020 15:51:07 +0800
Hangbin Liu <liuhangbin@gmail.com> wrote:

> On Thu, Nov 26, 2020 at 10:31:56PM -0800, Yonghong Song wrote:

> > > index 35e16dee613e..8bdec0865e1d 100644

> > > --- a/samples/bpf/xdp_redirect_map_user.c

> > > +++ b/samples/bpf/xdp_redirect_map_user.c

> > > @@ -21,12 +21,13 @@

> > >   static int ifindex_in;

> > >   static int ifindex_out;

> > > -static bool ifindex_out_xdp_dummy_attached = true;

> > > +static bool ifindex_out_xdp_dummy_attached = false;

> > > +static bool xdp_prog_attached = false;  

> > 

> > Maybe xdp_devmap_prog_attached? Feel xdp_prog_attached

> > is too generic since actually it controls xdp_devmap program

> > attachment.  

> 

> Hi Yonghong,

> 

> Thanks for your comments. As Jesper replied, The 2nd xdp_prog on egress

> doesn't tell us if the redirect was successful. So the number is meaningless.


Well, I would not say the counter is meaningless.  It true that 2nd
devmap xdp_prog doesn't tell us if the redirect was successful, which
means that your description/(understanding) of the counter was wrong.

I still think it is relevant to have a counter for packets processed by
this 2nd xdp_prog, just to make is visually clear that the 2nd xdp-prog
attached (to devmap entry) is running.  The point is that QA is using
these programs.

The lack of good output from this specific sample have cause many
bugzilla cases for me.  BZ cases that requires going back and forth a
number of times, before figuring out how the prog was (mis)used.  This
is why other samples like xdp_rxq_info and xdp_redirect_cpu have such a
verbose output, which in-practice have helped many times on QA issues.


> I plan to write a example about vlan header modification based on egress

> index. I will post the patch later.


I did notice the internal thread you had with Toke.  I still think it
will be more simple to modify the Ethernet mac addresses.  Adding a
VLAN id tag is more work, and will confuse benchmarks.  You are
increasing the packet size, which means that you NIC need to spend
slightly more time sending this packet (3.2 nanosec at 10Gbit/s), which
could falsely be interpreted as cost of 2nd devmap XDP-program.

(Details: these 3.2 ns will not be visible for smaller packets, because
the minimum Ethernet frame size will hide this, but I've experience this
problem with larger frames on real switch hardware (Juniper), where
ingress didn't have VLAN-tag and egress we added VLAN-tag with XDP, and
then switch buffer slowly increased until overflow).

As Alexei already pointed out, you assignment is to modify the packet
in the 2nd devmap XDP-prog.  Why: because you need to realize that this
will break your approach to multicast in your previous patchset.
(Yes, the offlist patch I gave you, that move running 2nd devmap
XDP-prog to a later stage, solved this packet-modify issue).

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer
Hangbin Liu Nov. 30, 2020, 1:10 p.m. UTC | #4
On Mon, Nov 30, 2020 at 10:32:08AM +0100, Jesper Dangaard Brouer wrote:
> > I plan to write a example about vlan header modification based on egress

> > index. I will post the patch later.

> 

> I did notice the internal thread you had with Toke.  I still think it

> will be more simple to modify the Ethernet mac addresses.  Adding a

> VLAN id tag is more work, and will confuse benchmarks.  You are


I plan to only modify the vlan id if there has. If you prefer to modify the
mac address, which way you'd like? Set src mac to egress interface's MAC?

> As Alexei already pointed out, you assignment is to modify the packet

> in the 2nd devmap XDP-prog.  Why: because you need to realize that this

> will break your approach to multicast in your previous patchset.

> (Yes, the offlist patch I gave you, that move running 2nd devmap

> XDP-prog to a later stage, solved this packet-modify issue).


BTW, it looks with your patch, the counter on egress would make more sense.
Should I add the counter after your patch posted?

Thanks
Hangbin
Jesper Dangaard Brouer Nov. 30, 2020, 3:12 p.m. UTC | #5
On Mon, 30 Nov 2020 21:10:20 +0800
Hangbin Liu <liuhangbin@gmail.com> wrote:

> On Mon, Nov 30, 2020 at 10:32:08AM +0100, Jesper Dangaard Brouer wrote:

> > > I plan to write a example about vlan header modification based on egress

> > > index. I will post the patch later.  

> > 

> > I did notice the internal thread you had with Toke.  I still think it

> > will be more simple to modify the Ethernet mac addresses.  Adding a

> > VLAN id tag is more work, and will confuse benchmarks.  You are  

> 

> I plan to only modify the vlan id if there has. 


This sentence is not complete, but because of the internal thread I
know/assume that you mean, that you will only modify the vlan id if
there is already another VLAN tag in the packet. Let me express that
this is not good enough. This is not a feasible choice.

> If you prefer to modify the mac address, which way you'd like? Set

> src mac to egress interface's MAC?


Yes, that will be a good choice, to use the src mac from the egress
interface.  This would simulate part of what is needed for L3/routing.

Can I request that the dst mac is will be the incoming src mac?
Or if you are user-friendly add option that allows to set dst mac.

This is close to what swap-MAC (swap_src_dst_mac) is used for.  Let me
explain in more details, why this is practical.  It is practical
because then the Ethernet frame will be a valid frame that is received
by the sending interface.  Thus, if you redirect back same interface
(like XDP_TX, but testing xdp_do_redirect code) then you can check on
traffic generator if all frames were actually forwarded.  This is
exactly what the Red Hat performance team's Trex packet generator setup
does to validate and find the zero-loss generator rate.


> > As Alexei already pointed out, you assignment is to modify the packet

> > in the 2nd devmap XDP-prog.  Why: because you need to realize that this

> > will break your approach to multicast in your previous patchset.

> > (Yes, the offlist patch I gave you, that move running 2nd devmap

> > XDP-prog to a later stage, solved this packet-modify issue).  

> 

> BTW, it looks with your patch, the counter on egress would make more sense.

> Should I add the counter after your patch posted?


As I tried to explain.  Regardless, I want a counter that counts the
times the 2nd devmap attached XDP-program runs.  This is not a counter
that counts egress packets.  This is a counter that show that the 2nd
devmap attached XDP-program is running.  I don't know how to make this
more clear.

We do need ANOTHER counter that report how many packets are transmitted
on the egress device.  I'm thinking we can simply read:

 /sys/class/net/mlx5p1/statistics/tx_packets

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer
Toke Høiland-Jørgensen Nov. 30, 2020, 4:07 p.m. UTC | #6
Jesper Dangaard Brouer <brouer@redhat.com> writes:

> On Mon, 30 Nov 2020 21:10:20 +0800

> Hangbin Liu <liuhangbin@gmail.com> wrote:

>

>> On Mon, Nov 30, 2020 at 10:32:08AM +0100, Jesper Dangaard Brouer wrote:

>> > > I plan to write a example about vlan header modification based on egress

>> > > index. I will post the patch later.  

>> > 

>> > I did notice the internal thread you had with Toke.  I still think it

>> > will be more simple to modify the Ethernet mac addresses.  Adding a

>> > VLAN id tag is more work, and will confuse benchmarks.  You are  

>> 

>> I plan to only modify the vlan id if there has. 

>

> This sentence is not complete, but because of the internal thread I

> know/assume that you mean, that you will only modify the vlan id if

> there is already another VLAN tag in the packet. Let me express that

> this is not good enough. This is not a feasible choice.

>

>> If you prefer to modify the mac address, which way you'd like? Set

>> src mac to egress interface's MAC?

>

> Yes, that will be a good choice, to use the src mac from the egress

> interface.  This would simulate part of what is needed for L3/routing.

>

> Can I request that the dst mac is will be the incoming src mac?

> Or if you are user-friendly add option that allows to set dst mac.


One issue with this is that I think it would be neat if we could output
the egress ifindex as part of the packet data, to verify that different
packets can get different content (in the multicast case). If we just
modify the MAC address this is difficult. I guess we could just decide
to step on one byte in the src MAC or something, but VLAN tags seemed
like an obvious alternative :)

-Toke
Jesper Dangaard Brouer Dec. 8, 2020, 10:39 a.m. UTC | #7
On Tue,  8 Dec 2020 16:18:56 +0800
Hangbin Liu <liuhangbin@gmail.com> wrote:

> This patch add a xdp program on egress to show that we can modify

> the packet on egress. In this sample we will set the pkt's src

> mac to egress's mac address. The xdp_prog will be attached when

> -X option supplied.

> 

> Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>

> ---

> v3:

> a) modify the src mac address based on egress mac

> 

> v2:

> a) use pkt counter instead of IP ttl modification on egress program

> b) make the egress program selectable by option -X

> ---

>  samples/bpf/xdp_redirect_map_kern.c |  60 ++++++++++-

>  samples/bpf/xdp_redirect_map_user.c | 153 ++++++++++++++++++++--------

>  2 files changed, 168 insertions(+), 45 deletions(-)

> 


[...]
> diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c

> index 31131b6e7782..19636045c8dc 100644

> --- a/samples/bpf/xdp_redirect_map_user.c

> +++ b/samples/bpf/xdp_redirect_map_user.c

> @@ -14,6 +14,10 @@

>  #include <unistd.h>

>  #include <libgen.h>

>  #include <sys/resource.h>

> +#include <sys/ioctl.h>

> +#include <sys/types.h>

> +#include <sys/socket.h>

> +#include <netinet/in.h>

>  

>  #include "bpf_util.h"

>  #include <bpf/bpf.h>

> @@ -21,7 +25,8 @@

>  

>  static int ifindex_in;

>  static int ifindex_out;

> -static bool ifindex_out_xdp_dummy_attached = true;

> +static bool ifindex_out_xdp_dummy_attached = false;

> +static bool xdp_devmap_attached = false;

>  static __u32 prog_id;

>  static __u32 dummy_prog_id;

>  

> @@ -83,6 +88,29 @@ static void poll_stats(int interval, int ifindex)

>  	}

>  }

>  

> +static int get_mac_addr(unsigned int ifindex_out, void *mac_addr)

> +{

> +	struct ifreq ifr;

> +	char ifname[IF_NAMESIZE];

> +	int fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);


I would have expected (like ethtool):
 fd = socket(AF_INET, SOCK_DGRAM, 0);

> +	if (fd < 0)

> +		return -1;

> +

> +	if (!if_indextoname(ifindex_out, ifname))

> +		return -1;

> +

> +	strcpy(ifr.ifr_name, ifname);

> +

> +	if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0)

> +		return -1;

> +

> +	memcpy(mac_addr, ifr.ifr_hwaddr.sa_data, 6 * sizeof(char));

> +	close(fd);

> +

> +	return 0;

> +}

[...]

> -	/* Loading dummy XDP prog on out-device */

> -	if (bpf_set_link_xdp_fd(ifindex_out, dummy_prog_fd,

> -			    (xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST)) < 0) {

> -		printf("WARN: link set xdp fd failed on %d\n", ifindex_out);

> -		ifindex_out_xdp_dummy_attached = false;

> -	}

> +	/* If -X supplied, load 2nd xdp prog on egress.

> +	 * If not, just load dummy prog on egress.

> +	 */


The dummy prog need to be loaded, regardless of 2nd xdp prog on egress.


> +	if (xdp_devmap_attached) {

> +		unsigned char mac_addr[6];

>  

> -	memset(&info, 0, sizeof(info));

> -	ret = bpf_obj_get_info_by_fd(dummy_prog_fd, &info, &info_len);

> -	if (ret) {

> -		printf("can't get prog info - %s\n", strerror(errno));

> -		return ret;

> +		devmap_prog = bpf_object__find_program_by_title(obj, "xdp_devmap/map_prog");

> +		if (!devmap_prog) {

> +			printf("finding devmap_prog in obj file failed\n");

> +			goto out;

> +		}

> +		devmap_prog_fd = bpf_program__fd(devmap_prog);

> +		if (devmap_prog_fd < 0) {

> +			printf("finding devmap_prog fd failed\n");

> +			goto out;

> +		}

> +

> +		if (get_mac_addr(ifindex_out, mac_addr) < 0) {

> +			printf("get interface %d mac failed\n", ifindex_out);

> +			goto out;

> +		}

> +

> +		ret = bpf_map_update_elem(tx_mac_map_fd, &key, mac_addr, 0);

> +		if (ret) {

> +			perror("bpf_update_elem tx_mac_map_fd");

> +			goto out;

> +		}

> +	} else if (ifindex_in != ifindex_out) {

> +		dummy_prog = bpf_object__find_program_by_title(obj, "xdp_redirect_dummy");

> +		if (!dummy_prog) {

> +			printf("finding dummy_prog in obj file failed\n");

> +			goto out;

> +		}

> +

> +		dummy_prog_fd = bpf_program__fd(dummy_prog);

> +		if (dummy_prog_fd < 0) {

> +			printf("find dummy_prog fd failed\n");

> +			goto out;

> +		}

> +

> +		if (bpf_set_link_xdp_fd(ifindex_out, dummy_prog_fd,

> +					(xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST)) == 0) {

> +			ifindex_out_xdp_dummy_attached = true;

> +		} else {

> +			printf("WARN: link set xdp fd failed on %d\n", ifindex_out);

> +		}

> +

> +		memset(&info, 0, sizeof(info));

> +		ret = bpf_obj_get_info_by_fd(dummy_prog_fd, &info, &info_len);

> +		if (ret) {

> +			printf("can't get prog info - %s\n", strerror(errno));

> +		}

> +		dummy_prog_id = info.id;

>  	}

> -	dummy_prog_id = info.id;

>  

>  	signal(SIGINT, int_exit);

>  	signal(SIGTERM, int_exit);

>  

> -	/* populate virtual to physical port map */

> -	ret = bpf_map_update_elem(tx_port_map_fd, &key, &ifindex_out, 0);

> +	devmap_val.ifindex = ifindex_out;

> +	devmap_val.bpf_prog.fd = devmap_prog_fd;

> +	ret = bpf_map_update_elem(tx_port_map_fd, &key, &devmap_val, 0);

>  	if (ret) {

> -		perror("bpf_update_elem");

> +		perror("bpf_update_elem tx_port_map_fd");

>  		goto out;

>  	}

>  

>  	poll_stats(2, ifindex_out);

>  

>  out:

> -	return 0;

> +	bpf_object__close(obj);

> +	return 1;

>  }




-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer
Hangbin Liu Dec. 8, 2020, 11:11 a.m. UTC | #8
On Tue, Dec 08, 2020 at 11:39:14AM +0100, Jesper Dangaard Brouer wrote:
> > +	/* If -X supplied, load 2nd xdp prog on egress.

> > +	 * If not, just load dummy prog on egress.

> > +	 */

> 

> The dummy prog need to be loaded, regardless of 2nd xdp prog on egress.


Thanks for this remind, Now I know why the pkts are dropped with I do perf
test on physical NICs.

Regards
Hangbin
diff mbox series

Patch

diff --git a/samples/bpf/xdp_redirect_map_kern.c b/samples/bpf/xdp_redirect_map_kern.c
index 6489352ab7a4..fd6704a4f7e2 100644
--- a/samples/bpf/xdp_redirect_map_kern.c
+++ b/samples/bpf/xdp_redirect_map_kern.c
@@ -22,19 +22,19 @@ 
 struct {
 	__uint(type, BPF_MAP_TYPE_DEVMAP);
 	__uint(key_size, sizeof(int));
-	__uint(value_size, sizeof(int));
+	__uint(value_size, sizeof(struct bpf_devmap_val));
 	__uint(max_entries, 100);
 } tx_port SEC(".maps");
 
-/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success
- * feedback.  Redirect TX errors can be caught via a tracepoint.
+/* Count RX/TX packets, use key 0 for rx pkt count, key 1 for tx
+ * pkt count.
  */
 struct {
 	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
 	__type(key, u32);
 	__type(value, long);
-	__uint(max_entries, 1);
-} rxcnt SEC(".maps");
+	__uint(max_entries, 2);
+} pktcnt SEC(".maps");
 
 static void swap_src_dst_mac(void *data)
 {
@@ -72,7 +72,7 @@  int xdp_redirect_map_prog(struct xdp_md *ctx)
 	vport = 0;
 
 	/* count packet in global counter */
-	value = bpf_map_lookup_elem(&rxcnt, &key);
+	value = bpf_map_lookup_elem(&pktcnt, &key);
 	if (value)
 		*value += 1;
 
@@ -82,6 +82,20 @@  int xdp_redirect_map_prog(struct xdp_md *ctx)
 	return bpf_redirect_map(&tx_port, vport, 0);
 }
 
+SEC("xdp_devmap/map_prog")
+int xdp_devmap_prog(struct xdp_md *ctx)
+{
+	long *value;
+	u32 key = 1;
+
+	/* count packet in global counter */
+	value = bpf_map_lookup_elem(&pktcnt, &key);
+	if (value)
+		*value += 1;
+
+	return XDP_PASS;
+}
+
 /* Redirect require an XDP bpf_prog loaded on the TX device */
 SEC("xdp_redirect_dummy")
 int xdp_redirect_dummy_prog(struct xdp_md *ctx)
diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c
index 35e16dee613e..8bdec0865e1d 100644
--- a/samples/bpf/xdp_redirect_map_user.c
+++ b/samples/bpf/xdp_redirect_map_user.c
@@ -21,12 +21,13 @@ 
 
 static int ifindex_in;
 static int ifindex_out;
-static bool ifindex_out_xdp_dummy_attached = true;
+static bool ifindex_out_xdp_dummy_attached = false;
+static bool xdp_prog_attached = false;
 static __u32 prog_id;
 static __u32 dummy_prog_id;
 
 static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
-static int rxcnt_map_fd;
+static int pktcnt_map_fd;
 
 static void int_exit(int sig)
 {
@@ -60,26 +61,46 @@  static void int_exit(int sig)
 	exit(0);
 }
 
-static void poll_stats(int interval, int ifindex)
+static void poll_stats(int interval, int if_ingress, int if_egress)
 {
 	unsigned int nr_cpus = bpf_num_possible_cpus();
-	__u64 values[nr_cpus], prev[nr_cpus];
+	__u64 values[nr_cpus], in_prev[nr_cpus], e_prev[nr_cpus];
+	__u64 sum;
+	__u32 key;
+	int i;
 
-	memset(prev, 0, sizeof(prev));
+	memset(in_prev, 0, sizeof(in_prev));
+	memset(e_prev, 0, sizeof(e_prev));
 
 	while (1) {
-		__u64 sum = 0;
-		__u32 key = 0;
-		int i;
+		sum = 0;
+		key = 0;
 
 		sleep(interval);
-		assert(bpf_map_lookup_elem(rxcnt_map_fd, &key, values) == 0);
-		for (i = 0; i < nr_cpus; i++)
-			sum += (values[i] - prev[i]);
-		if (sum)
-			printf("ifindex %i: %10llu pkt/s\n",
-			       ifindex, sum / interval);
-		memcpy(prev, values, sizeof(values));
+		if (bpf_map_lookup_elem(pktcnt_map_fd, &key, values) == 0) {
+			for (i = 0; i < nr_cpus; i++)
+				sum += (values[i] - in_prev[i]);
+			if (sum)
+				printf("in ifindex %i: %10llu pkt/s",
+				       if_ingress, sum / interval);
+			memcpy(in_prev, values, sizeof(values));
+		}
+
+		if (!xdp_prog_attached) {
+			printf("\n");
+			continue;
+		}
+
+		sum = 0;
+		key = 1;
+		if (bpf_map_lookup_elem(pktcnt_map_fd, &key, values) == 0) {
+			for (i = 0; i < nr_cpus; i++)
+				sum += (values[i] - e_prev[i]);
+			if (sum)
+				printf(", out ifindex %i: %10llu pkt/s\n",
+				       if_egress, sum / interval);
+			memcpy(e_prev, values, sizeof(values));
+		}
 	}
 }
 
@@ -90,7 +111,8 @@  static void usage(const char *prog)
 		"OPTS:\n"
 		"    -S    use skb-mode\n"
 		"    -N    enforce native mode\n"
-		"    -F    force loading prog\n",
+		"    -F    force loading prog\n"
+		"    -X    load xdp program on egress\n",
 		prog);
 }
 
@@ -98,13 +120,14 @@  int main(int argc, char **argv)
 {
 	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
 	struct bpf_prog_load_attr prog_load_attr = {
-		.prog_type	= BPF_PROG_TYPE_XDP,
+		.prog_type	= BPF_PROG_TYPE_UNSPEC,
 	};
-	struct bpf_program *prog, *dummy_prog;
+	struct bpf_program *prog, *dummy_prog, *devmap_prog;
+	int prog_fd, dummy_prog_fd, devmap_prog_fd = -1;
+	struct bpf_devmap_val devmap_val;
 	struct bpf_prog_info info = {};
 	__u32 info_len = sizeof(info);
-	int prog_fd, dummy_prog_fd;
-	const char *optstr = "FSN";
+	const char *optstr = "FSNX";
 	struct bpf_object *obj;
 	int ret, opt, key = 0;
 	char filename[256];
@@ -121,6 +144,9 @@  int main(int argc, char **argv)
 		case 'F':
 			xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
 			break;
+		case 'X':
+			xdp_prog_attached = true;
+			break;
 		default:
 			usage(basename(argv[0]));
 			return 1;
@@ -157,23 +183,14 @@  int main(int argc, char **argv)
 		return 1;
 
 	prog = bpf_program__next(NULL, obj);
-	dummy_prog = bpf_program__next(prog, obj);
-	if (!prog || !dummy_prog) {
-		printf("finding a prog in obj file failed\n");
-		return 1;
-	}
-	/* bpf_prog_load_xattr gives us the pointer to first prog's fd,
-	 * so we're missing only the fd for dummy prog
-	 */
-	dummy_prog_fd = bpf_program__fd(dummy_prog);
-	if (prog_fd < 0 || dummy_prog_fd < 0) {
-		printf("bpf_prog_load_xattr: %s\n", strerror(errno));
+	if (!prog || prog_fd <0) {
+		printf("finding prog in obj file failed\n");
 		return 1;
 	}
 
 	tx_port_map_fd = bpf_object__find_map_fd_by_name(obj, "tx_port");
-	rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt");
-	if (tx_port_map_fd < 0 || rxcnt_map_fd < 0) {
+	pktcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "pktcnt");
+	if (tx_port_map_fd < 0 || pktcnt_map_fd < 0) {
 		printf("bpf_object__find_map_fd_by_name failed\n");
 		return 1;
 	}
@@ -190,32 +207,59 @@  int main(int argc, char **argv)
 	}
 	prog_id = info.id;
 
-	/* Loading dummy XDP prog on out-device */
-	if (bpf_set_link_xdp_fd(ifindex_out, dummy_prog_fd,
-			    (xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST)) < 0) {
-		printf("WARN: link set xdp fd failed on %d\n", ifindex_out);
-		ifindex_out_xdp_dummy_attached = false;
-	}
+	/* Loading dummy XDP prog on out-device if no xdp_prog_attached*/
+	if (xdp_prog_attached) {
+		devmap_prog = bpf_object__find_program_by_title(obj, "xdp_devmap/map_prog");
+		if (!devmap_prog) {
+			printf("finding devmap_prog in obj file failed\n");
+			return 1;
+		}
+		devmap_prog_fd = bpf_program__fd(devmap_prog);
+		if (devmap_prog_fd < 0) {
+			printf("find devmap_prog fd: %s\n", strerror(errno));
+			return 1;
+		}
+	} else {
+		dummy_prog = bpf_object__find_program_by_title(obj, "xdp_redirect_dummy");
+		if (!dummy_prog) {
+			printf("finding dummy_prog in obj file failed\n");
+			return 1;
+		}
 
-	memset(&info, 0, sizeof(info));
-	ret = bpf_obj_get_info_by_fd(dummy_prog_fd, &info, &info_len);
-	if (ret) {
-		printf("can't get prog info - %s\n", strerror(errno));
-		return ret;
+		dummy_prog_fd = bpf_program__fd(dummy_prog);
+		if (dummy_prog_fd < 0) {
+			printf("find dummy_prog fd: %s\n", strerror(errno));
+			return 1;
+		}
+
+		if (bpf_set_link_xdp_fd(ifindex_out, dummy_prog_fd,
+				    (xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST)) == 0) {
+			ifindex_out_xdp_dummy_attached = true;
+		} else {
+			printf("WARN: link set xdp fd failed on %d\n", ifindex_out);
+		}
+
+		memset(&info, 0, sizeof(info));
+		ret = bpf_obj_get_info_by_fd(dummy_prog_fd, &info, &info_len);
+		if (ret) {
+			printf("can't get prog info - %s\n", strerror(errno));
+			return ret;
+		}
+		dummy_prog_id = info.id;
 	}
-	dummy_prog_id = info.id;
 
 	signal(SIGINT, int_exit);
 	signal(SIGTERM, int_exit);
 
-	/* populate virtual to physical port map */
-	ret = bpf_map_update_elem(tx_port_map_fd, &key, &ifindex_out, 0);
+	devmap_val.ifindex = ifindex_out;
+	devmap_val.bpf_prog.fd = devmap_prog_fd;
+	ret = bpf_map_update_elem(tx_port_map_fd, &key, &devmap_val, 0);
 	if (ret) {
-		perror("bpf_update_elem");
+		perror("bpf_update_elem tx_port_map_fd");
 		goto out;
 	}
 
-	poll_stats(2, ifindex_out);
+	poll_stats(2, ifindex_in, ifindex_out);
 
 out:
 	return 0;