diff mbox series

[v3] net-next: When a bond have a massive amount of VLANs with IPv6 addresses, performance of changing link state, attaching a VRF, changing an IPv6 address, etc. go down dramtically.

Message ID 20210819071727.1257434-1-gnaaman@drivenets.com
State New
Headers show
Series [v3] net-next: When a bond have a massive amount of VLANs with IPv6 addresses, performance of changing link state, attaching a VRF, changing an IPv6 address, etc. go down dramtically. | expand

Commit Message

Gilad Naaman Aug. 19, 2021, 7:17 a.m. UTC
The source of most of the slow down is the `dev_addr_lists.c` module,
which mainatins a linked list of HW addresses.
When using IPv6, this list grows for each IPv6 address added on a
VLAN, since each IPv6 address has a multicast HW address associated with
it.

When performing any modification to the involved links, this list is
traversed many times, often for nothing, all while holding the RTNL
lock.

Instead, this patch adds an auxilliary rbtree which cuts down
traversal time significantly.

Performance can be seen with the following script:

	#!/bin/bash
	ip netns del test || true 2>/dev/null
	ip netns add test

	echo 1 | ip netns exec test tee /proc/sys/net/ipv6/conf/all/keep_addr_on_down > /dev/null

	set -e

	ip -n test link add foo type veth peer name bar
	ip -n test link add b1 type bond
	ip -n test link add florp type vrf table 10

	ip -n test link set bar master b1
	ip -n test link set foo up
	ip -n test link set bar up
	ip -n test link set b1 up
	ip -n test link set florp up

	VLAN_COUNT=1500
	BASE_DEV=b1

	echo Creating vlans
	ip netns exec test time -p bash -c "for i in \$(seq 1 $VLAN_COUNT);
	do ip -n test link add link $BASE_DEV name foo.\$i type vlan id \$i; done"

	echo Bringing them up
	ip netns exec test time -p bash -c "for i in \$(seq 1 $VLAN_COUNT);
	do ip -n test link set foo.\$i up; done"

	echo Assiging IPv6 Addresses
	ip netns exec test time -p bash -c "for i in \$(seq 1 $VLAN_COUNT);
	do ip -n test address add dev foo.\$i 2000::\$i/64; done"

	echo Attaching to VRF
	ip netns exec test time -p bash -c "for i in \$(seq 1 $VLAN_COUNT);
	do ip -n test link set foo.\$i master florp; done"

On an Intel(R) Xeon(R) CPU E5-2650 v3 @ 2.30GHz machine, the performance
before the patch is (truncated):

	Creating vlans
	real 108.35
	Bringing them up
	real 4.96
	Assiging IPv6 Addresses
	real 19.22
	Attaching to VRF
	real 458.84

After the patch:

	Creating vlans
	real 5.59
	Bringing them up
	real 5.07
	Assiging IPv6 Addresses
	real 5.64
	Attaching to VRF
	real 25.37

Cc: David S. Miller <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Lu Wei <luwei32@huawei.com>
Cc: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Cc: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Gilad Naaman <gnaaman@drivenets.com>
---

In version 1, Nikolay Aleksandrov <nikolay@nvidia.com> asked
why not remove the list entirely in favour of the tree.

Altough this sounds like the right solution,
I am not sure it's possible to do this with RCU-safety in mind, and I
doubt my ability to correctly patch every place in the kernel that uses
the list without introducing further bugs.

v1 -> v2:
	- Formatting/typo fixes
	- Retarget net-next
v2 -> v3:
	- Exclude first list address from tree because it can be
	  changed via `dev->dev_addr` (Thanks <isaac@speedb.io>)

---
 include/linux/netdevice.h |   5 ++
 net/core/dev_addr_lists.c | 144 ++++++++++++++++++++++++++------------
 2 files changed, 103 insertions(+), 46 deletions(-)

Comments

Jakub Kicinski Aug. 25, 2021, 5:42 p.m. UTC | #1
Please repost with the subject fixed.

It should be something like:

[PATCH net-next v4] net: maintain rbtree of device hw addresses

not the first sentence of the commit message.
diff mbox series

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 11a52f2fa..c84db4379 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -48,6 +48,7 @@ 
 #include <uapi/linux/if_bonding.h>
 #include <uapi/linux/pkt_cls.h>
 #include <linux/hashtable.h>
+#include <linux/rbtree.h>
 
 struct netpoll_info;
 struct device;
@@ -202,6 +203,7 @@  struct sk_buff;
 
 struct netdev_hw_addr {
 	struct list_head	list;
+	struct rb_node		node;
 	unsigned char		addr[MAX_ADDR_LEN];
 	unsigned char		type;
 #define NETDEV_HW_ADDR_T_LAN		1
@@ -219,6 +221,9 @@  struct netdev_hw_addr {
 struct netdev_hw_addr_list {
 	struct list_head	list;
 	int			count;
+
+	/* Auxiliary tree for faster lookup on addition and deletion */
+	struct rb_root		tree;
 };
 
 #define netdev_hw_addr_list_count(l) ((l)->count)
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index 2f949b5a1..dc9bc01f2 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -16,10 +16,9 @@ 
  * General list handling functions
  */
 
-static int __hw_addr_create_ex(struct netdev_hw_addr_list *list,
-			       const unsigned char *addr, int addr_len,
-			       unsigned char addr_type, bool global,
-			       bool sync)
+static struct netdev_hw_addr*
+__hw_addr_create(const unsigned char *addr, int addr_len,
+		 unsigned char addr_type, bool global, bool sync)
 {
 	struct netdev_hw_addr *ha;
 	int alloc_size;
@@ -29,32 +28,44 @@  static int __hw_addr_create_ex(struct netdev_hw_addr_list *list,
 		alloc_size = L1_CACHE_BYTES;
 	ha = kmalloc(alloc_size, GFP_ATOMIC);
 	if (!ha)
-		return -ENOMEM;
+		return NULL;
 	memcpy(ha->addr, addr, addr_len);
 	ha->type = addr_type;
 	ha->refcount = 1;
 	ha->global_use = global;
 	ha->synced = sync ? 1 : 0;
 	ha->sync_cnt = 0;
-	list_add_tail_rcu(&ha->list, &list->list);
-	list->count++;
 
-	return 0;
+	return ha;
 }
 
 static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
 			    const unsigned char *addr, int addr_len,
 			    unsigned char addr_type, bool global, bool sync,
-			    int sync_count)
+			    int sync_count, bool exclusive)
 {
+	struct rb_node **ins_point = &list->tree.rb_node, *parent = NULL;
 	struct netdev_hw_addr *ha;
 
 	if (addr_len > MAX_ADDR_LEN)
 		return -EINVAL;
 
-	list_for_each_entry(ha, &list->list, list) {
-		if (ha->type == addr_type &&
-		    !memcmp(ha->addr, addr, addr_len)) {
+	while (*ins_point) {
+		int diff;
+
+		ha = rb_entry(*ins_point, struct netdev_hw_addr, node);
+		diff = memcmp(addr, ha->addr, addr_len);
+		if (diff == 0)
+			diff = memcmp(&addr_type, &ha->type, sizeof(addr_type));
+
+		parent = *ins_point;
+		if (diff < 0) {
+			ins_point = &parent->rb_left;
+		} else if (diff > 0) {
+			ins_point = &parent->rb_right;
+		} else {
+			if (exclusive)
+				return -EEXIST;
 			if (global) {
 				/* check if addr is already used as global */
 				if (ha->global_use)
@@ -73,8 +84,25 @@  static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
 		}
 	}
 
-	return __hw_addr_create_ex(list, addr, addr_len, addr_type, global,
-				   sync);
+	ha = __hw_addr_create(addr, addr_len, addr_type, global, sync);
+	if (!ha)
+		return -ENOMEM;
+
+	/* The first address in dev->dev_addrs is pointed to by dev->dev_addr
+	 * and mutated freely by device drivers and netdev ops, so if we insert
+	 * it into the tree we'll end up with an invalid rbtree.
+	 */
+	if (list->count > 0) {
+		rb_link_node(&ha->node, parent, ins_point);
+		rb_insert_color(&ha->node, &list->tree);
+	} else {
+		RB_CLEAR_NODE(&ha->node);
+	}
+
+	list_add_tail_rcu(&ha->list, &list->list);
+	list->count++;
+
+	return 0;
 }
 
 static int __hw_addr_add(struct netdev_hw_addr_list *list,
@@ -82,7 +110,7 @@  static int __hw_addr_add(struct netdev_hw_addr_list *list,
 			 unsigned char addr_type)
 {
 	return __hw_addr_add_ex(list, addr, addr_len, addr_type, false, false,
-				0);
+				0, false);
 }
 
 static int __hw_addr_del_entry(struct netdev_hw_addr_list *list,
@@ -103,24 +131,61 @@  static int __hw_addr_del_entry(struct netdev_hw_addr_list *list,
 
 	if (--ha->refcount)
 		return 0;
+
+	if (!RB_EMPTY_NODE(&ha->node))
+		rb_erase(&ha->node, &list->tree);
+
 	list_del_rcu(&ha->list);
 	kfree_rcu(ha, rcu_head);
 	list->count--;
 	return 0;
 }
 
+static struct netdev_hw_addr *__hw_addr_lookup(struct netdev_hw_addr_list *list,
+					       const unsigned char *addr, int addr_len,
+					       unsigned char addr_type)
+{
+	struct netdev_hw_addr *ha;
+	struct rb_node *node;
+
+	/* The first address isn't inserted into the tree because in the dev->dev_addrs
+	 * list it's the address pointed to by dev->dev_addr which is freely mutated
+	 * in place, so we need to check it separately.
+	 */
+	ha = list_first_entry(&list->list, struct netdev_hw_addr, list);
+	if (ha && !memcmp(addr, ha->addr, addr_len) &&
+	    (!addr_type || addr_type == ha->type))
+		return ha;
+
+	node = list->tree.rb_node;
+
+	while (node) {
+		struct netdev_hw_addr *ha = rb_entry(node, struct netdev_hw_addr, node);
+		int diff = memcmp(addr, ha->addr, addr_len);
+
+		if (diff == 0 && addr_type)
+			diff = memcmp(&addr_type, &ha->type, sizeof(addr_type));
+
+		if (diff < 0)
+			node = node->rb_left;
+		else if (diff > 0)
+			node = node->rb_right;
+		else
+			return ha;
+	}
+
+	return NULL;
+}
+
 static int __hw_addr_del_ex(struct netdev_hw_addr_list *list,
 			    const unsigned char *addr, int addr_len,
 			    unsigned char addr_type, bool global, bool sync)
 {
-	struct netdev_hw_addr *ha;
+	struct netdev_hw_addr *ha = __hw_addr_lookup(list, addr, addr_len, addr_type);
 
-	list_for_each_entry(ha, &list->list, list) {
-		if (!memcmp(ha->addr, addr, addr_len) &&
-		    (ha->type == addr_type || !addr_type))
-			return __hw_addr_del_entry(list, ha, global, sync);
-	}
-	return -ENOENT;
+	if (!ha)
+		return -ENOENT;
+	return __hw_addr_del_entry(list, ha, global, sync);
 }
 
 static int __hw_addr_del(struct netdev_hw_addr_list *list,
@@ -137,7 +202,7 @@  static int __hw_addr_sync_one(struct netdev_hw_addr_list *to_list,
 	int err;
 
 	err = __hw_addr_add_ex(to_list, ha->addr, addr_len, ha->type,
-			       false, true, ha->sync_cnt);
+			       false, true, ha->sync_cnt, false);
 	if (err && err != -EEXIST)
 		return err;
 
@@ -407,6 +472,7 @@  static void __hw_addr_flush(struct netdev_hw_addr_list *list)
 {
 	struct netdev_hw_addr *ha, *tmp;
 
+	list->tree = RB_ROOT;
 	list_for_each_entry_safe(ha, tmp, &list->list, list) {
 		list_del_rcu(&ha->list);
 		kfree_rcu(ha, rcu_head);
@@ -418,6 +484,7 @@  void __hw_addr_init(struct netdev_hw_addr_list *list)
 {
 	INIT_LIST_HEAD(&list->list);
 	list->count = 0;
+	list->tree = RB_ROOT;
 }
 EXPORT_SYMBOL(__hw_addr_init);
 
@@ -552,22 +619,14 @@  EXPORT_SYMBOL(dev_addr_del);
  */
 int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr)
 {
-	struct netdev_hw_addr *ha;
 	int err;
 
 	netif_addr_lock_bh(dev);
-	list_for_each_entry(ha, &dev->uc.list, list) {
-		if (!memcmp(ha->addr, addr, dev->addr_len) &&
-		    ha->type == NETDEV_HW_ADDR_T_UNICAST) {
-			err = -EEXIST;
-			goto out;
-		}
-	}
-	err = __hw_addr_create_ex(&dev->uc, addr, dev->addr_len,
-				  NETDEV_HW_ADDR_T_UNICAST, true, false);
+	err = __hw_addr_add_ex(&dev->uc, addr, dev->addr_len,
+			       NETDEV_HW_ADDR_T_UNICAST, true, false,
+			       0, true);
 	if (!err)
 		__dev_set_rx_mode(dev);
-out:
 	netif_addr_unlock_bh(dev);
 	return err;
 }
@@ -736,22 +795,14 @@  EXPORT_SYMBOL(dev_uc_init);
  */
 int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr)
 {
-	struct netdev_hw_addr *ha;
 	int err;
 
 	netif_addr_lock_bh(dev);
-	list_for_each_entry(ha, &dev->mc.list, list) {
-		if (!memcmp(ha->addr, addr, dev->addr_len) &&
-		    ha->type == NETDEV_HW_ADDR_T_MULTICAST) {
-			err = -EEXIST;
-			goto out;
-		}
-	}
-	err = __hw_addr_create_ex(&dev->mc, addr, dev->addr_len,
-				  NETDEV_HW_ADDR_T_MULTICAST, true, false);
+	err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len,
+			       NETDEV_HW_ADDR_T_MULTICAST, true, false,
+			       0, true);
 	if (!err)
 		__dev_set_rx_mode(dev);
-out:
 	netif_addr_unlock_bh(dev);
 	return err;
 }
@@ -764,7 +815,8 @@  static int __dev_mc_add(struct net_device *dev, const unsigned char *addr,
 
 	netif_addr_lock_bh(dev);
 	err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len,
-			       NETDEV_HW_ADDR_T_MULTICAST, global, false, 0);
+			       NETDEV_HW_ADDR_T_MULTICAST, global, false,
+			       0, false);
 	if (!err)
 		__dev_set_rx_mode(dev);
 	netif_addr_unlock_bh(dev);