[RFC,05/11] net: Infrastructure for per queue aRFS

Message ID 20200624171749.11927-6-tom@herbertland.com
State New
Headers show
Series
  • [RFC,01/11] cgroup: Export cgroup_{procs,threads}_start and cgroup_procs_next
Related show

Commit Message

Tom Herbert June 24, 2020, 5:17 p.m.
Infrastructure changes to allow aRFS to be based on Per Thread Queues
instead of just CPUs. The basic change is to create a field in
rps_dev_flow that holds either a CPU or a queue index (that is, no
longer just a CPU).

Changes include:
	- Replace u16 cpu field in rps_dev_flow structure with
	  rps_cpu_qid structure that contains either a CPU or a device
	  queue index. Note the structure is still sixteen bits
	- Helper functions to clear and set the cpu in the
	  rps_cpu_qid of rps_dev_flow
	- Create a sock_masks structure that contains the partition
	  of the thirty-two bit entry in rps_sock_flow_table. The
	  structure contains two masks, one to extract the upper bits
	  of the hash and one to extract the CPU number or queue index
	- Replace rps_cpu_mask with sock_masks from rps_sock_flow_table
	- Add rps_max_num_queues which will be used when creating
	  sock_masks for queue entries in rps_sock_flow_table
---
 include/linux/netdevice.h  | 94 +++++++++++++++++++++++++++++++++-----
 net/core/dev.c             | 47 ++++++++++++-------
 net/core/net-sysfs.c       |  2 +-
 net/core/sysctl_net_core.c |  6 ++-
 4 files changed, 119 insertions(+), 30 deletions(-)

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index bf5f2a85da97..d528aa61fea3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -674,18 +674,65 @@  struct rps_map {
 };
 #define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + ((_num) * sizeof(u16)))
 
+/* The rps_cpu_qid structure is sixteen bits and holds either a CPU number or
+ * a queue index: if use_qid is 1 the other fifteen bits are a queue identifier
+ * (qid), and if use_qid is 0 they are a CPU number (cpu). cpu and qid live in
+ * sibling structs, not a nested union, because a nested union cannot share a
+ * storage unit with use_qid and would grow the structure beyond val. No entry
+ * is signified by val == RPS_NO_CPU_QID, i.e. NO_QUEUE (0xffff), so CPUs
+ * range over 0..32,767 (0x7fff) and queue identifiers over 0..32,766.
+ * CONFIG_NR_CPUS currently has a maximum supported value of 8,192 (per
+ * arch/x86/Kconfig); WARN_ON checks that a CPU number is less than 0x8000
+ * when it is set in rps_cpu_qid. The queue index is limited by configuration.
+ */
+struct rps_cpu_qid {
+	union {
+		u16 val;
+		struct {
+			u16 use_qid: 1;
+			u16 cpu: 15;
+		};
+		struct {
+			u16 : 1;	/* overlays use_qid */
+			u16 qid: 15;
+		};
+	};
+};
+
+#define RPS_NO_CPU_QID	NO_QUEUE	/* No CPU or qid in rps_cpu_qid */
+#define RPS_MAX_CPU	0x7fff		/* Maximum cpu in rps_cpu_qid */
+#define RPS_MAX_QID	0x7ffe		/* Maximum qid in rps_cpu_qid */
+
 /*
  * The rps_dev_flow structure contains the mapping of a flow to a CPU, the
  * tail pointer for that CPU's input queue at the time of last enqueue, and
  * a hardware filter index.
  */
 struct rps_dev_flow {
-	u16 cpu;
+	struct rps_cpu_qid cpu_qid;	/* CPU number or queue index for the flow */
 	u16 filter;
 	unsigned int last_qtail;
 };
 #define RPS_NO_FILTER 0xffff
 
+static inline void rps_dev_flow_clear(struct rps_dev_flow *dev_flow)
+{
+	dev_flow->cpu_qid.val = RPS_NO_CPU_QID;	/* mark as holding no CPU/qid */
+}
+
+static inline void rps_dev_flow_set_cpu(struct rps_dev_flow *dev_flow, u16 cpu)
+{
+	struct rps_cpu_qid cpu_qid;
+
+	if (WARN_ON(cpu > RPS_MAX_CPU))	/* cpu must fit in fifteen bits */
+		return;
+
+	/* Build the entry locally, then publish it in one marked store */
+	cpu_qid.use_qid = 0;
+	cpu_qid.cpu = cpu;
+	WRITE_ONCE(dev_flow->cpu_qid, cpu_qid);
+}
+
 /*
  * The rps_dev_flow_table structure contains a table of flow mappings.
  */
@@ -697,34 +744,57 @@  struct rps_dev_flow_table {
 #define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \
     ((_num) * sizeof(struct rps_dev_flow)))
 
+struct rps_sock_masks {
+	u32 mask;	/* extracts the CPU number or queue index */
+	u32 hash_mask;	/* extracts the flow hash bits */
+};
+
 /*
- * The rps_sock_flow_table contains mappings of flows to the last CPU
- * on which they were processed by the application (set in recvmsg).
- * Each entry is a 32bit value. Upper part is the high-order bits
- * of flow hash, lower part is CPU number.
- * rps_cpu_mask is used to partition the space, depending on number of
- * possible CPUs : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1
- * For example, if 64 CPUs are possible, rps_cpu_mask = 0x3f,
- * meaning we use 32-6=26 bits for the hash.
+ * The rps_sock_flow_table contains mappings of flows to the last CPU on which
+ * they were processed by the application (set in recvmsg), or the mapping of
+ * the flow to a per thread queue for the application. Each entry is a 32bit
+ * value. The high order bit indicates whether a CPU number or a queue index is
+ * stored. The next high-order bits contain the flow hash, and the lower bits
+ * contain the CPU number or queue index. The sock_flow table contains two
+ * sets of masks, one for CPU entries (cpu_masks) and one for queue entries
+ * (queue_masks), that are used to partition the space between the hash bits
+ * and the CPU number or queue index. For the cpu masks, cpu_masks.mask is set
+ * to roundup_pow_of_two(nr_cpu_ids) - 1 and the corresponding hash mask,
+ * cpu_masks.hash_mask, is set to (~cpu_masks.mask & ~RPS_SOCK_FLOW_USE_QID).
+ * For example, if 64 CPUs are possible, cpu_masks.mask == 0x3f, meaning we use
+ * 31-6=25 bits for the hash (so cpu_masks.hash_mask == 0x7fffffc0). Similarly,
+ * queue_masks in rps_sock_flow_table is used to partition the space when a
+ * queue index is present.
  */
 struct rps_sock_flow_table {
 	u32	mask;
+	struct	rps_sock_masks cpu_masks;	/* partition for CPU entries */
+	struct	rps_sock_masks queue_masks;	/* partition for queue entries */
 
 	u32	ents[] ____cacheline_aligned_in_smp;
 };
 #define	RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num]))
 
-#define RPS_NO_CPU 0xffff
+#define RPS_SOCK_FLOW_USE_QID	(1U << 31)	/* entry holds a queue index */
+#define RPS_SOCK_FLOW_NO_IDENT	(-1U)
 
-extern u32 rps_cpu_mask;
 extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;
+extern unsigned int rps_max_num_queues;
+
+static inline void rps_init_sock_masks(struct rps_sock_masks *masks, u32 num)
+{
+	/* roundup_pow_of_two(0) is undefined, so map num == 0 to an empty mask */
+	masks->mask = num ? roundup_pow_of_two(num) - 1 : 0;
+	/* Hash bits: all bits outside the ident mask and the USE_QID flag */
+	masks->hash_mask = (~masks->mask & ~RPS_SOCK_FLOW_USE_QID);
+}
 
 static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
 					u32 hash)
 {
 	if (table && hash) {
+		u32 val = hash & table->cpu_masks.hash_mask;
 		unsigned int index = hash & table->mask;
-		u32 val = hash & ~rps_cpu_mask;
 
 		/* We only give a hint, preemption can change CPU under us */
 		val |= raw_smp_processor_id();
diff --git a/net/core/dev.c b/net/core/dev.c
index 9f7a3e78e23a..946940bdd583 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4242,8 +4242,7 @@  static inline void ____napi_schedule(struct softnet_data *sd,
 /* One global table that all flow-based protocols share. */
 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
 EXPORT_SYMBOL(rps_sock_flow_table);
-u32 rps_cpu_mask __read_mostly;
-EXPORT_SYMBOL(rps_cpu_mask);
+unsigned int rps_max_num_queues;
 
 struct static_key_false rps_needed __read_mostly;
 EXPORT_SYMBOL(rps_needed);
@@ -4302,7 +4301,7 @@  set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 			per_cpu(softnet_data, next_cpu).input_queue_head;
 	}
 
-	rflow->cpu = next_cpu;
+	rps_dev_flow_set_cpu(rflow, next_cpu);
 	return rflow;
 }
 
@@ -4349,22 +4348,39 @@  static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 
 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
 	if (flow_table && sock_flow_table) {
+		u32 next_cpu, comparator, ident;
 		struct rps_dev_flow *rflow;
-		u32 next_cpu;
-		u32 ident;
 
 		/* First check into global flow table if there is a match */
 		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
-		if ((ident ^ hash) & ~rps_cpu_mask)
-			goto try_rps;
+		comparator = ((ident & RPS_SOCK_FLOW_USE_QID) ?
+				sock_flow_table->queue_masks.hash_mask :
+				sock_flow_table->cpu_masks.hash_mask);
 
-		next_cpu = ident & rps_cpu_mask;
+		if ((ident ^ hash) & comparator)
+			goto try_rps;
 
 		/* OK, now we know there is a match,
 		 * we can look at the local (per receive queue) flow table
 		 */
 		rflow = &flow_table->flows[hash & flow_table->mask];
-		tcpu = rflow->cpu;
+
+		/* The flow_sock entry may refer to either a queue or a
+		 * CPU. Proceed accordingly.
+		 */
+		if (ident & RPS_SOCK_FLOW_USE_QID) {
+			/* A queue identifier is in the sock_flow_table entry */
+
+			/* Don't use aRFS to set CPU in this case, skip to
+			 * trying RPS
+			 */
+			goto try_rps;
+		}
+
+		/* A CPU number is in the sock_flow_table entry */
+
+		next_cpu = ident & sock_flow_table->cpu_masks.mask;
+		tcpu = rflow->cpu_qid.use_qid ? NO_QUEUE : rflow->cpu_qid.cpu;
 
 		/*
 		 * If the desired CPU (where last recvmsg was done) is
@@ -4396,10 +4412,8 @@  static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 
 	if (map) {
 		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
-		if (cpu_online(tcpu)) {
+		if (cpu_online(tcpu))
 			cpu = tcpu;
-			goto done;
-		}
 	}
 
 done:
@@ -4424,17 +4438,18 @@  bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
 {
 	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
 	struct rps_dev_flow_table *flow_table;
+	struct rps_cpu_qid cpu_qid;
 	struct rps_dev_flow *rflow;
 	bool expire = true;
-	unsigned int cpu;
 
 	rcu_read_lock();
 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 	if (flow_table && flow_id <= flow_table->mask) {
 		rflow = &flow_table->flows[flow_id];
-		cpu = READ_ONCE(rflow->cpu);
-		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
-		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
+		cpu_qid = READ_ONCE(rflow->cpu_qid);
+		if (rflow->filter == filter_id && !cpu_qid.use_qid &&
+		    cpu_qid.cpu < nr_cpu_ids &&
+		    ((int)(per_cpu(softnet_data, cpu_qid.cpu).input_queue_head -
 			   rflow->last_qtail) <
 		     (int)(10 * flow_table->mask)))
 			expire = false;
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index e353b822bb15..56d27463d466 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -858,7 +858,7 @@  static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
 
 		table->mask = mask;
 		for (count = 0; count <= mask; count++)
-			table->flows[count].cpu = RPS_NO_CPU;
+			rps_dev_flow_clear(&table->flows[count]);
 	} else {
 		table = NULL;
 	}
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 9c7d46fbb75a..d09471f29d89 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -65,12 +65,16 @@  static int rps_create_sock_flow_table(size_t size, size_t orig_size,
 				return -ENOMEM;
 
 			sock_table->mask = size - 1;
+			rps_init_sock_masks(&sock_table->cpu_masks,
+					    nr_cpu_ids);
+			rps_init_sock_masks(&sock_table->queue_masks,
+					    rps_max_num_queues);
 		} else {
 			sock_table = orig_table;
 		}
 
 		for (i = 0; i < size; i++)
-			sock_table->ents[i] = RPS_NO_CPU;
+			sock_table->ents[i] = RPS_NO_CPU_QID;
 	} else {
 		sock_table = NULL;
 	}