diff mbox series

[RFC,net-next,v2,02/10] ipv4: Add a sysctl to control multipath hash fields

Message ID 20210509151615.200608-3-idosch@idosch.org
State New
Headers show
Series Add support for custom multipath hash | expand

Commit Message

Ido Schimmel May 9, 2021, 3:16 p.m. UTC
From: Ido Schimmel <idosch@nvidia.com>

A subsequent patch will add a new multipath hash policy where the packet
fields used for multipath hash calculation are determined by user space.
This patch adds a sysctl that allows user space to set these fields.

The packet fields are represented using a bitmask and are common between
IPv4 and IPv6 to allow user space to use the same numbering across both
protocols. For example, to hash based on standard 5-tuple:

 # sysctl -w net.ipv4.fib_multipath_hash_fields=0x0037
 net.ipv4.fib_multipath_hash_fields = 0x0037

The kernel rejects unknown fields, for example:

 # sysctl -w net.ipv4.fib_multipath_hash_fields=0x1000
 sysctl: setting key "net.ipv4.fib_multipath_hash_fields": Invalid argument

More fields can be added in the future, if needed.

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
---
 Documentation/networking/ip-sysctl.rst | 27 ++++++++++++++++
 include/net/ip_fib.h                   | 43 ++++++++++++++++++++++++++
 include/net/netns/ipv4.h               |  1 +
 net/ipv4/fib_frontend.c                |  6 ++++
 net/ipv4/sysctl_net_ipv4.c             | 11 +++++++
 5 files changed, 88 insertions(+)

Comments

David Ahern May 11, 2021, 3:10 p.m. UTC | #1
On 5/9/21 9:16 AM, Ido Schimmel wrote:
> diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst

> index c2ecc9894fd0..15982f830abc 100644

> --- a/Documentation/networking/ip-sysctl.rst

> +++ b/Documentation/networking/ip-sysctl.rst

> @@ -100,6 +100,33 @@ fib_multipath_hash_policy - INTEGER

>  	- 1 - Layer 4

>  	- 2 - Layer 3 or inner Layer 3 if present

>  

> +fib_multipath_hash_fields - UNSIGNED INTEGER

> +	When fib_multipath_hash_policy is set to 3 (custom multipath hash), the

> +	fields used for multipath hash calculation are determined by this

> +	sysctl.

> +

> +	This value is a bitmask which enables various fields for multipath hash

> +	calculation.

> +

> +	Possible fields are:

> +

> +	====== ============================

> +	0x0001 Source IP address

> +	0x0002 Destination IP address

> +	0x0004 IP protocol

> +	0x0008 Unused


Document that this bit is flowlabel for IPv6 and ignored for ipv4.
David Ahern May 11, 2021, 3:49 p.m. UTC | #2
On 5/9/21 9:16 AM, Ido Schimmel wrote:
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c

> index a62934b9f15a..da627c4d633a 100644

> --- a/net/ipv4/sysctl_net_ipv4.c

> +++ b/net/ipv4/sysctl_net_ipv4.c

> @@ -19,6 +19,7 @@

>  #include <net/snmp.h>

>  #include <net/icmp.h>

>  #include <net/ip.h>

> +#include <net/ip_fib.h>

>  #include <net/route.h>

>  #include <net/tcp.h>

>  #include <net/udp.h>

> @@ -48,6 +49,8 @@ static int ip_ping_group_range_min[] = { 0, 0 };

>  static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };

>  static u32 u32_max_div_HZ = UINT_MAX / HZ;

>  static int one_day_secs = 24 * 3600;

> +static u32 fib_multipath_hash_fields_all_mask __maybe_unused =

> +	FIB_MULTIPATH_HASH_FIELD_ALL_MASK;

>  

>  /* obsolete */

>  static int sysctl_tcp_low_latency __read_mostly;

> @@ -1052,6 +1055,14 @@ static struct ctl_table ipv4_net_table[] = {

>  		.extra1		= SYSCTL_ZERO,

>  		.extra2		= &two,

>  	},

> +	{

> +		.procname	= "fib_multipath_hash_fields",

> +		.data		= &init_net.ipv4.sysctl_fib_multipath_hash_fields,

> +		.maxlen		= sizeof(u32),

> +		.mode		= 0644,

> +		.proc_handler	= proc_douintvec_minmax,

> +		.extra2		= &fib_multipath_hash_fields_all_mask,


no .extra1 means 0 is allowed which effectively disables hashing and
multipath selection; only the first leg will be used. Is that intended?



> +	},

>  #endif

>  	{

>  		.procname	= "ip_unprivileged_port_start",

>
Ido Schimmel May 11, 2021, 7:58 p.m. UTC | #3
On Tue, May 11, 2021 at 09:10:46AM -0600, David Ahern wrote:
> On 5/9/21 9:16 AM, Ido Schimmel wrote:

> > diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst

> > index c2ecc9894fd0..15982f830abc 100644

> > --- a/Documentation/networking/ip-sysctl.rst

> > +++ b/Documentation/networking/ip-sysctl.rst

> > @@ -100,6 +100,33 @@ fib_multipath_hash_policy - INTEGER

> >  	- 1 - Layer 4

> >  	- 2 - Layer 3 or inner Layer 3 if present

> >  

> > +fib_multipath_hash_fields - UNSIGNED INTEGER

> > +	When fib_multipath_hash_policy is set to 3 (custom multipath hash), the

> > +	fields used for multipath hash calculation are determined by this

> > +	sysctl.

> > +

> > +	This value is a bitmask which enables various fields for multipath hash

> > +	calculation.

> > +

> > +	Possible fields are:

> > +

> > +	====== ============================

> > +	0x0001 Source IP address

> > +	0x0002 Destination IP address

> > +	0x0004 IP protocol

> > +	0x0008 Unused

> 

> Document that this bit is flowlabel for IPv6 and ignored for ipv4.


OK
Ido Schimmel May 11, 2021, 8:05 p.m. UTC | #4
On Tue, May 11, 2021 at 09:49:30AM -0600, David Ahern wrote:
> On 5/9/21 9:16 AM, Ido Schimmel wrote:

> > diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c

> > index a62934b9f15a..da627c4d633a 100644

> > --- a/net/ipv4/sysctl_net_ipv4.c

> > +++ b/net/ipv4/sysctl_net_ipv4.c

> > @@ -19,6 +19,7 @@

> >  #include <net/snmp.h>

> >  #include <net/icmp.h>

> >  #include <net/ip.h>

> > +#include <net/ip_fib.h>

> >  #include <net/route.h>

> >  #include <net/tcp.h>

> >  #include <net/udp.h>

> > @@ -48,6 +49,8 @@ static int ip_ping_group_range_min[] = { 0, 0 };

> >  static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };

> >  static u32 u32_max_div_HZ = UINT_MAX / HZ;

> >  static int one_day_secs = 24 * 3600;

> > +static u32 fib_multipath_hash_fields_all_mask __maybe_unused =

> > +	FIB_MULTIPATH_HASH_FIELD_ALL_MASK;

> >  

> >  /* obsolete */

> >  static int sysctl_tcp_low_latency __read_mostly;

> > @@ -1052,6 +1055,14 @@ static struct ctl_table ipv4_net_table[] = {

> >  		.extra1		= SYSCTL_ZERO,

> >  		.extra2		= &two,

> >  	},

> > +	{

> > +		.procname	= "fib_multipath_hash_fields",

> > +		.data		= &init_net.ipv4.sysctl_fib_multipath_hash_fields,

> > +		.maxlen		= sizeof(u32),

> > +		.mode		= 0644,

> > +		.proc_handler	= proc_douintvec_minmax,

> > +		.extra2		= &fib_multipath_hash_fields_all_mask,

> 

> no .extra1 means 0 is allowed which effectively disables hashing and

> multipath selection; only the first leg will be used. Is that intended?


I didn't see any reason to forbid it, but I don't see any reason to use
it with '0' either. With this patch:

diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 81343037de06..4fa77f182dcb 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1078,6 +1078,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(u32),
 		.mode		= 0644,
 		.proc_handler	= proc_fib_multipath_hash_fields,
+		.extra1		= SYSCTL_ONE,
 		.extra2		= &fib_multipath_hash_fields_all_mask,
 	},
 #endif

We get:

# sysctl -w net.ipv4.fib_multipath_hash_fields=0
sysctl: setting key "net.ipv4.fib_multipath_hash_fields": Invalid argument

I assume you want to see this change in the next version (and for IPv6)?

> 

> 

> 

> > +	},

> >  #endif

> >  	{

> >  		.procname	= "ip_unprivileged_port_start",

> > 

>
David Ahern May 17, 2021, 2:47 p.m. UTC | #5
On 5/11/21 2:05 PM, Ido Schimmel wrote:
> 

> # sysctl -w net.ipv4.fib_multipath_hash_fields=0

> sysctl: setting key "net.ipv4.fib_multipath_hash_fields": Invalid argument

> 

> I assume you want to see this change in the next version (and for IPv6)?

> 


Yes, I do not know of any reason to allow the hash policy to disable
multipath routes.
diff mbox series

Patch

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index c2ecc9894fd0..15982f830abc 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -100,6 +100,33 @@  fib_multipath_hash_policy - INTEGER
 	- 1 - Layer 4
 	- 2 - Layer 3 or inner Layer 3 if present
 
+fib_multipath_hash_fields - UNSIGNED INTEGER
+	When fib_multipath_hash_policy is set to 3 (custom multipath hash), the
+	fields used for multipath hash calculation are determined by this
+	sysctl.
+
+	This value is a bitmask which enables various fields for multipath hash
+	calculation.
+
+	Possible fields are:
+
+	====== ============================
+	0x0001 Source IP address
+	0x0002 Destination IP address
+	0x0004 IP protocol
+	0x0008 Unused
+	0x0010 Source port
+	0x0020 Destination port
+	0x0040 Inner source IP address
+	0x0080 Inner destination IP address
+	0x0100 Inner IP protocol
+	0x0200 Inner Flow Label
+	0x0400 Inner source port
+	0x0800 Inner destination port
+	====== ============================
+
+	Default: 0x0007 (source IP, destination IP and IP protocol)
+
 fib_sync_mem - UNSIGNED INTEGER
 	Amount of dirty memory from fib entries that can be backlogged before
 	synchronize_rcu is forced.
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index a914f33f3ed5..3ab2563b1a23 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -466,6 +466,49 @@  int fib_sync_up(struct net_device *dev, unsigned char nh_flags);
 void fib_sync_mtu(struct net_device *dev, u32 orig_mtu);
 void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig);
 
+/* Fields used for sysctl_fib_multipath_hash_fields.
+ * Common to IPv4 and IPv6.
+ *
+ * Add new fields at the end. This is user API.
+ */
+#define FIB_MULTIPATH_HASH_FIELD_SRC_IP			BIT(0)
+#define FIB_MULTIPATH_HASH_FIELD_DST_IP			BIT(1)
+#define FIB_MULTIPATH_HASH_FIELD_IP_PROTO		BIT(2)
+#define FIB_MULTIPATH_HASH_FIELD_FLOWLABEL		BIT(3)
+#define FIB_MULTIPATH_HASH_FIELD_SRC_PORT		BIT(4)
+#define FIB_MULTIPATH_HASH_FIELD_DST_PORT		BIT(5)
+#define FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP		BIT(6)
+#define FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP		BIT(7)
+#define FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO		BIT(8)
+#define FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL	BIT(9)
+#define FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT		BIT(10)
+#define FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT		BIT(11)
+
+#define FIB_MULTIPATH_HASH_FIELD_OUTER_MASK		\
+	(FIB_MULTIPATH_HASH_FIELD_SRC_IP |		\
+	 FIB_MULTIPATH_HASH_FIELD_DST_IP |		\
+	 FIB_MULTIPATH_HASH_FIELD_IP_PROTO |		\
+	 FIB_MULTIPATH_HASH_FIELD_FLOWLABEL |		\
+	 FIB_MULTIPATH_HASH_FIELD_SRC_PORT |		\
+	 FIB_MULTIPATH_HASH_FIELD_DST_PORT)
+
+#define FIB_MULTIPATH_HASH_FIELD_INNER_MASK		\
+	(FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP |	\
+	 FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP |	\
+	 FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO |	\
+	 FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL |	\
+	 FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT |	\
+	 FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
+
+#define FIB_MULTIPATH_HASH_FIELD_ALL_MASK		\
+	(FIB_MULTIPATH_HASH_FIELD_OUTER_MASK |		\
+	 FIB_MULTIPATH_HASH_FIELD_INNER_MASK)
+
+#define FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK		\
+	(FIB_MULTIPATH_HASH_FIELD_SRC_IP |		\
+	 FIB_MULTIPATH_HASH_FIELD_DST_IP |		\
+	 FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
+
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
 		       const struct sk_buff *skb, struct flow_keys *flkeys);
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index f6af8d96d3c6..746c80cd4257 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -210,6 +210,7 @@  struct netns_ipv4 {
 #endif
 #endif
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
+	u32 sysctl_fib_multipath_hash_fields;
 	u8 sysctl_fib_multipath_use_neigh;
 	u8 sysctl_fib_multipath_hash_policy;
 #endif
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 84bb707bd88d..129213b7d834 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1516,6 +1516,12 @@  static int __net_init ip_fib_net_init(struct net *net)
 	if (err)
 		return err;
 
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	/* Default to 3-tuple */
+	net->ipv4.sysctl_fib_multipath_hash_fields =
+		FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK;
+#endif
+
 	/* Avoid false sharing : Use at least a full cache line */
 	size = max_t(size_t, size, L1_CACHE_BYTES);
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index a62934b9f15a..da627c4d633a 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -19,6 +19,7 @@ 
 #include <net/snmp.h>
 #include <net/icmp.h>
 #include <net/ip.h>
+#include <net/ip_fib.h>
 #include <net/route.h>
 #include <net/tcp.h>
 #include <net/udp.h>
@@ -48,6 +49,8 @@  static int ip_ping_group_range_min[] = { 0, 0 };
 static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
 static u32 u32_max_div_HZ = UINT_MAX / HZ;
 static int one_day_secs = 24 * 3600;
+static u32 fib_multipath_hash_fields_all_mask __maybe_unused =
+	FIB_MULTIPATH_HASH_FIELD_ALL_MASK;
 
 /* obsolete */
 static int sysctl_tcp_low_latency __read_mostly;
@@ -1052,6 +1055,14 @@  static struct ctl_table ipv4_net_table[] = {
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 	},
+	{
+		.procname	= "fib_multipath_hash_fields",
+		.data		= &init_net.ipv4.sysctl_fib_multipath_hash_fields,
+		.maxlen		= sizeof(u32),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec_minmax,
+		.extra2		= &fib_multipath_hash_fields_all_mask,
+	},
 #endif
 	{
 		.procname	= "ip_unprivileged_port_start",