Message ID | 20210509151615.200608-3-idosch@idosch.org |
---|---|
State | New |
Headers | show |
Series | Add support for custom multipath hash | expand |
On 5/9/21 9:16 AM, Ido Schimmel wrote: > diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst > index c2ecc9894fd0..15982f830abc 100644 > --- a/Documentation/networking/ip-sysctl.rst > +++ b/Documentation/networking/ip-sysctl.rst > @@ -100,6 +100,33 @@ fib_multipath_hash_policy - INTEGER > - 1 - Layer 4 > - 2 - Layer 3 or inner Layer 3 if present > > +fib_multipath_hash_fields - UNSIGNED INTEGER > + When fib_multipath_hash_policy is set to 3 (custom multipath hash), the > + fields used for multipath hash calculation are determined by this > + sysctl. > + > + This value is a bitmask which enables various fields for multipath hash > + calculation. > + > + Possible fields are: > + > + ====== ============================ > + 0x0001 Source IP address > + 0x0002 Destination IP address > + 0x0004 IP protocol > + 0x0008 Unused Document that this bit is flowlabel for IPv6 and ignored for ipv4.
On 5/9/21 9:16 AM, Ido Schimmel wrote: > diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c > index a62934b9f15a..da627c4d633a 100644 > --- a/net/ipv4/sysctl_net_ipv4.c > +++ b/net/ipv4/sysctl_net_ipv4.c > @@ -19,6 +19,7 @@ > #include <net/snmp.h> > #include <net/icmp.h> > #include <net/ip.h> > +#include <net/ip_fib.h> > #include <net/route.h> > #include <net/tcp.h> > #include <net/udp.h> > @@ -48,6 +49,8 @@ static int ip_ping_group_range_min[] = { 0, 0 }; > static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; > static u32 u32_max_div_HZ = UINT_MAX / HZ; > static int one_day_secs = 24 * 3600; > +static u32 fib_multipath_hash_fields_all_mask __maybe_unused = > + FIB_MULTIPATH_HASH_FIELD_ALL_MASK; > > /* obsolete */ > static int sysctl_tcp_low_latency __read_mostly; > @@ -1052,6 +1055,14 @@ static struct ctl_table ipv4_net_table[] = { > .extra1 = SYSCTL_ZERO, > .extra2 = &two, > }, > + { > + .procname = "fib_multipath_hash_fields", > + .data = &init_net.ipv4.sysctl_fib_multipath_hash_fields, > + .maxlen = sizeof(u32), > + .mode = 0644, > + .proc_handler = proc_douintvec_minmax, > + .extra2 = &fib_multipath_hash_fields_all_mask, no .extra1 means 0 is allowed which effectively disables hashing and multipath selection; only the first leg will be used. Is that intended? > + }, > #endif > { > .procname = "ip_unprivileged_port_start", >
On Tue, May 11, 2021 at 09:10:46AM -0600, David Ahern wrote: > On 5/9/21 9:16 AM, Ido Schimmel wrote: > > diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst > > index c2ecc9894fd0..15982f830abc 100644 > > --- a/Documentation/networking/ip-sysctl.rst > > +++ b/Documentation/networking/ip-sysctl.rst > > @@ -100,6 +100,33 @@ fib_multipath_hash_policy - INTEGER > > - 1 - Layer 4 > > - 2 - Layer 3 or inner Layer 3 if present > > > > +fib_multipath_hash_fields - UNSIGNED INTEGER > > + When fib_multipath_hash_policy is set to 3 (custom multipath hash), the > > + fields used for multipath hash calculation are determined by this > > + sysctl. > > + > > + This value is a bitmask which enables various fields for multipath hash > > + calculation. > > + > > + Possible fields are: > > + > > + ====== ============================ > > + 0x0001 Source IP address > > + 0x0002 Destination IP address > > + 0x0004 IP protocol > > + 0x0008 Unused > > Document that this bit is flowlabel for IPv6 and ignored for ipv4. OK
On Tue, May 11, 2021 at 09:49:30AM -0600, David Ahern wrote: > On 5/9/21 9:16 AM, Ido Schimmel wrote: > > diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c > > index a62934b9f15a..da627c4d633a 100644 > > --- a/net/ipv4/sysctl_net_ipv4.c > > +++ b/net/ipv4/sysctl_net_ipv4.c > > @@ -19,6 +19,7 @@ > > #include <net/snmp.h> > > #include <net/icmp.h> > > #include <net/ip.h> > > +#include <net/ip_fib.h> > > #include <net/route.h> > > #include <net/tcp.h> > > #include <net/udp.h> > > @@ -48,6 +49,8 @@ static int ip_ping_group_range_min[] = { 0, 0 }; > > static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; > > static u32 u32_max_div_HZ = UINT_MAX / HZ; > > static int one_day_secs = 24 * 3600; > > +static u32 fib_multipath_hash_fields_all_mask __maybe_unused = > > + FIB_MULTIPATH_HASH_FIELD_ALL_MASK; > > > > /* obsolete */ > > static int sysctl_tcp_low_latency __read_mostly; > > @@ -1052,6 +1055,14 @@ static struct ctl_table ipv4_net_table[] = { > > .extra1 = SYSCTL_ZERO, > > .extra2 = &two, > > }, > > + { > > + .procname = "fib_multipath_hash_fields", > > + .data = &init_net.ipv4.sysctl_fib_multipath_hash_fields, > > + .maxlen = sizeof(u32), > > + .mode = 0644, > > + .proc_handler = proc_douintvec_minmax, > > + .extra2 = &fib_multipath_hash_fields_all_mask, > > no .extra1 means 0 is allowed which effectively disables hashing and > multipath selection; only the first leg will be used. Is that intended? I didn't see any reason to forbid it, but I don't see any reason to use it with '0' either. With this patch: diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 81343037de06..4fa77f182dcb 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1078,6 +1078,7 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(u32), .mode = 0644, .proc_handler = proc_fib_multipath_hash_fields, + .extra1 = SYSCTL_ONE, .extra2 = &fib_multipath_hash_fields_all_mask, }, #endif We get: # sysctl -w net.ipv4.fib_multipath_hash_fields=0 sysctl: setting key "net.ipv4.fib_multipath_hash_fields": Invalid argument I assume you want to see this change in the next version (and for IPv6)? > > > > > + }, > > #endif > > { > > .procname = "ip_unprivileged_port_start", > > >
On 5/11/21 2:05 PM, Ido Schimmel wrote: > > # sysctl -w net.ipv4.fib_multipath_hash_fields=0 > sysctl: setting key "net.ipv4.fib_multipath_hash_fields": Invalid argument > > I assume you want to see this change in the next version (and for IPv6)? > Yes, I do not know of any reason to allow the hash policy to disable multipath routes.
diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index c2ecc9894fd0..15982f830abc 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -100,6 +100,33 @@ fib_multipath_hash_policy - INTEGER - 1 - Layer 4 - 2 - Layer 3 or inner Layer 3 if present +fib_multipath_hash_fields - UNSIGNED INTEGER + When fib_multipath_hash_policy is set to 3 (custom multipath hash), the + fields used for multipath hash calculation are determined by this + sysctl. + + This value is a bitmask which enables various fields for multipath hash + calculation. + + Possible fields are: + + ====== ============================ + 0x0001 Source IP address + 0x0002 Destination IP address + 0x0004 IP protocol + 0x0008 Unused + 0x0010 Source port + 0x0020 Destination port + 0x0040 Inner source IP address + 0x0080 Inner destination IP address + 0x0100 Inner IP protocol + 0x0200 Inner Flow Label + 0x0400 Inner source port + 0x0800 Inner destination port + ====== ============================ + + Default: 0x0007 (source IP, destination IP and IP protocol) + fib_sync_mem - UNSIGNED INTEGER Amount of dirty memory from fib entries that can be backlogged before synchronize_rcu is forced. diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index a914f33f3ed5..3ab2563b1a23 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -466,6 +466,49 @@ int fib_sync_up(struct net_device *dev, unsigned char nh_flags); void fib_sync_mtu(struct net_device *dev, u32 orig_mtu); void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig); +/* Fields used for sysctl_fib_multipath_hash_fields. + * Common to IPv4 and IPv6. + * + * Add new fields at the end. This is user API. + */ +#define FIB_MULTIPATH_HASH_FIELD_SRC_IP BIT(0) +#define FIB_MULTIPATH_HASH_FIELD_DST_IP BIT(1) +#define FIB_MULTIPATH_HASH_FIELD_IP_PROTO BIT(2) +#define FIB_MULTIPATH_HASH_FIELD_FLOWLABEL BIT(3) +#define FIB_MULTIPATH_HASH_FIELD_SRC_PORT BIT(4) +#define FIB_MULTIPATH_HASH_FIELD_DST_PORT BIT(5) +#define FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP BIT(6) +#define FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP BIT(7) +#define FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO BIT(8) +#define FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL BIT(9) +#define FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT BIT(10) +#define FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT BIT(11) + +#define FIB_MULTIPATH_HASH_FIELD_OUTER_MASK \ + (FIB_MULTIPATH_HASH_FIELD_SRC_IP | \ + FIB_MULTIPATH_HASH_FIELD_DST_IP | \ + FIB_MULTIPATH_HASH_FIELD_IP_PROTO | \ + FIB_MULTIPATH_HASH_FIELD_FLOWLABEL | \ + FIB_MULTIPATH_HASH_FIELD_SRC_PORT | \ + FIB_MULTIPATH_HASH_FIELD_DST_PORT) + +#define FIB_MULTIPATH_HASH_FIELD_INNER_MASK \ + (FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP | \ + FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP | \ + FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO | \ + FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL | \ + FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT | \ + FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) + +#define FIB_MULTIPATH_HASH_FIELD_ALL_MASK \ + (FIB_MULTIPATH_HASH_FIELD_OUTER_MASK | \ + FIB_MULTIPATH_HASH_FIELD_INNER_MASK) + +#define FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK \ + (FIB_MULTIPATH_HASH_FIELD_SRC_IP | \ + FIB_MULTIPATH_HASH_FIELD_DST_IP | \ + FIB_MULTIPATH_HASH_FIELD_IP_PROTO) + #ifdef CONFIG_IP_ROUTE_MULTIPATH int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, const struct sk_buff *skb, struct flow_keys *flkeys); diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index f6af8d96d3c6..746c80cd4257 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -210,6 +210,7 @@ struct netns_ipv4 { #endif #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH + u32 sysctl_fib_multipath_hash_fields; u8 sysctl_fib_multipath_use_neigh; u8 sysctl_fib_multipath_hash_policy; #endif diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 84bb707bd88d..129213b7d834 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1516,6 +1516,12 @@ static int __net_init ip_fib_net_init(struct net *net) if (err) return err; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + /* Default to 3-tuple */ + net->ipv4.sysctl_fib_multipath_hash_fields = + FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK; +#endif + /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a62934b9f15a..da627c4d633a 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -19,6 +19,7 @@ #include <net/snmp.h> #include <net/icmp.h> #include <net/ip.h> +#include <net/ip_fib.h> #include <net/route.h> #include <net/tcp.h> #include <net/udp.h> @@ -48,6 +49,8 @@ static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; static u32 u32_max_div_HZ = UINT_MAX / HZ; static int one_day_secs = 24 * 3600; +static u32 fib_multipath_hash_fields_all_mask __maybe_unused = + FIB_MULTIPATH_HASH_FIELD_ALL_MASK; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -1052,6 +1055,14 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &two, }, + { + .procname = "fib_multipath_hash_fields", + .data = &init_net.ipv4.sysctl_fib_multipath_hash_fields, + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra2 = &fib_multipath_hash_fields_all_mask, + }, #endif { .procname = "ip_unprivileged_port_start",