diff mbox series

[nf-next,v4,4/5] netfilter: Introduce egress hook

Message ID 979835dc887d3affc4e76464aa21da0e298fd638.1611304190.git.lukas@wunner.de
State New
Headers show
Series Netfilter egress hook | expand

Commit Message

Lukas Wunner Jan. 22, 2021, 8:47 a.m. UTC
Commit e687ad60af09 ("netfilter: add netfilter ingress hook after
handle_ing() under unique static key") introduced the ability to
classify packets with netfilter on ingress.

Support the same on egress to satisfy user requirements such as:
* outbound security policies for containers (Laura)
* filtering and mangling intra-node Direct Server Return (DSR) traffic
  on a load balancer (Laura)
* filtering locally generated traffic coming in through AF_PACKET,
  such as local ARP traffic generated for clustering purposes or DHCP
  (Laura; the AF_PACKET plumbing is contained in a separate commit)
* L2 filtering from ingress and egress for AVB (Audio Video Bridging)
  and gPTP with nftables (Pablo)
* in the future: in-kernel NAT64/NAT46 (Pablo)

A patch for nftables to hook up egress rules from user space has been
submitted separately, so users may immediately take advantage of the
feature.

The hook is positioned after packet handling by traffic control.
Thus, if packets are redirected into and out of containers with tc,
the data path is:
ingress: host tc -> container tc -> container nft
egress:  container tc -> host tc -> host nft

This was done to address an objection from Daniel Borkmann:  If desired,
nft does not get into tc's way performance-wise.  The host is able to
firewall malicious packets coming out of a container, but only after tc
has done its duty.  An implication is that tc may set skb->mark on
egress for nft to act on it, but not the other way round.

If egress netfilter handling is not enabled on any interface, it is
patched out of the data path by way of a static_key and doesn't make a
performance difference that is discernible from noise:

Before:             2076 2076 2076 2077 2077 2074 Mb/sec
After:              2080 2078 2078 2079 2079 2077 Mb/sec
Before + tc accept: 1877 1875 1878 1874 1882 1873 Mb/sec
After  + tc accept: 1874 1877 1881 1875 1876 1876 Mb/sec
Before + tc drop:   2167 2182 2168 2171 2164 2171 Mb/sec
After  + tc drop:   2176 2180 2184 2178 2178 2180 Mb/sec

Measurements were performed on a Core i7-3615QM.  Commands to reproduce:
ip link add dev foo type dummy
ip link set dev foo up
modprobe pktgen
echo "add_device foo" > /proc/net/pktgen/kpktgend_3
samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh -i foo -n 400000000 -m "11:11:11:11:11:11" -d 1.1.1.1

Accept all traffic with tc:
tc qdisc add dev foo clsact
tc filter add dev foo egress bpf da bytecode '1,6 0 0 0,'

Drop all traffic with tc:
tc qdisc add dev foo clsact
tc filter add dev foo egress bpf da bytecode '1,6 0 0 2,'

Apply this patch when measuring packet drops to avoid errors in dmesg:
https://lore.kernel.org/netdev/a73dda33-57f4-95d8-ea51-ed483abd6a7a@iogearbox.net/

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Lukas Wunner <lukas@wunner.de>
Cc: Laura García Liébana <nevola@gmail.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Thomas Graf <tgraf@suug.ch>
---
 include/linux/netdevice.h        |  4 +++
 include/linux/netfilter_netdev.h | 52 ++++++++++++++++++++++++++++++++
 include/uapi/linux/netfilter.h   |  1 +
 net/core/dev.c                   | 11 +++++--
 net/netfilter/Kconfig            |  8 +++++
 net/netfilter/core.c             | 34 +++++++++++++++++++--
 net/netfilter/nft_chain_filter.c |  4 ++-
 7 files changed, 108 insertions(+), 6 deletions(-)

Comments

Daniel Borkmann Jan. 26, 2021, 7:13 p.m. UTC | #1
On 1/22/21 9:47 AM, Lukas Wunner wrote:
> Commit e687ad60af09 ("netfilter: add netfilter ingress hook after

> handle_ing() under unique static key") introduced the ability to

> classify packets with netfilter on ingress.

> 

> Support the same on egress to satisfy user requirements such as:

> * outbound security policies for containers (Laura)

> * filtering and mangling intra-node Direct Server Return (DSR) traffic

>    on a load balancer (Laura)

> * filtering locally generated traffic coming in through AF_PACKET,

>    such as local ARP traffic generated for clustering purposes or DHCP

>    (Laura; the AF_PACKET plumbing is contained in a separate commit)

> * L2 filtering from ingress and egress for AVB (Audio Video Bridging)

>    and gPTP with nftables (Pablo)

> * in the future: in-kernel NAT64/NAT46 (Pablo)

> 

> A patch for nftables to hook up egress rules from user space has been

> submitted separately, so users may immediately take advantage of the

> feature.

> 

> The hook is positioned after packet handling by traffic control.

> Thus, if packets are redirected into and out of containers with tc,

> the data path is:

> ingress: host tc -> container tc -> container nft

> egress:  container tc -> host tc -> host nft

> 

> This was done to address an objection from Daniel Borkmann:  If desired,

> nft does not get into tc's way performance-wise.  The host is able to

> firewall malicious packets coming out of a container, but only after tc

> has done its duty.  An implication is that tc may set skb->mark on

> egress for nft to act on it, but not the other way round.

> 

> If egress netfilter handling is not enabled on any interface, it is

> patched out of the data path by way of a static_key and doesn't make a

> performance difference that is discernible from noise:

> 

[...]
> @@ -4096,13 +4098,18 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)

>   	qdisc_pkt_len_init(skb);

>   #ifdef CONFIG_NET_CLS_ACT

>   	skb->tc_at_ingress = 0;

> -# ifdef CONFIG_NET_EGRESS

> +#endif

> +#ifdef CONFIG_NET_EGRESS

>   	if (static_branch_unlikely(&egress_needed_key)) {

>   		skb = sch_handle_egress(skb, &rc, dev);

>   		if (!skb)

>   			goto out;

> +		if (nf_hook_egress_active()) {

> +			skb = nf_hook_egress(skb, &rc, dev);

> +			if (!skb)

> +				goto out;

> +		}


This won't work unfortunately, the issue is that this now creates an asymmetric path, for
example: [tc ingress] -> [nf ingress] -> [host] -> [tc egress] -> [nf egress]. So any NAT
translation done on tc ingress + tc egress will break on the nf hooks given nf is not able to
correlate inbound with outbound traffic. Likewise for container-based traffic that is in its
own netns, one of the existing paths is: [tc ingress (phys,hostns)] -> [tc ingress (veth,podns)] ->
[reply] -> [tc egress (veth,podns)] -> [tc ingress (veth,hostns)] -> [nf egress (phys,hostns)*] ->
[tc egress (phys,hostns)]. As can be seen the [nf ingress] hook is not an issue at all given
everything operates in tc layer but the [nf egress*] one is in this case, as it touches in tc
layer where existing data planes will start to break upon rule injection. Wrt above objection,
what needs to be done for the minimum case is to i) fix the asymmetry problem from here, and
ii) add a flag for tc layer-redirected skbs to skip the nf egress hook *; with that in place
this concern should be resolved. Thanks!

>   	}

> -# endif

>   #endif

>   	/* If device/qdisc don't need skb->dst, release it right now while

>   	 * its hot in this cpu cache.
Lukas Wunner Sept. 11, 2021, 9:26 p.m. UTC | #2
On Tue, Jan 26, 2021 at 08:13:19PM +0100, Daniel Borkmann wrote:
> On 1/22/21 9:47 AM, Lukas Wunner wrote:

> > Commit e687ad60af09 ("netfilter: add netfilter ingress hook after

> > handle_ing() under unique static key") introduced the ability to

> > classify packets with netfilter on ingress.

> > 

> > Support the same on egress to satisfy user requirements such as:

[...]
> > The hook is positioned after packet handling by traffic control.

> > Thus, if packets are redirected into and out of containers with tc,

> > the data path is:

> > ingress: host tc -> container tc -> container nft

> > egress:  container tc -> host tc -> host nft

> > 

> > This was done to address an objection from Daniel Borkmann:  If desired,

> > nft does not get into tc's way performance-wise.  The host is able to

> > firewall malicious packets coming out of a container, but only after tc

> > has done its duty.  An implication is that tc may set skb->mark on

> > egress for nft to act on it, but not the other way round.

> [...]

> > @@ -4096,13 +4098,18 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)

> >   	qdisc_pkt_len_init(skb);

> >   #ifdef CONFIG_NET_CLS_ACT

> >   	skb->tc_at_ingress = 0;

> > -# ifdef CONFIG_NET_EGRESS

> > +#endif

> > +#ifdef CONFIG_NET_EGRESS

> >   	if (static_branch_unlikely(&egress_needed_key)) {

> >   		skb = sch_handle_egress(skb, &rc, dev);

> >   		if (!skb)

> >   			goto out;

> > +		if (nf_hook_egress_active()) {

> > +			skb = nf_hook_egress(skb, &rc, dev);

> > +			if (!skb)

> > +				goto out;

> > +		}

> 

> This won't work unfortunately, the issue is that this now creates an

> asymmetric path, for example:

>   [tc ingress] -> [nf ingress] -> [host] -> [tc egress] -> [nf egress].

> So any NAT translation done on tc ingress + tc egress will break on

> the nf hooks given nf is not able to correlate inbound with outbound

> traffic.

> 

> Likewise for container-based traffic that is in its own netns,

> one of the existing paths is:

>   [tc ingress (phys,hostns)] -> [tc ingress (veth,podns)] -> [reply] ->

>   [tc egress (veth,podns)] -> [tc ingress (veth,hostns)] ->

>   [nf egress (phys,hostns)*] -> [tc egress (phys,hostns)].

> As can be seen the [nf ingress] hook is not an issue at all given

> everything operates in tc layer but the [nf egress*] one is in this case,

> as it touches in tc layer where existing data planes will start to break

> upon rule injection.

> 

> Wrt above objection, what needs to be done for the minimum case is to

> i) fix the asymmetry problem from here, and

> ii) add a flag for tc layer-redirected skbs to skip the nf egress hook *;

> with that in place this concern should be resolved. Thanks!


Daniel, your reply above has left me utterly confused.  After thinking
about it for a while, I'm requesting clarification what you really want.
We do want the netfilter egress hook in mainline and we're happy to
accommodate to your requirements, but we need you to specify them clearly.

Regarding the data path for packets which are going out from a container,
I think you've erroneously mixed up the last two elements above:
If the nf egress hook is placed after tc egress (as is done in the present
patch), the data path is actually as follows:

  [tc egress (veth,podns)] -> [tc ingress (veth,hostns)] ->
  [tc egress (phys,hostns)] -> [nf egress (phys,hostns)*]

By contrast, if nf egress is put in front of tc egress (as you're
requesting above), the data path becomes:

  [nf egress (veth,podns)] -> [tc egress (veth,podns)] ->
  [tc ingress (veth,hostns)] ->
  [nf egress (phys,hostns)*] -> [tc egress (phys,hostns)]

So this order results in an *additional* nf egress hook in the data path,
which may cost performance.  Previously you voiced concerns that the
nf egress hook may degrade performance.  To address that concern,
we ordered nf egress *after* tc egress, thereby avoiding any performance
impact as much as possible.

I agree with your argument that an inverse order vis-a-vis ingress is
more logical because it avoids NAT incongruencies etc.  So I'll be happy
to reorder nf egress before tc egress.  I'm just confused that you're now
requesting an order which may be *more* detrimental to performance.

As a reminder:
- If no nf egress rules are in use on the system, there is no performance
  impact at all because the hook is patched out of the data path.
- If an interface on the system uses nf egress rules, all other interfaces
  suffer a minor performance impact because a NULL pointer check is
  performed for sk_buff->nf_hooks_egress for every packet.
- The actual netfilter processing only happens on the interface(s)
  for which nf egress rules have been configured.

You further request above to "add a flag for tc layer-redirected skbs
to skip the nf egress hook *".

However, the data path *starts* with an nf egress hook.  So even if a flag
to skip subsequent nf egress processing is set upon tc redirect,
the initial nf egress hook is never skipped.  The benefit of such a flag
is therefore questionable, which is another source of confusion to me.

Again, please clarify what your concerns are and how they can be addressed.

Thanks,

Lukas
Daniel Borkmann Sept. 15, 2021, 9:45 a.m. UTC | #3
Hi Lukas,

On 9/11/21 11:26 PM, Lukas Wunner wrote:
> On Tue, Jan 26, 2021 at 08:13:19PM +0100, Daniel Borkmann wrote:

>> On 1/22/21 9:47 AM, Lukas Wunner wrote:

[...]
>> Wrt above objection, what needs to be done for the minimum case is to

>> i) fix the asymmetry problem from here, and

>> ii) add a flag for tc layer-redirected skbs to skip the nf egress hook *;

>> with that in place this concern should be resolved. Thanks!

> 

> Daniel, your reply above has left me utterly confused.  After thinking

> about it for a while, I'm requesting clarification what you really want.

> We do want the netfilter egress hook in mainline and we're happy to

> accommodate to your requirements, but we need you to specify them clearly.

> 

> Regarding the data path for packets which are going out from a container,

> I think you've erroneously mixed up the last two elements above:

> If the nf egress hook is placed after tc egress (as is done in the present

> patch), the data path is actually as follows:

> 

>    [tc egress (veth,podns)] -> [tc ingress (veth,hostns)] ->

>    [tc egress (phys,hostns)] -> [nf egress (phys,hostns)*]

> 

> By contrast, if nf egress is put in front of tc egress (as you're

> requesting above), the data path becomes:

> 

>    [nf egress (veth,podns)] -> [tc egress (veth,podns)] ->

>    [tc ingress (veth,hostns)] ->

>    [nf egress (phys,hostns)*] -> [tc egress (phys,hostns)]

> 

> So this order results in an *additional* nf egress hook in the data path,

> which may cost performance.  Previously you voiced concerns that the

> nf egress hook may degrade performance.  To address that concern,

> we ordered nf egress *after* tc egress, thereby avoiding any performance

> impact as much as possible.

> 

> I agree with your argument that an inverse order vis-a-vis ingress is

> more logical because it avoids NAT incongruencies etc.  So I'll be happy

> to reorder nf egress before tc egress.  I'm just confused that you're now

> requesting an order which may be *more* detrimental to performance.


Right, the issue is that placing it either in front of or after the existing tc
egress hook just as-is results in layering violations, given that both tc/nf subsystems
will inevitably step on each other when both dataplanes are used by different
components where things break. To provide a walk-through on what I think would
work w/o breaking layers:

1) Packet going up hostns stack. Here, we'll end up with:

    [tc ingress (phys,hostns)] -> [nf ingress (phys,hostns)] ->
      upper stack ->
        [nf egress (phys,hostns)] -> [tc egress (phys,hostns)]

2) Packet going up podns stack. Here, if the forwarding happens entirely in tc
    layer, then the datapath (as-is today) operates _below_ nf and must look like:

    [tc ingress (phys,hostns)] -> [tc egress (veth,hostns)] ->
      netns switch -> (*) -> netns switch ->
        [tc ingress (veth,hostns)] -> [tc egress (phys,hostns)]

    For simplicity, I left out (*), but it's essentially the same as case 1)
    just for the Pod's/container's stack PoV.

3) Packet is locally forwarded between Pods. Same as 2) for the case where the
    forwarding happens entirely in tc (as-is today) which operates _below_ nf and
    must look like:

    (+) -> [tc ingress (veth,hostns)] -> [tc egress (veth,hostns)] ->
      netns switch -> (*) -> netns switch ->
        [tc ingress (veth,hostns)] -> [tc egress (veth,hostns)] -> (+)

    The (+) denotes the src/sink where we enter/exit the hostns after netns switch.

The skb bit marker would mean that tc [& BPF]-related redirect functions internally
set that bit, so that the nf egress hook is skipped for cases like 2)/3).

Walking through a similar 1/2/3) scenario from the nf side, one layer higher, if you
were to do an equivalent, things would look like:

1) Packet going up hostns stack. Same as above:

    [tc ingress (phys,hostns)] -> [nf ingress (phys,hostns)] ->
      upper stack ->
        [nf egress (phys,hostns)] -> [tc egress (phys,hostns)]

2) Packet going up podns stack with forwarding from nf side:

    [tc ingress (phys,hostns)] -> [nf ingress (phys,hostns)] ->
      [nf egress (veth,hostns)] -> [tc egress (veth,hostns)] ->
        netns switch -> (*) -> netns switch ->
          [tc ingress (veth,hostns)] -> [nf ingress (veth,hostns)] ->
            [nf egress (phys,hostns)] -> [tc egress (phys,hostns)]

3) Packet is locally forwarded between Pods with forwarding from nf side:

    (+) -> [tc ingress (veth,hostns)] -> [nf ingress (veth,hostns)] ->
      [nf egress (veth,hostns)] -> [tc egress (veth,hostns)] ->
        netns switch -> (*) -> netns switch ->
          [tc ingress (veth,hostns)] -> [nf ingress (veth,hostns)] ->
            [nf egress (veth,hostns)] -> [tc egress (veth,hostns)] -> (+)

Thanks,
Daniel
diff mbox series

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1ec3ac5d5bbf..af0774cc20d2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1764,6 +1764,7 @@  enum netdev_priv_flags {
  *	@xps_maps:	XXX: need comments on this one
  *	@miniq_egress:		clsact qdisc specific data for
  *				egress processing
+ *	@nf_hooks_egress:	netfilter hooks executed for egress packets
  *	@qdisc_hash:		qdisc hash table
  *	@watchdog_timeo:	Represents the timeout that is used by
  *				the watchdog (see dev_watchdog())
@@ -2057,6 +2058,9 @@  struct net_device {
 #ifdef CONFIG_NET_CLS_ACT
 	struct mini_Qdisc __rcu	*miniq_egress;
 #endif
+#ifdef CONFIG_NETFILTER_EGRESS
+	struct nf_hook_entries __rcu *nf_hooks_egress;
+#endif
 
 #ifdef CONFIG_NET_SCHED
 	DECLARE_HASHTABLE	(qdisc_hash, 4);
diff --git a/include/linux/netfilter_netdev.h b/include/linux/netfilter_netdev.h
index 5812b0fb0278..5ed6e90d46f6 100644
--- a/include/linux/netfilter_netdev.h
+++ b/include/linux/netfilter_netdev.h
@@ -50,11 +50,63 @@  static inline int nf_hook_ingress(struct sk_buff *skb)
 }
 #endif /* CONFIG_NETFILTER_INGRESS */
 
+#ifdef CONFIG_NETFILTER_EGRESS
+static inline bool nf_hook_egress_active(void)
+{
+#ifdef CONFIG_JUMP_LABEL
+	if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_EGRESS]))
+		return false;
+#endif
+	return true;
+}
+
+/* caller must hold rcu_read_lock */
+static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc,
+					     struct net_device *dev)
+{
+	struct nf_hook_entries *e = rcu_dereference(dev->nf_hooks_egress);
+	struct nf_hook_state state;
+	int ret;
+
+	if (!e)
+		return skb;
+
+	nf_hook_state_init(&state, NF_NETDEV_EGRESS,
+			   NFPROTO_NETDEV, dev, NULL, NULL,
+			   dev_net(dev), NULL);
+	ret = nf_hook_slow(skb, &state, e, 0);
+
+	if (ret == 1) {
+		return skb;
+	} else if (ret < 0) {
+		*rc = NET_XMIT_DROP;
+		return NULL;
+	} else { /* ret == 0 */
+		*rc = NET_XMIT_SUCCESS;
+		return NULL;
+	}
+}
+#else /* CONFIG_NETFILTER_EGRESS */
+static inline bool nf_hook_egress_active(void)
+{
+	return false;
+}
+
+static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc,
+					     struct net_device *dev)
+{
+	return skb;
+}
+#endif /* CONFIG_NETFILTER_EGRESS */
+
 static inline void nf_hook_netdev_init(struct net_device *dev)
 {
 #ifdef CONFIG_NETFILTER_INGRESS
 	RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL);
 #endif
+#ifdef CONFIG_NETFILTER_EGRESS
+	RCU_INIT_POINTER(dev->nf_hooks_egress, NULL);
+#endif
 }
 
 #endif /* _NETFILTER_NETDEV_H_ */
diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h
index ef9a44286e23..53411ccc69db 100644
--- a/include/uapi/linux/netfilter.h
+++ b/include/uapi/linux/netfilter.h
@@ -51,6 +51,7 @@  enum nf_inet_hooks {
 
 enum nf_dev_hooks {
 	NF_NETDEV_INGRESS,
+	NF_NETDEV_EGRESS,
 	NF_NETDEV_NUMHOOKS
 };
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 931149bd654a..ecf881515b62 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3870,6 +3870,7 @@  EXPORT_SYMBOL(dev_loopback_xmit);
 static struct sk_buff *
 sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 {
+#ifdef CONFIG_NET_CLS_ACT
 	struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
 	struct tcf_result cl_res;
 
@@ -3904,6 +3905,7 @@  sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 	default:
 		break;
 	}
+#endif /* CONFIG_NET_CLS_ACT */
 
 	return skb;
 }
@@ -4096,13 +4098,18 @@  static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
 	qdisc_pkt_len_init(skb);
 #ifdef CONFIG_NET_CLS_ACT
 	skb->tc_at_ingress = 0;
-# ifdef CONFIG_NET_EGRESS
+#endif
+#ifdef CONFIG_NET_EGRESS
 	if (static_branch_unlikely(&egress_needed_key)) {
 		skb = sch_handle_egress(skb, &rc, dev);
 		if (!skb)
 			goto out;
+		if (nf_hook_egress_active()) {
+			skb = nf_hook_egress(skb, &rc, dev);
+			if (!skb)
+				goto out;
+		}
 	}
-# endif
 #endif
 	/* If device/qdisc don't need skb->dst, release it right now while
 	 * its hot in this cpu cache.
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 49fbef0d99be..ade86afa3b1b 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -10,6 +10,14 @@  config NETFILTER_INGRESS
 	  This allows you to classify packets from ingress using the Netfilter
 	  infrastructure.
 
+config NETFILTER_EGRESS
+	bool "Netfilter egress support"
+	default y
+	select NET_EGRESS
+	help
+	  This allows you to classify packets before transmission using the
+	  Netfilter infrastructure.
+
 config NETFILTER_NETLINK
 	tristate
 
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 63d032191e62..3a32a813fcde 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -316,6 +316,12 @@  nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum,
 		if (dev && dev_net(dev) == net)
 			return &dev->nf_hooks_ingress;
 	}
+#endif
+#ifdef CONFIG_NETFILTER_EGRESS
+	if (hooknum == NF_NETDEV_EGRESS) {
+		if (dev && dev_net(dev) == net)
+			return &dev->nf_hooks_egress;
+	}
 #endif
 	WARN_ON_ONCE(1);
 	return NULL;
@@ -344,6 +350,11 @@  static inline bool nf_ingress_hook(const struct nf_hook_ops *reg, int pf)
 	return false;
 }
 
+static inline bool nf_egress_hook(const struct nf_hook_ops *reg, int pf)
+{
+	return pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_EGRESS;
+}
+
 static void nf_static_key_inc(const struct nf_hook_ops *reg, int pf)
 {
 #ifdef CONFIG_JUMP_LABEL
@@ -383,9 +394,18 @@  static int __nf_register_net_hook(struct net *net, int pf,
 
 	switch (pf) {
 	case NFPROTO_NETDEV:
-		err = nf_ingress_check(net, reg, NF_NETDEV_INGRESS);
-		if (err < 0)
-			return err;
+#ifndef CONFIG_NETFILTER_INGRESS
+		if (reg->hooknum == NF_NETDEV_INGRESS)
+			return -EOPNOTSUPP;
+#endif
+#ifndef CONFIG_NETFILTER_EGRESS
+		if (reg->hooknum == NF_NETDEV_EGRESS)
+			return -EOPNOTSUPP;
+#endif
+		if ((reg->hooknum != NF_NETDEV_INGRESS &&
+		     reg->hooknum != NF_NETDEV_EGRESS) ||
+		    !reg->dev || dev_net(reg->dev) != net)
+			return -EINVAL;
 		break;
 	case NFPROTO_INET:
 		if (reg->hooknum != NF_INET_INGRESS)
@@ -417,6 +437,10 @@  static int __nf_register_net_hook(struct net *net, int pf,
 #ifdef CONFIG_NETFILTER_INGRESS
 	if (nf_ingress_hook(reg, pf))
 		net_inc_ingress_queue();
+#endif
+#ifdef CONFIG_NETFILTER_EGRESS
+	if (nf_egress_hook(reg, pf))
+		net_inc_egress_queue();
 #endif
 	nf_static_key_inc(reg, pf);
 
@@ -474,6 +498,10 @@  static void __nf_unregister_net_hook(struct net *net, int pf,
 #ifdef CONFIG_NETFILTER_INGRESS
 		if (nf_ingress_hook(reg, pf))
 			net_dec_ingress_queue();
+#endif
+#ifdef CONFIG_NETFILTER_EGRESS
+		if (nf_egress_hook(reg, pf))
+			net_dec_egress_queue();
 #endif
 		nf_static_key_dec(reg, pf);
 	} else {
diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c
index ff8528ad3dc6..c9dc5f36569b 100644
--- a/net/netfilter/nft_chain_filter.c
+++ b/net/netfilter/nft_chain_filter.c
@@ -310,9 +310,11 @@  static const struct nft_chain_type nft_chain_filter_netdev = {
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
 	.family		= NFPROTO_NETDEV,
-	.hook_mask	= (1 << NF_NETDEV_INGRESS),
+	.hook_mask	= (1 << NF_NETDEV_INGRESS) |
+			  (1 << NF_NETDEV_EGRESS),
 	.hooks		= {
 		[NF_NETDEV_INGRESS]	= nft_do_chain_netdev,
+		[NF_NETDEV_EGRESS]	= nft_do_chain_netdev,
 	},
 };