diff mbox series

[bpf-next,v2,4/7] net: sched: add lightweight update path for cls_bpf

Message ID 20210604063116.234316-5-memxor@gmail.com
State New
Headers show
Series Add bpf_link based TC-BPF API | expand

Commit Message

Kumar Kartikeya Dwivedi June 4, 2021, 6:31 a.m. UTC
This is used by BPF_LINK_UPDATE to replace the attached SCHED_CLS bpf prog,
effectively changing the classifier implementation for a given filter
owned by a bpf_link.

Note that READ_ONCE suffices in this case as the ordering for loads from
the filter is implicitly provided by the data dependency on the BPF prog
pointer.

On the writer side we can just use a relaxed WRITE_ONCE store to make
sure one or the other value is visible to a reader in cls_bpf_classify.
Lifetime is managed using RCU so bpf_prog_put path should wait until
readers are done for old_prog.

All other parties accessing the BPF prog are under RTNL protection, so
need no changes.

Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
---
 net/sched/cls_bpf.c | 55 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 53 insertions(+), 2 deletions(-)

Comments

Kumar Kartikeya Dwivedi June 5, 2021, 4:42 a.m. UTC | #1
On Fri, Jun 04, 2021 at 11:24:28PM IST, Alexei Starovoitov wrote:
> On Fri, Jun 04, 2021 at 12:01:13PM +0530, Kumar Kartikeya Dwivedi wrote:
> > This is used by BPF_LINK_UPDATE to replace the attach SCHED_CLS bpf prog
> > effectively changing the classifier implementation for a given filter
> > owned by a bpf_link.
> >
> > Note that READ_ONCE suffices in this case as the ordering for loads from
> > the filter are implicitly provided by the data dependency on BPF prog
> > pointer.
> >
> > On the writer side we can just use a relaxed WRITE_ONCE store to make
> > sure one or the other value is visible to a reader in cls_bpf_classify.
> > Lifetime is managed using RCU so bpf_prog_put path should wait until
> > readers are done for old_prog.
>
> Should those be rcu_dereference and rcu_assign_pointer?
> Typically the pointer would be __rcu annotated which would be
> another small change in struct cls_bpf_prog.
> That would make the lifetime easier to follow?
>

True, I'll make that change.

> > All other parties accessing the BPF prog are under RTNL protection, so
> > need no changes.
> >
> > Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>.
> > Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> > ---
> >  net/sched/cls_bpf.c | 55 +++++++++++++++++++++++++++++++++++++++++++--
> >  1 file changed, 53 insertions(+), 2 deletions(-)
> >
> > diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
> > index bf61ffbb7fd0..f23304685c48 100644
> > --- a/net/sched/cls_bpf.c
> > +++ b/net/sched/cls_bpf.c
> > @@ -9,6 +9,7 @@
> >   * (C) 2013 Daniel Borkmann <dborkman@redhat.com>
> >   */
> >
> > +#include <linux/atomic.h>
> >  #include <linux/module.h>
> >  #include <linux/types.h>
> >  #include <linux/skbuff.h>
> > @@ -104,11 +105,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
> >  			/* It is safe to push/pull even if skb_shared() */
> >  			__skb_push(skb, skb->mac_len);
> >  			bpf_compute_data_pointers(skb);
> > -			filter_res = BPF_PROG_RUN(prog->filter, skb);
> > +			filter_res = BPF_PROG_RUN(READ_ONCE(prog->filter), skb);
> >  			__skb_pull(skb, skb->mac_len);
> >  		} else {
> >  			bpf_compute_data_pointers(skb);
> > -			filter_res = BPF_PROG_RUN(prog->filter, skb);
> > +			filter_res = BPF_PROG_RUN(READ_ONCE(prog->filter), skb);
> >  		}
> >
> >  		if (prog->exts_integrated) {
> > @@ -775,6 +776,55 @@ static int cls_bpf_link_detach(struct bpf_link *link)
> >  	return 0;
> >  }
> >
> > +static int cls_bpf_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
> > +			       struct bpf_prog *old_prog)
> > +{
> > +	struct cls_bpf_link *cls_link;
> > +	struct cls_bpf_prog cls_prog;
> > +	struct cls_bpf_prog *prog;
> > +	int ret;
> > +
> > +	rtnl_lock();
> > +
> > +	cls_link = container_of(link, struct cls_bpf_link, link);
> > +	if (!cls_link->prog) {
> > +		ret = -ENOLINK;
> > +		goto out;
> > +	}
> > +
> > +	prog = cls_link->prog;
> > +
> > +	/* BPF_F_REPLACEing? */
> > +	if (old_prog && prog->filter != old_prog) {
> > +		ret = -EINVAL;
>
> Other places like cgroup_bpf_replace and bpf_iter_link_replace
> return -EPERM in such case.
>

Ok, will change.

> > +		goto out;
> > +	}
> > +
> > +	old_prog = prog->filter;
> > +
> > +	if (new_prog == old_prog) {
> > +		ret = 0;
> > +		goto out;
> > +	}
> > +
> > +	cls_prog = *prog;
> > +	cls_prog.filter = new_prog;
> > +
> > +	ret = cls_bpf_offload(prog->tp, &cls_prog, prog, NULL);
> > +	if (ret < 0)
> > +		goto out;
> > +
> > +	WRITE_ONCE(prog->filter, new_prog);
> > +
> > +	bpf_prog_inc(new_prog);
> > +	/* release our reference */
> > +	bpf_prog_put(old_prog);
> > +
> > +out:
> > +	rtnl_unlock();
> > +	return ret;
> > +}
> > +
> >  static void __bpf_fill_link_info(struct cls_bpf_link *link,
> >  				 struct bpf_link_info *info)
> >  {
> > @@ -859,6 +909,7 @@ static const struct bpf_link_ops cls_bpf_link_ops = {
> >  	.show_fdinfo = cls_bpf_link_show_fdinfo,
> >  #endif
> >  	.fill_link_info = cls_bpf_link_fill_link_info,
> > +	.update_prog = cls_bpf_link_update,
> >  };
> >
> >  static inline char *cls_bpf_link_name(u32 prog_id, const char *name)
> > --
> > 2.31.1
> >
>
> --

--
Kartikeya
Andrii Nakryiko June 7, 2021, 11:32 p.m. UTC | #2
On Thu, Jun 3, 2021 at 11:32 PM Kumar Kartikeya Dwivedi
<memxor@gmail.com> wrote:
>

> This is used by BPF_LINK_UPDATE to replace the attach SCHED_CLS bpf prog

> effectively changing the classifier implementation for a given filter

> owned by a bpf_link.

>

> Note that READ_ONCE suffices in this case as the ordering for loads from

> the filter are implicitly provided by the data dependency on BPF prog

> pointer.

>

> On the writer side we can just use a relaxed WRITE_ONCE store to make

> sure one or the other value is visible to a reader in cls_bpf_classify.

> Lifetime is managed using RCU so bpf_prog_put path should wait until

> readers are done for old_prog.

>

> All other parties accessing the BPF prog are under RTNL protection, so

> need no changes.

>

> Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>.

> Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>

> ---

>  net/sched/cls_bpf.c | 55 +++++++++++++++++++++++++++++++++++++++++++--

>  1 file changed, 53 insertions(+), 2 deletions(-)

>

> diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c

> index bf61ffbb7fd0..f23304685c48 100644

> --- a/net/sched/cls_bpf.c

> +++ b/net/sched/cls_bpf.c

> @@ -9,6 +9,7 @@

>   * (C) 2013 Daniel Borkmann <dborkman@redhat.com>

>   */

>

> +#include <linux/atomic.h>

>  #include <linux/module.h>

>  #include <linux/types.h>

>  #include <linux/skbuff.h>

> @@ -104,11 +105,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,

>                         /* It is safe to push/pull even if skb_shared() */

>                         __skb_push(skb, skb->mac_len);

>                         bpf_compute_data_pointers(skb);

> -                       filter_res = BPF_PROG_RUN(prog->filter, skb);

> +                       filter_res = BPF_PROG_RUN(READ_ONCE(prog->filter), skb);

>                         __skb_pull(skb, skb->mac_len);

>                 } else {

>                         bpf_compute_data_pointers(skb);

> -                       filter_res = BPF_PROG_RUN(prog->filter, skb);

> +                       filter_res = BPF_PROG_RUN(READ_ONCE(prog->filter), skb);

>                 }

>

>                 if (prog->exts_integrated) {

> @@ -775,6 +776,55 @@ static int cls_bpf_link_detach(struct bpf_link *link)

>         return 0;

>  }

>

> +static int cls_bpf_link_update(struct bpf_link *link, struct bpf_prog *new_prog,

> +                              struct bpf_prog *old_prog)

> +{

> +       struct cls_bpf_link *cls_link;

> +       struct cls_bpf_prog cls_prog;

> +       struct cls_bpf_prog *prog;

> +       int ret;

> +

> +       rtnl_lock();

> +

> +       cls_link = container_of(link, struct cls_bpf_link, link);

> +       if (!cls_link->prog) {

> +               ret = -ENOLINK;

> +               goto out;

> +       }

> +

> +       prog = cls_link->prog;

> +

> +       /* BPF_F_REPLACEing? */

> +       if (old_prog && prog->filter != old_prog) {

> +               ret = -EINVAL;

> +               goto out;

> +       }

> +

> +       old_prog = prog->filter;

> +

> +       if (new_prog == old_prog) {

> +               ret = 0;


So the contract is that if the update is successful, new_prog's refcount
taken by link_update() in kernel/bpf/syscall.c is transferred here. On
error, link_update() will bpf_prog_put() it. So here you don't
need an extra refcnt, but it's also not an error, so you need to
bpf_prog_put(new_prog) explicitly to balance out the refcnt. See how it's
done for XDP, for example.


> +               goto out;

> +       }

> +

> +       cls_prog = *prog;

> +       cls_prog.filter = new_prog;

> +

> +       ret = cls_bpf_offload(prog->tp, &cls_prog, prog, NULL);

> +       if (ret < 0)

> +               goto out;

> +

> +       WRITE_ONCE(prog->filter, new_prog);

> +

> +       bpf_prog_inc(new_prog);


and you don't need this, you already got the reference from link_update()

> +       /* release our reference */

> +       bpf_prog_put(old_prog);

> +

> +out:

> +       rtnl_unlock();

> +       return ret;

> +}

> +

>  static void __bpf_fill_link_info(struct cls_bpf_link *link,

>                                  struct bpf_link_info *info)

>  {

> @@ -859,6 +909,7 @@ static const struct bpf_link_ops cls_bpf_link_ops = {

>         .show_fdinfo = cls_bpf_link_show_fdinfo,

>  #endif

>         .fill_link_info = cls_bpf_link_fill_link_info,

> +       .update_prog = cls_bpf_link_update,

>  };

>

>  static inline char *cls_bpf_link_name(u32 prog_id, const char *name)

> --

> 2.31.1

>
Kumar Kartikeya Dwivedi June 10, 2021, 2:14 p.m. UTC | #3
On Tue, Jun 08, 2021 at 05:02:04AM IST, Andrii Nakryiko wrote:
> On Thu, Jun 3, 2021 at 11:32 PM Kumar Kartikeya Dwivedi

> <memxor@gmail.com> wrote:

> >

> > This is used by BPF_LINK_UPDATE to replace the attach SCHED_CLS bpf prog

> > effectively changing the classifier implementation for a given filter

> > owned by a bpf_link.

> >

> > Note that READ_ONCE suffices in this case as the ordering for loads from

> > the filter are implicitly provided by the data dependency on BPF prog

> > pointer.

> >

> > On the writer side we can just use a relaxed WRITE_ONCE store to make

> > sure one or the other value is visible to a reader in cls_bpf_classify.

> > Lifetime is managed using RCU so bpf_prog_put path should wait until

> > readers are done for old_prog.

> >

> > All other parties accessing the BPF prog are under RTNL protection, so

> > need no changes.

> >

> > Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>.

> > Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>

> > ---

> >  net/sched/cls_bpf.c | 55 +++++++++++++++++++++++++++++++++++++++++++--

> >  1 file changed, 53 insertions(+), 2 deletions(-)

> >

> > diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c

> > index bf61ffbb7fd0..f23304685c48 100644

> > --- a/net/sched/cls_bpf.c

> > +++ b/net/sched/cls_bpf.c

> > @@ -9,6 +9,7 @@

> >   * (C) 2013 Daniel Borkmann <dborkman@redhat.com>

> >   */

> >

> > +#include <linux/atomic.h>

> >  #include <linux/module.h>

> >  #include <linux/types.h>

> >  #include <linux/skbuff.h>

> > @@ -104,11 +105,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,

> >                         /* It is safe to push/pull even if skb_shared() */

> >                         __skb_push(skb, skb->mac_len);

> >                         bpf_compute_data_pointers(skb);

> > -                       filter_res = BPF_PROG_RUN(prog->filter, skb);

> > +                       filter_res = BPF_PROG_RUN(READ_ONCE(prog->filter), skb);

> >                         __skb_pull(skb, skb->mac_len);

> >                 } else {

> >                         bpf_compute_data_pointers(skb);

> > -                       filter_res = BPF_PROG_RUN(prog->filter, skb);

> > +                       filter_res = BPF_PROG_RUN(READ_ONCE(prog->filter), skb);

> >                 }

> >

> >                 if (prog->exts_integrated) {

> > @@ -775,6 +776,55 @@ static int cls_bpf_link_detach(struct bpf_link *link)

> >         return 0;

> >  }

> >

> > +static int cls_bpf_link_update(struct bpf_link *link, struct bpf_prog *new_prog,

> > +                              struct bpf_prog *old_prog)

> > +{

> > +       struct cls_bpf_link *cls_link;

> > +       struct cls_bpf_prog cls_prog;

> > +       struct cls_bpf_prog *prog;

> > +       int ret;

> > +

> > +       rtnl_lock();

> > +

> > +       cls_link = container_of(link, struct cls_bpf_link, link);

> > +       if (!cls_link->prog) {

> > +               ret = -ENOLINK;

> > +               goto out;

> > +       }

> > +

> > +       prog = cls_link->prog;

> > +

> > +       /* BPF_F_REPLACEing? */

> > +       if (old_prog && prog->filter != old_prog) {

> > +               ret = -EINVAL;

> > +               goto out;

> > +       }

> > +

> > +       old_prog = prog->filter;

> > +

> > +       if (new_prog == old_prog) {

> > +               ret = 0;

>

> So the contract is that if update is successful, new_prog's refcount

> taken by link_update() in kernel/bpf/syscall.c is transferred here. On

> error, it will be bpf_prog_put() by link_update(). So here you don't

> need extra refcnt, but it's also not an error, so you need to

> bpf_prog_put(new_prog) explicitly to balance out refcnt. See how it's

> done for XDP, for example.

>


Yes, thanks for spotting this.

>

> > +               goto out;

> > +       }

> > +

> > +       cls_prog = *prog;

> > +       cls_prog.filter = new_prog;

> > +

> > +       ret = cls_bpf_offload(prog->tp, &cls_prog, prog, NULL);

> > +       if (ret < 0)

> > +               goto out;

> > +

> > +       WRITE_ONCE(prog->filter, new_prog);

> > +

> > +       bpf_prog_inc(new_prog);

>

> and you don't need this, you already got the reference from link_update()

>


So the reason I still keep an extra refcount is that the existing code on the
netlink side assumes it. Even though the link itself holds a refcount for us,
the actual freeing of cls_bpf_prog may happen independently of the bpf_link.

I'll add a comment for this.

> > +       /* release our reference */

> > +       bpf_prog_put(old_prog);

> > +

> > +out:

> > +       rtnl_unlock();

> > +       return ret;

> > +}

> > +

> >  static void __bpf_fill_link_info(struct cls_bpf_link *link,

> >                                  struct bpf_link_info *info)

> >  {

> > @@ -859,6 +909,7 @@ static const struct bpf_link_ops cls_bpf_link_ops = {

> >         .show_fdinfo = cls_bpf_link_show_fdinfo,

> >  #endif

> >         .fill_link_info = cls_bpf_link_fill_link_info,

> > +       .update_prog = cls_bpf_link_update,

> >  };

> >

> >  static inline char *cls_bpf_link_name(u32 prog_id, const char *name)

> > --

> > 2.31.1

> >


--
Kartikeya
diff mbox series

Patch

diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index bf61ffbb7fd0..f23304685c48 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -9,6 +9,7 @@ 
  * (C) 2013 Daniel Borkmann <dborkman@redhat.com>
  */
 
+#include <linux/atomic.h>
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/skbuff.h>
@@ -104,11 +105,11 @@  static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 			/* It is safe to push/pull even if skb_shared() */
 			__skb_push(skb, skb->mac_len);
 			bpf_compute_data_pointers(skb);
-			filter_res = BPF_PROG_RUN(prog->filter, skb);
+			filter_res = BPF_PROG_RUN(READ_ONCE(prog->filter), skb);
 			__skb_pull(skb, skb->mac_len);
 		} else {
 			bpf_compute_data_pointers(skb);
-			filter_res = BPF_PROG_RUN(prog->filter, skb);
+			filter_res = BPF_PROG_RUN(READ_ONCE(prog->filter), skb);
 		}
 
 		if (prog->exts_integrated) {
@@ -775,6 +776,55 @@  static int cls_bpf_link_detach(struct bpf_link *link)
 	return 0;
 }
 
+static int cls_bpf_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
+			       struct bpf_prog *old_prog)
+{
+	struct cls_bpf_link *cls_link;
+	struct cls_bpf_prog cls_prog;
+	struct cls_bpf_prog *prog;
+	int ret;
+
+	rtnl_lock();
+
+	cls_link = container_of(link, struct cls_bpf_link, link);
+	if (!cls_link->prog) {
+		ret = -ENOLINK;
+		goto out;
+	}
+
+	prog = cls_link->prog;
+
+	/* BPF_F_REPLACEing? */
+	if (old_prog && prog->filter != old_prog) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	old_prog = prog->filter;
+
+	if (new_prog == old_prog) {
+		ret = 0;
+		goto out;
+	}
+
+	cls_prog = *prog;
+	cls_prog.filter = new_prog;
+
+	ret = cls_bpf_offload(prog->tp, &cls_prog, prog, NULL);
+	if (ret < 0)
+		goto out;
+
+	WRITE_ONCE(prog->filter, new_prog);
+
+	bpf_prog_inc(new_prog);
+	/* release our reference */
+	bpf_prog_put(old_prog);
+
+out:
+	rtnl_unlock();
+	return ret;
+}
+
 static void __bpf_fill_link_info(struct cls_bpf_link *link,
 				 struct bpf_link_info *info)
 {
@@ -859,6 +909,7 @@  static const struct bpf_link_ops cls_bpf_link_ops = {
 	.show_fdinfo = cls_bpf_link_show_fdinfo,
 #endif
 	.fill_link_info = cls_bpf_link_fill_link_info,
+	.update_prog = cls_bpf_link_update,
 };
 
 static inline char *cls_bpf_link_name(u32 prog_id, const char *name)