[bpf-next,2/3] bpf: Add verifier checks for bpf_timer.

Message ID 20210527040259.77823-3-alexei.starovoitov@gmail.com
State New
Series bpf: Introduce BPF timers.

Commit Message

Alexei Starovoitov May 27, 2021, 4:02 a.m. UTC
From: Alexei Starovoitov <ast@kernel.org>

Add appropriate safety checks for bpf_timer:
- restrict to array, hash, lru. per-cpu maps cannot be supported.
- kfree bpf_timer during map_delete_elem and map_free.
- verifier btf checks.
- safe interaction with lookup/update/delete operations and iterator.
- relax the first field only requirement of the previous patch.
- allow bpf_timer in global data and search for it in datasec.
- check prog_rdonly, frozen flags.
- mmap is allowed. otherwise global timer is not possible.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h        | 36 +++++++++++++-----
 include/linux/btf.h        |  1 +
 kernel/bpf/arraymap.c      |  7 ++++
 kernel/bpf/btf.c           | 77 +++++++++++++++++++++++++++++++-------
 kernel/bpf/hashtab.c       | 53 ++++++++++++++++++++------
 kernel/bpf/helpers.c       |  2 +-
 kernel/bpf/local_storage.c |  4 +-
 kernel/bpf/syscall.c       | 23 ++++++++++--
 kernel/bpf/verifier.c      | 30 +++++++++++++--
 9 files changed, 190 insertions(+), 43 deletions(-)
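
For orientation, a sketch of the kind of map value this series enables
(illustrative names only, not code from the patch): a 'struct bpf_timer'
embedded in a map element next to ordinary data, located via BTF by the
new btf_find_timer() and recorded in map->timer_off:

	struct map_elem {
		int counter;
		struct bpf_timer timer;
	};

	struct {
		__uint(type, BPF_MAP_TYPE_HASH); /* array, hash, lru allowed */
		__uint(max_entries, 1000);
		__type(key, int);
		__type(value, struct map_elem);
	} hmap SEC(".maps");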

Comments

Andrii Nakryiko June 2, 2021, 10:34 p.m. UTC | #1
On Wed, May 26, 2021 at 9:03 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> From: Alexei Starovoitov <ast@kernel.org>
>
> Add appropriate safety checks for bpf_timer:
> - restrict to array, hash, lru. per-cpu maps cannot be supported.
> - kfree bpf_timer during map_delete_elem and map_free.
> - verifier btf checks.
> - safe interaction with lookup/update/delete operations and iterator.
> - relax the first field only requirement of the previous patch.
> - allow bpf_timer in global data and search for it in datasec.
> - check prog_rdonly, frozen flags.
> - mmap is allowed. otherwise global timer is not possible.
>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
>  include/linux/bpf.h        | 36 +++++++++++++-----
>  include/linux/btf.h        |  1 +
>  kernel/bpf/arraymap.c      |  7 ++++
>  kernel/bpf/btf.c           | 77 +++++++++++++++++++++++++++++++-------
>  kernel/bpf/hashtab.c       | 53 ++++++++++++++++++++------
>  kernel/bpf/helpers.c       |  2 +-
>  kernel/bpf/local_storage.c |  4 +-
>  kernel/bpf/syscall.c       | 23 ++++++++++--
>  kernel/bpf/verifier.c      | 30 +++++++++++++--
>  9 files changed, 190 insertions(+), 43 deletions(-)
>

[...]

>  /* copy everything but bpf_spin_lock */
>  static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
>  {
> +       u32 off = 0, size = 0;
> +
>         if (unlikely(map_value_has_spin_lock(map))) {
> -               u32 off = map->spin_lock_off;
> +               off = map->spin_lock_off;
> +               size = sizeof(struct bpf_spin_lock);
> +       } else if (unlikely(map_value_has_timer(map))) {
> +               off = map->timer_off;
> +               size = sizeof(struct bpf_timer);
> +       }

so the need to handle 0, 1, or 2 gaps seems to be the only reason to
disallow both bpf_spinlock and bpf_timer in one map element, right?
Isn't it worth addressing it from the very beginning to lift the
artificial restriction? E.g., for speed, you'd do:

if (likely(neither spinlock nor timer)) {
 /* fastest pass */
} else if (only one of spinlock or timer) {
  /* do what you do here */
} else {
  int off1, off2, sz1, sz2;

  if (spinlock_off < timer_off) {
    off1 = spinlock_off;
    sz1 = spinlock_sz;
    off2 = timer_off;
    sz2 = timer_sz;
  } else {
    ... you get the idea
  }

  memcpy(0, off1);
  memcpy(off1+sz1, off2);
  memcpy(off2+sz2, total_sz);
}

It's not that bad, right?
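
Spelled out, the three-copy branch could look like this (a sketch under
the assumption that off1 < off2 has already been established as above;
the helper name and parameters are made up for illustration):

	static inline void copy_map_value_two_gaps(struct bpf_map *map,
						   void *dst, void *src,
						   u32 off1, u32 sz1,
						   u32 off2, u32 sz2)
	{
		/* copy [0, off1), skip the first gap,
		 * copy [off1 + sz1, off2), skip the second gap,
		 * copy [off2 + sz2, value_size)
		 */
		memcpy(dst, src, off1);
		memcpy(dst + off1 + sz1, src + off1 + sz1,
		       off2 - off1 - sz1);
		memcpy(dst + off2 + sz2, src + off2 + sz2,
		       map->value_size - off2 - sz2);
	}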

>
> +       if (unlikely(size)) {
>                 memcpy(dst, src, off);
> -               memcpy(dst + off + sizeof(struct bpf_spin_lock),
> -                      src + off + sizeof(struct bpf_spin_lock),
> -                      map->value_size - off - sizeof(struct bpf_spin_lock));
> +               memcpy(dst + off + size,
> +                      src + off + size,
> +                      map->value_size - off - size);
>         } else {
>                 memcpy(dst, src, map->value_size);
>         }

[...]

> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index f386f85aee5c..0a828dc4968e 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -3241,6 +3241,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
>                         return -EACCES;
>                 }
>         }
> +       if (map_value_has_timer(map)) {
> +               u32 t = map->timer_off;
> +
> +               if (reg->smin_value + off < t + sizeof(struct bpf_timer) &&

<= ? Otherwise we allow accessing the first byte, unless I'm mistaken.

> +                    t < reg->umax_value + off + size) {
> +                       verbose(env, "bpf_timer cannot be accessed directly by load/store\n");
> +                       return -EACCES;
> +               }
> +       }
>         return err;
>  }
>
> @@ -4675,9 +4684,24 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
>                         map->name);
>                 return -EINVAL;
>         }
> -       if (val) {
> -               /* todo: relax this requirement */
> -               verbose(env, "bpf_timer field can only be first in the map value element\n");

ok, this was confusing, but now I see why you did that...

> +       if (!map_value_has_timer(map)) {
> +               if (map->timer_off == -E2BIG)
> +                       verbose(env,
> +                               "map '%s' has more than one 'struct bpf_timer'\n",
> +                               map->name);
> +               else if (map->timer_off == -ENOENT)
> +                       verbose(env,
> +                               "map '%s' doesn't have 'struct bpf_timer'\n",
> +                               map->name);
> +               else
> +                       verbose(env,
> +                               "map '%s' is not a struct type or bpf_timer is mangled\n",
> +                               map->name);
> +               return -EINVAL;
> +       }
> +       if (map->timer_off != val + reg->off) {
> +               verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n",
> +                       val + reg->off, map->timer_off);
> +               return -EINVAL;
> +       }
>         WARN_ON(meta->map_ptr);
> --
> 2.30.2
>
Andrii Nakryiko June 2, 2021, 10:38 p.m. UTC | #2
On Wed, Jun 2, 2021 at 3:34 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Wed, May 26, 2021 at 9:03 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > From: Alexei Starovoitov <ast@kernel.org>
> >
> > Add appropriate safety checks for bpf_timer:
> > - restrict to array, hash, lru. per-cpu maps cannot be supported.
> > - kfree bpf_timer during map_delete_elem and map_free.
> > - verifier btf checks.
> > - safe interaction with lookup/update/delete operations and iterator.
> > - relax the first field only requirement of the previous patch.
> > - allow bpf_timer in global data and search for it in datasec.

I'll mention it here for completeness. I don't think the safety
implications are worth it to support a timer or spinlock in
memory-mapped maps. It's way too easy to abuse them (or even
accidentally corrupt kernel state). Sure it's nice, but doing an
explicit single-element map for a "global" timer is just fine. And it
generalizes nicely to having 2, 3, ..., N timers.
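
For reference, such a single-element "global timer" map could be
declared along these lines (a sketch with made-up names, not code from
this series):

	struct elem {
		struct bpf_timer t;
	};

	struct {
		__uint(type, BPF_MAP_TYPE_ARRAY);
		__uint(max_entries, 1);
		__type(key, int);
		__type(value, struct elem);
	} global_timer SEC(".maps");

The program would then fetch the element with a plain
bpf_map_lookup_elem() on key 0 before arming the timer.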

> > - check prog_rdonly, frozen flags.
> > - mmap is allowed. otherwise global timer is not possible.
> >
> > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> > ---
> >  include/linux/bpf.h        | 36 +++++++++++++-----
> >  include/linux/btf.h        |  1 +
> >  kernel/bpf/arraymap.c      |  7 ++++
> >  kernel/bpf/btf.c           | 77 +++++++++++++++++++++++++++++++-------
> >  kernel/bpf/hashtab.c       | 53 ++++++++++++++++++++------
> >  kernel/bpf/helpers.c       |  2 +-
> >  kernel/bpf/local_storage.c |  4 +-
> >  kernel/bpf/syscall.c       | 23 ++++++++++--
> >  kernel/bpf/verifier.c      | 30 +++++++++++++--
> >  9 files changed, 190 insertions(+), 43 deletions(-)
> >
>
> [...]
>
> >  /* copy everything but bpf_spin_lock */
> >  static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
> >  {
> > +       u32 off = 0, size = 0;
> > +
> >         if (unlikely(map_value_has_spin_lock(map))) {
> > -               u32 off = map->spin_lock_off;
> > +               off = map->spin_lock_off;
> > +               size = sizeof(struct bpf_spin_lock);
> > +       } else if (unlikely(map_value_has_timer(map))) {
> > +               off = map->timer_off;
> > +               size = sizeof(struct bpf_timer);
> > +       }
>
> so the need to handle 0, 1, or 2 gaps seems to be the only reason to
> disallow both bpf_spinlock and bpf_timer in one map element, right?
> Isn't it worth addressing it from the very beginning to lift the
> artificial restriction? E.g., for speed, you'd do:
>
> if (likely(neither spinlock nor timer)) {
>  /* fastest pass */
> } else if (only one of spinlock or timer) {
>   /* do what you do here */
> } else {
>   int off1, off2, sz1, sz2;
>
>   if (spinlock_off < timer_off) {
>     off1 = spinlock_off;
>     sz1 = spinlock_sz;
>     off2 = timer_off;
>     sz2 = timer_sz;
>   } else {
>     ... you get the idea
>   }
>
>   memcpy(0, off1);
>   memcpy(off1+sz1, off2);
>   memcpy(off2+sz2, total_sz);
> }
>
> It's not that bad, right?
>
> >
> > +       if (unlikely(size)) {
> >                 memcpy(dst, src, off);
> > -               memcpy(dst + off + sizeof(struct bpf_spin_lock),
> > -                      src + off + sizeof(struct bpf_spin_lock),
> > -                      map->value_size - off - sizeof(struct bpf_spin_lock));
> > +               memcpy(dst + off + size,
> > +                      src + off + size,
> > +                      map->value_size - off - size);
> >         } else {
> >                 memcpy(dst, src, map->value_size);
> >         }
>
> [...]
>
> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > index f386f85aee5c..0a828dc4968e 100644
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> > @@ -3241,6 +3241,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
> >                         return -EACCES;
> >                 }
> >         }
> > +       if (map_value_has_timer(map)) {
> > +               u32 t = map->timer_off;
> > +
> > +               if (reg->smin_value + off < t + sizeof(struct bpf_timer) &&
>
> <= ? Otherwise we allow accessing the first byte, unless I'm mistaken.
>
> > +                    t < reg->umax_value + off + size) {
> > +                       verbose(env, "bpf_timer cannot be accessed directly by load/store\n");
> > +                       return -EACCES;
> > +               }
> > +       }
> >         return err;
> >  }
> >
> > @@ -4675,9 +4684,24 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
> >                         map->name);
> >                 return -EINVAL;
> >         }
> > -       if (val) {
> > -               /* todo: relax this requirement */
> > -               verbose(env, "bpf_timer field can only be first in the map value element\n");
>
> ok, this was confusing, but now I see why you did that...
>
> > +       if (!map_value_has_timer(map)) {
> > +               if (map->timer_off == -E2BIG)
> > +                       verbose(env,
> > +                               "map '%s' has more than one 'struct bpf_timer'\n",
> > +                               map->name);
> > +               else if (map->timer_off == -ENOENT)
> > +                       verbose(env,
> > +                               "map '%s' doesn't have 'struct bpf_timer'\n",
> > +                               map->name);
> > +               else
> > +                       verbose(env,
> > +                               "map '%s' is not a struct type or bpf_timer is mangled\n",
> > +                               map->name);
> > +               return -EINVAL;
> > +       }
> > +       if (map->timer_off != val + reg->off) {
> > +               verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n",
> > +                       val + reg->off, map->timer_off);
> > +               return -EINVAL;
> > +       }
> >         WARN_ON(meta->map_ptr);
> > --
> > 2.30.2
> >
Alexei Starovoitov June 3, 2021, 2:04 a.m. UTC | #3
On Wed, Jun 02, 2021 at 03:34:29PM -0700, Andrii Nakryiko wrote:
>
> >  /* copy everything but bpf_spin_lock */
> >  static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
> >  {
> > +       u32 off = 0, size = 0;
> > +
> >         if (unlikely(map_value_has_spin_lock(map))) {
> > -               u32 off = map->spin_lock_off;
> > +               off = map->spin_lock_off;
> > +               size = sizeof(struct bpf_spin_lock);
> > +       } else if (unlikely(map_value_has_timer(map))) {
> > +               off = map->timer_off;
> > +               size = sizeof(struct bpf_timer);
> > +       }
>
> so the need to handle 0, 1, or 2 gaps seems to be the only reason to
> disallow both bpf_spinlock and bpf_timer in one map element, right?

exactly.

> Isn't it worth addressing it from the very beginning to lift the
> artificial restriction? E.g., for speed, you'd do:
>
> if (likely(neither spinlock nor timer)) {
>  /* fastest pass */
> } else if (only one of spinlock or timer) {
>   /* do what you do here */
> } else {
>   int off1, off2, sz1, sz2;
>
>   if (spinlock_off < timer_off) {
>     off1 = spinlock_off;
>     sz1 = spinlock_sz;
>     off2 = timer_off;
>     sz2 = timer_sz;
>   } else {
>     ... you get the idea

Not really :)
Are you suggesting to support one bpf_spin_lock and one
bpf_timer inside single map element, but not two spin_locks
and/or not two bpf_timers?
To me it's either one or support any.
Anything in-between doesn't seem worth extra code.

> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > index f386f85aee5c..0a828dc4968e 100644
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> > @@ -3241,6 +3241,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
> >                         return -EACCES;
> >                 }
> >         }
> > +       if (map_value_has_timer(map)) {
> > +               u32 t = map->timer_off;
> > +
> > +               if (reg->smin_value + off < t + sizeof(struct bpf_timer) &&
>
> <= ? Otherwise we allow accessing the first byte, unless I'm mistaken.

I don't think so. See the comment above in if (map_value_has_spin_lock(map))
I didn't copy-paste it, because it's the same logic.

> > -       if (val) {
> > -               /* todo: relax this requirement */
> > -               verbose(env, "bpf_timer field can only be first in the map value element\n");
>
> ok, this was confusing, but now I see why you did that...

I'll clarify the comment to say that the next patch fixes it.
Andrii Nakryiko June 3, 2021, 5:35 p.m. UTC | #4
On Wed, Jun 2, 2021 at 7:04 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Jun 02, 2021 at 03:34:29PM -0700, Andrii Nakryiko wrote:
> >
> > >  /* copy everything but bpf_spin_lock */
> > >  static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
> > >  {
> > > +       u32 off = 0, size = 0;
> > > +
> > >         if (unlikely(map_value_has_spin_lock(map))) {
> > > -               u32 off = map->spin_lock_off;
> > > +               off = map->spin_lock_off;
> > > +               size = sizeof(struct bpf_spin_lock);
> > > +       } else if (unlikely(map_value_has_timer(map))) {
> > > +               off = map->timer_off;
> > > +               size = sizeof(struct bpf_timer);
> > > +       }
> >
> > so the need to handle 0, 1, or 2 gaps seems to be the only reason to
> > disallow both bpf_spinlock and bpf_timer in one map element, right?
>
> exactly.
>
> > Isn't it worth addressing it from the very beginning to lift the
> > artificial restriction? E.g., for speed, you'd do:
> >
> > if (likely(neither spinlock nor timer)) {
> >  /* fastest pass */
> > } else if (only one of spinlock or timer) {
> >   /* do what you do here */
> > } else {
> >   int off1, off2, sz1, sz2;
> >
> >   if (spinlock_off < timer_off) {
> >     off1 = spinlock_off;
> >     sz1 = spinlock_sz;
> >     off2 = timer_off;
> >     sz2 = timer_sz;
> >   } else {
> >     ... you get the idea
>
> Not really :)

hm, really? I meant that else will be:

     off1 = timer_off;
     sz1 = timer_sz;
     off2 = spinlock_off;
     sz2 = spinlock_sz;

Just making sure that off1 < off2 always and sz1 and sz2 are matching

> Are you suggesting to support one bpf_spin_lock and one
> bpf_timer inside single map element, but not two spin_locks
> and/or not two bpf_timers?

Yes, exactly. I see bpf_spinlock and bpf_timer as two independent
orthogonal features and I don't understand why we restrict using just
one of them in a given map element. I think those 20 lines of code
that decouple them and remove an artificial restriction that users need
to remember (or discover with surprise) are totally worth it.

> To me it's either one or support any.

I think it's fine to start with supporting one. But one of each. They
are independent of each other.

> Anything in-between doesn't seem worth extra code.

Up to you, but I disagree, obviously. It's possible to work around
that limitation with extra maps/complexity, so if I ever need to both
lock an element and schedule the timer with it, it's not going to stop
me. :)

>
> > > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > > index f386f85aee5c..0a828dc4968e 100644
> > > --- a/kernel/bpf/verifier.c
> > > +++ b/kernel/bpf/verifier.c
> > > @@ -3241,6 +3241,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
> > >                         return -EACCES;
> > >                 }
> > >         }
> > > +       if (map_value_has_timer(map)) {
> > > +               u32 t = map->timer_off;
> > > +
> > > +               if (reg->smin_value + off < t + sizeof(struct bpf_timer) &&
> >
> > <= ? Otherwise we allow accessing the first byte, unless I'm mistaken.
>
> I don't think so. See the comment above in if (map_value_has_spin_lock(map))
> I didn't copy-paste it, because it's the same logic.

Oh, I didn't realize that this is the interval intersection check I
suggested a long time ago :) yeah, that still looks correct
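
For completeness: the check is the standard half-open interval
intersection test. Ranges [a, b) and [c, d) intersect iff a < d && c < b;
here a = reg->smin_value + off, b = reg->umax_value + off + size is the
possible access range and c = t, d = t + sizeof(struct bpf_timer) is the
timer field, so the strict '<' on both sides already rejects an access
that touches only the first byte. In generic form (the helper name is
illustrative, not from the patch):

	static bool ranges_overlap(s64 a, s64 b, s64 c, s64 d)
	{
		/* half-open [a, b) and [c, d) overlap iff each
		 * range starts before the other one ends
		 */
		return a < d && c < b;
	}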

>
> > > -       if (val) {
> > > -               /* todo: relax this requirement */
> > > -               verbose(env, "bpf_timer field can only be first in the map value element\n");
> >
> > ok, this was confusing, but now I see why you did that...
>
> I'll clarify the comment to say that the next patch fixes it.

ok, thanks

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 925b8416ea0a..7deff4438808 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -168,6 +168,7 @@  struct bpf_map {
 	u32 max_entries;
 	u32 map_flags;
 	int spin_lock_off; /* >=0 valid offset, <0 error */
+	int timer_off; /* >=0 valid offset, <0 error */
 	u32 id;
 	int numa_node;
 	u32 btf_key_type_id;
@@ -197,24 +198,41 @@  static inline bool map_value_has_spin_lock(const struct bpf_map *map)
 	return map->spin_lock_off >= 0;
 }
 
-static inline void check_and_init_map_lock(struct bpf_map *map, void *dst)
+static inline bool map_value_has_timer(const struct bpf_map *map)
 {
-	if (likely(!map_value_has_spin_lock(map)))
-		return;
-	*(struct bpf_spin_lock *)(dst + map->spin_lock_off) =
-		(struct bpf_spin_lock){};
+	return map->timer_off >= 0;
+}
+
+void bpf_timer_cancel_and_free(void *timer);
+
+static inline void check_and_init_map_value(struct bpf_map *map, void *dst)
+{
+	if (unlikely(map_value_has_spin_lock(map)))
+		*(struct bpf_spin_lock *)(dst + map->spin_lock_off) =
+			(struct bpf_spin_lock){};
+	if (unlikely(map_value_has_timer(map)))
+		*(struct bpf_timer *)(dst + map->timer_off) =
+			(struct bpf_timer){};
 }
 
 /* copy everything but bpf_spin_lock */
 static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
 {
+	u32 off = 0, size = 0;
+
 	if (unlikely(map_value_has_spin_lock(map))) {
-		u32 off = map->spin_lock_off;
+		off = map->spin_lock_off;
+		size = sizeof(struct bpf_spin_lock);
+	} else if (unlikely(map_value_has_timer(map))) {
+		off = map->timer_off;
+		size = sizeof(struct bpf_timer);
+	}
 
+	if (unlikely(size)) {
 		memcpy(dst, src, off);
-		memcpy(dst + off + sizeof(struct bpf_spin_lock),
-		       src + off + sizeof(struct bpf_spin_lock),
-		       map->value_size - off - sizeof(struct bpf_spin_lock));
+		memcpy(dst + off + size,
+		       src + off + size,
+		       map->value_size - off - size);
 	} else {
 		memcpy(dst, src, map->value_size);
 	}
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 94a0c976c90f..214fde93214b 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -99,6 +99,7 @@  bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
 			   const struct btf_member *m,
 			   u32 expected_offset, u32 expected_size);
 int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t);
+int btf_find_timer(const struct btf *btf, const struct btf_type *t);
 bool btf_type_is_void(const struct btf_type *t);
 s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind);
 const struct btf_type *btf_type_skip_modifiers(const struct btf *btf,
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 3c4105603f9d..3fedd9209770 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -378,10 +378,17 @@  static void *array_map_vmalloc_addr(struct bpf_array *array)
 static void array_map_free(struct bpf_map *map)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	int i;
 
 	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
 		bpf_array_free_percpu(array);
 
+	if (unlikely(map_value_has_timer(map)))
+		for (i = 0; i < array->map.max_entries; i++)
+			bpf_timer_cancel_and_free(array->value +
+						  array->elem_size * i +
+						  map->timer_off);
+
 	if (array->map.map_flags & BPF_F_MMAPABLE)
 		bpf_map_area_free(array_map_vmalloc_addr(array));
 	else
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index a6e39c5ea0bf..28a8014b8379 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3046,43 +3046,92 @@  static void btf_struct_log(struct btf_verifier_env *env,
 	btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
 }
 
-/* find 'struct bpf_spin_lock' in map value.
- * return >= 0 offset if found
- * and < 0 in case of error
- */
-int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t,
+				 const char *name, int sz, int align)
 {
 	const struct btf_member *member;
 	u32 i, off = -ENOENT;
 
-	if (!__btf_type_is_struct(t))
-		return -EINVAL;
-
 	for_each_member(i, t, member) {
 		const struct btf_type *member_type = btf_type_by_id(btf,
 								    member->type);
 		if (!__btf_type_is_struct(member_type))
 			continue;
-		if (member_type->size != sizeof(struct bpf_spin_lock))
+		if (member_type->size != sz)
 			continue;
-		if (strcmp(__btf_name_by_offset(btf, member_type->name_off),
-			   "bpf_spin_lock"))
+		if (strcmp(__btf_name_by_offset(btf, member_type->name_off), name))
 			continue;
 		if (off != -ENOENT)
-			/* only one 'struct bpf_spin_lock' is allowed */
+			/* only one such field is allowed */
 			return -E2BIG;
 		off = btf_member_bit_offset(t, member);
 		if (off % 8)
 			/* valid C code cannot generate such BTF */
 			return -EINVAL;
 		off /= 8;
-		if (off % __alignof__(struct bpf_spin_lock))
-			/* valid struct bpf_spin_lock will be 4 byte aligned */
+		if (off % align)
+			return -EINVAL;
+	}
+	return off;
+}
+
+static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
+				const char *name, int sz, int align)
+{
+	const struct btf_var_secinfo *vsi;
+	u32 i, off = -ENOENT;
+
+	for_each_vsi(i, t, vsi) {
+		const struct btf_type *var = btf_type_by_id(btf, vsi->type);
+		const struct btf_type *var_type = btf_type_by_id(btf, var->type);
+
+		if (!__btf_type_is_struct(var_type))
+			continue;
+		if (var_type->size != sz)
+			continue;
+		if (vsi->size != sz)
+			continue;
+		if (strcmp(__btf_name_by_offset(btf, var_type->name_off), name))
+			continue;
+		if (off != -ENOENT)
+			/* only one such field is allowed */
+			return -E2BIG;
+		off = vsi->offset;
+		if (off % align)
 			return -EINVAL;
 	}
 	return off;
 }
 
+static int btf_find_field(const struct btf *btf, const struct btf_type *t,
+			  const char *name, int sz, int align)
+{
+
+	if (__btf_type_is_struct(t))
+		return btf_find_struct_field(btf, t, name, sz, align);
+	else if (btf_type_is_datasec(t))
+		return btf_find_datasec_var(btf, t, name, sz, align);
+	return -EINVAL;
+}
+
+/* find 'struct bpf_spin_lock' in map value.
+ * return >= 0 offset if found
+ * and < 0 in case of error
+ */
+int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+{
+	return btf_find_field(btf, t, "bpf_spin_lock",
+			      sizeof(struct bpf_spin_lock),
+			      __alignof__(struct bpf_spin_lock));
+}
+
+int btf_find_timer(const struct btf *btf, const struct btf_type *t)
+{
+	return btf_find_field(btf, t, "bpf_timer",
+			      sizeof(struct bpf_timer),
+			      __alignof__(struct bpf_timer));
+}
+
 static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
 			      u32 type_id, void *data, u8 bits_offset,
 			      struct btf_show *show)
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 6f6681b07364..28d66fa74780 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -244,6 +244,17 @@  static void htab_free_elems(struct bpf_htab *htab)
 		cond_resched();
 	}
 free_elems:
+	if (unlikely(map_value_has_timer(&htab->map)))
+		for (i = 0; i < htab->map.max_entries; i++) {
+			struct htab_elem *elem;
+
+			elem = get_htab_elem(htab, i);
+			bpf_timer_cancel_and_free(elem->key +
+						  round_up(htab->map.key_size, 8) +
+						  htab->map.timer_off);
+			cond_resched();
+		}
+
 	bpf_map_area_free(htab->elems);
 }
 
@@ -265,8 +276,11 @@  static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
 	struct htab_elem *l;
 
 	if (node) {
+		u32 key_size = htab->map.key_size;
 		l = container_of(node, struct htab_elem, lru_node);
-		memcpy(l->key, key, htab->map.key_size);
+		memcpy(l->key, key, key_size);
+		check_and_init_map_value(&htab->map,
+					 l->key + round_up(key_size, 8));
 		return l;
 	}
 
@@ -785,10 +799,19 @@  static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	return -ENOENT;
 }
 
+static void check_and_free_timer(struct bpf_htab *htab, struct htab_elem *elem)
+{
+	if (unlikely(map_value_has_timer(&htab->map)))
+		bpf_timer_cancel_and_free(elem->key +
+					  round_up(htab->map.key_size, 8) +
+					  htab->map.timer_off);
+}
+
 static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
 {
 	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
 		free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
+	check_and_free_timer(htab, l);
 	kfree(l);
 }
 
@@ -816,6 +839,7 @@  static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 	htab_put_fd_value(htab, l);
 
 	if (htab_is_prealloc(htab)) {
+		check_and_free_timer(htab, l);
 		__pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
 		atomic_dec(&htab->count);
@@ -919,8 +943,8 @@  static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 			l_new = ERR_PTR(-ENOMEM);
 			goto dec_count;
 		}
-		check_and_init_map_lock(&htab->map,
-					l_new->key + round_up(key_size, 8));
+		check_and_init_map_value(&htab->map,
+					 l_new->key + round_up(key_size, 8));
 	}
 
 	memcpy(l_new->key, key, key_size);
@@ -1067,6 +1091,12 @@  static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	return ret;
 }
 
+static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
+{
+	check_and_free_timer(htab, elem);
+	bpf_lru_push_free(&htab->lru, &elem->lru_node);
+}
+
 static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
 				    u64 map_flags)
 {
@@ -1099,7 +1129,8 @@  static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
 	l_new = prealloc_lru_pop(htab, key, hash);
 	if (!l_new)
 		return -ENOMEM;
-	memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);
+	copy_map_value(&htab->map,
+		       l_new->key + round_up(map->key_size, 8), value);
 
 	ret = htab_lock_bucket(htab, b, hash, &flags);
 	if (ret)
@@ -1125,9 +1156,9 @@  static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
 	htab_unlock_bucket(htab, b, hash, flags);
 
 	if (ret)
-		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
+		htab_lru_push_free(htab, l_new);
 	else if (l_old)
-		bpf_lru_push_free(&htab->lru, &l_old->lru_node);
+		htab_lru_push_free(htab, l_old);
 
 	return ret;
 }
@@ -1332,7 +1363,7 @@  static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
 
 	htab_unlock_bucket(htab, b, hash, flags);
 	if (l)
-		bpf_lru_push_free(&htab->lru, &l->lru_node);
+		htab_lru_push_free(htab, l);
 	return ret;
 }
 
@@ -1449,7 +1480,7 @@  static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
 			else
 				copy_map_value(map, value, l->key +
 					       roundup_key_size);
-			check_and_init_map_lock(map, value);
+			check_and_init_map_value(map, value);
 		}
 
 		hlist_nulls_del_rcu(&l->hash_node);
@@ -1460,7 +1491,7 @@  static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
 	htab_unlock_bucket(htab, b, hash, bflags);
 
 	if (is_lru_map && l)
-		bpf_lru_push_free(&htab->lru, &l->lru_node);
+		htab_lru_push_free(htab, l);
 
 	return ret;
 }
@@ -1638,7 +1669,7 @@  __htab_map_lookup_and_delete_batch(struct bpf_map *map,
 						      true);
 			else
 				copy_map_value(map, dst_val, value);
-			check_and_init_map_lock(map, dst_val);
+			check_and_init_map_value(map, dst_val);
 		}
 		if (do_delete) {
 			hlist_nulls_del_rcu(&l->hash_node);
@@ -1665,7 +1696,7 @@  __htab_map_lookup_and_delete_batch(struct bpf_map *map,
 	while (node_to_free) {
 		l = node_to_free;
 		node_to_free = node_to_free->batch_flink;
-		bpf_lru_push_free(&htab->lru, &l->lru_node);
+		htab_lru_push_free(htab, l);
 	}
 
 next_batch:
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 6f9620cbe95d..8580b7bfc8bb 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1039,7 +1039,7 @@  BPF_CALL_5(bpf_timer_init, struct bpf_timer_kern *, timer, void *, cb, int, flag
 	if (!t)
 		return -ENOMEM;
 	t->callback_fn = cb;
-	t->value = (void *)timer /* - offset of bpf_timer inside elem */;
+	t->value = (void *)timer - map->timer_off;
 	t->key = t->value - round_up(map->key_size, 8);
 	t->map = map;
 	t->prog = prog;
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index bd11db9774c3..95d70a08325d 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -173,7 +173,7 @@  static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
 		return -ENOMEM;
 
 	memcpy(&new->data[0], value, map->value_size);
-	check_and_init_map_lock(map, new->data);
+	check_and_init_map_value(map, new->data);
 
 	new = xchg(&storage->buf, new);
 	kfree_rcu(new, rcu);
@@ -509,7 +509,7 @@  struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
 						    map->numa_node);
 		if (!storage->buf)
 			goto enomem;
-		check_and_init_map_lock(map, storage->buf->data);
+		check_and_init_map_value(map, storage->buf->data);
 	} else {
 		storage->percpu_buf = bpf_map_alloc_percpu(map, size, 8, gfp);
 		if (!storage->percpu_buf)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 50457019da27..d49ba04d549e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -259,8 +259,8 @@  static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
 				copy_map_value_locked(map, value, ptr, true);
 			else
 				copy_map_value(map, value, ptr);
-			/* mask lock, since value wasn't zero inited */
-			check_and_init_map_lock(map, value);
+			/* mask lock and timer, since value wasn't zero inited */
+			check_and_init_map_value(map, value);
 		}
 		rcu_read_unlock();
 	}
@@ -792,6 +792,21 @@  static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 		}
 	}
 
+	map->timer_off = btf_find_timer(btf, value_type);
+	if (map_value_has_timer(map)) {
+		if (map->map_flags & BPF_F_RDONLY_PROG)
+			return -EACCES;
+		if (map->map_type != BPF_MAP_TYPE_HASH &&
+		    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+		    map->map_type != BPF_MAP_TYPE_ARRAY)
+			return -ENOTSUPP;
+		if (map_value_has_spin_lock(map))
+			/* map values with bpf_spin_lock and bpf_timer
+			 * are not supported yet.
+			 */
+			return -EOPNOTSUPP;
+	}
+
 	if (map->ops->map_check_btf)
 		ret = map->ops->map_check_btf(map, btf, key_type, value_type);
 
@@ -843,6 +858,7 @@  static int map_create(union bpf_attr *attr)
 	mutex_init(&map->freeze_mutex);
 
 	map->spin_lock_off = -EINVAL;
+	map->timer_off = -EINVAL;
 	if (attr->btf_key_type_id || attr->btf_value_type_id ||
 	    /* Even the map's value is a kernel's struct,
 	     * the bpf_prog.o must have BTF to begin with
@@ -1590,7 +1606,8 @@  static int map_freeze(const union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
-	if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+	if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS ||
+	    map_value_has_timer(map)) {
 		fdput(f);
 		return -ENOTSUPP;
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f386f85aee5c..0a828dc4968e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3241,6 +3241,15 @@  static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 			return -EACCES;
 		}
 	}
+	if (map_value_has_timer(map)) {
+		u32 t = map->timer_off;
+
+		if (reg->smin_value + off < t + sizeof(struct bpf_timer) &&
+		     t < reg->umax_value + off + size) {
+			verbose(env, "bpf_timer cannot be accessed directly by load/store\n");
+			return -EACCES;
+		}
+	}
 	return err;
 }
 
@@ -4675,9 +4684,24 @@  static int process_timer_func(struct bpf_verifier_env *env, int regno,
 			map->name);
 		return -EINVAL;
 	}
-	if (val) {
-		/* todo: relax this requirement */
-		verbose(env, "bpf_timer field can only be first in the map value element\n");
+	if (!map_value_has_timer(map)) {
+		if (map->timer_off == -E2BIG)
+			verbose(env,
+				"map '%s' has more than one 'struct bpf_timer'\n",
+				map->name);
+		else if (map->timer_off == -ENOENT)
+			verbose(env,
+				"map '%s' doesn't have 'struct bpf_timer'\n",
+				map->name);
+		else
+			verbose(env,
+				"map '%s' is not a struct type or bpf_timer is mangled\n",
+				map->name);
+		return -EINVAL;
+	}
+	if (map->timer_off != val + reg->off) {
+		verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n",
+			val + reg->off, map->timer_off);
 		return -EINVAL;
 	}
 	WARN_ON(meta->map_ptr);