diff mbox series

[v18,2/5] random: add vgetrandom_alloc() syscall

Message ID 20240620005339.1273434-3-Jason@zx2c4.com
State New
Headers show
Series implement getrandom() in vDSO | expand

Commit Message

Jason A. Donenfeld June 20, 2024, 12:53 a.m. UTC
The vDSO getrandom() works over an opaque per-thread state of an
unexported size, which must be marked VM_WIPEONFORK, VM_DONTDUMP,
VM_NORESERVE, and VM_DROPPABLE for proper operation. Over time, the
nuances of these allocations may change or grow or even differ based on
architectural features.

The syscall has the signature:

  void *vgetrandom_alloc(unsigned int *num, unsigned int *size_per_each,
                         unsigned long addr, unsigned int flags);

This takes a hinted number of opaque states in `num`, and returns a
pointer to an array of opaque states, the number actually allocated back
in `num`, and the size in bytes of each one in `size_per_each`, enabling
a libc to slice up the returned array into a state per each thread,
while ensuring that no single state straddles a page boundary. (The
`flags` and `addr` arguments, as well as the `*size_per_each` input
value, are reserved for the future and are forced to be zero zero for
now.)

Libc is expected to allocate a chunk of these on first use, and then
dole them out to threads as they're created, allocating more when
needed. The returned address of the first state may be passed to
munmap(2) with a length of `DIV_ROUND_UP(num, PAGE_SIZE / size_per_each)
* PAGE_SIZE`, in order to deallocate the memory.

We very intentionally do *not* leave state allocation for vDSO
getrandom() up to userspace itself, but rather provide this new syscall
for such allocations. vDSO getrandom() must not store its state in just
any old memory address, but rather just ones that the kernel specially
allocates for it, leaving the particularities of those allocations up to
the kernel.

The allocation of states is intended to be integrated into libc's thread
management. As an illustrative example, the following code might be used
to do the same outside of libc. Though, vgetrandom_alloc() is not
expected to be exposed outside of libc, and the pthread usage here is
expected to be elided into libc internals. This allocation scheme is
very naive and does not shrink; other implementations may choose to be
more complex.

  static void *vgetrandom_alloc(unsigned int *num, unsigned int *size_per_each)
  {
    *size_per_each = 0; /* Must be zero on input. */
    return (void *)syscall(__NR_vgetrandom_alloc, &num, &size_per_each,
                           0 /* reserved @addr */, 0 /* reserved @flags */);
  }

  static struct {
    pthread_mutex_t lock;
    void **states;
    size_t len, cap, size_per_each;
  } grnd_allocator = {
    .lock = PTHREAD_MUTEX_INITIALIZER
  };

  static void *vgetrandom_get_state(void)
  {
    void *state = NULL;

    pthread_mutex_lock(&grnd_allocator.lock);
    if (!grnd_allocator.len) {
      size_t new_cap;
      size_t page_size = getpagesize();
      unsigned int num = sysconf(_SC_NPROCESSORS_ONLN); /* Could be arbitrary, just a hint. */
      unsigned int size_per_each;
      void *new_block = vgetrandom_alloc(&num, &size_per_each);
      void *new_states;

      if (new_block == MAP_FAILED)
        goto out;
      if (grnd_allocator.size_per_each && grnd_allocator.size_per_each != size_per_each)
        goto unmap;
      grnd_allocator.size_per_each = size_per_each;
      new_cap = grnd_allocator.cap + num;
      new_states = reallocarray(grnd_allocator.states, new_cap, sizeof(*grnd_allocator.states));
      if (!new_states)
        goto unmap;
      grnd_allocator.cap = new_cap;
      grnd_allocator.states = new_states;

      for (size_t i = 0; i < num; ++i) {
        grnd_allocator.states[i] = new_block;
        if (((uintptr_t)new_block & (page_size - 1)) + size_per_each > page_size)
          new_block = (void *)(((uintptr_t)new_block + page_size) & (page_size - 1));
        else
          new_block += size_per_each;
      }
      grnd_allocator.len = num;
      goto success;

    unmap:
      munmap(new_block, DIV_ROUND_UP(num, page_size / size_per_each) * page_size);
      goto out;
    }
  success:
    state = grnd_allocator.states[--grnd_allocator.len];

  out:
    pthread_mutex_unlock(&grnd_allocator.lock);
    return state;
  }

  static void vgetrandom_put_state(void *state)
  {
    if (!state)
      return;
    pthread_mutex_lock(&grnd_allocator.lock);
    grnd_allocator.states[grnd_allocator.len++] = state;
    pthread_mutex_unlock(&grnd_allocator.lock);
  }

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 MAINTAINERS              |   1 +
 drivers/char/random.c    | 135 ++++++++++++++++++++++++++++++++++++++-
 include/linux/syscalls.h |   3 +
 include/vdso/getrandom.h |  16 +++++
 kernel/sys_ni.c          |   3 +
 lib/vdso/Kconfig         |   6 ++
 6 files changed, 163 insertions(+), 1 deletion(-)
 create mode 100644 include/vdso/getrandom.h

Comments

Thomas Gleixner June 28, 2024, 1:56 p.m. UTC | #1
Jason!

On Thu, Jun 20 2024 at 14:18, Jason A. Donenfeld wrote:
> On Wed, Jun 19, 2024 at 07:13:26PM -0700, Aleksa Sarai wrote:
>> Then again, I guess since libc is planned to be the primary user,
>> creating a new syscall in a decade if necessary is probably not that big
>> of an issue.
>
> I'm not sure going the whole big struct thing is really necessary, and
> for an additional reason: this is only meant to be used with the vDSO
> function, which is also coupled with the kernel. It doesn't return
> information that's made to be used (or allowed to be used) anywhere
> else. So both the vdso code and the syscall code are part of the same
> basic thing that will evolve together. So I'm not convinced extensible
> struct really makes sense for this, as neat as it is.
>
> If there's wide consensus that it's desirable, in contrast to what I'm
> saying, I'm not vehemently opposed to it and could do it, but it just
> seems like massive overkill and not at all necessary. Things are
> intentionally as simple and straightforward as can be.

Right, but the problem is that this is a syscall, so people are free to
explore it even without the vdso part. Now when you want to change it
later then you are caught in the no-regression trap.

So making it extensible with backwards compability in place (add the
unused flag field and check for 0) will allow you to expand without
breaking users.

Thanks,

        tglx
Jason A. Donenfeld June 28, 2024, 2:09 p.m. UTC | #2
On Fri, Jun 28, 2024 at 03:56:12PM +0200, Thomas Gleixner wrote:
> Jason!
> 
> On Thu, Jun 20 2024 at 14:18, Jason A. Donenfeld wrote:
> > On Wed, Jun 19, 2024 at 07:13:26PM -0700, Aleksa Sarai wrote:
> >> Then again, I guess since libc is planned to be the primary user,
> >> creating a new syscall in a decade if necessary is probably not that big
> >> of an issue.
> >
> > I'm not sure going the whole big struct thing is really necessary, and
> > for an additional reason: this is only meant to be used with the vDSO
> > function, which is also coupled with the kernel. It doesn't return
> > information that's made to be used (or allowed to be used) anywhere
> > else. So both the vdso code and the syscall code are part of the same
> > basic thing that will evolve together. So I'm not convinced extensible
> > struct really makes sense for this, as neat as it is.
> >
> > If there's wide consensus that it's desirable, in contrast to what I'm
> > saying, I'm not vehemently opposed to it and could do it, but it just
> > seems like massive overkill and not at all necessary. Things are
> > intentionally as simple and straightforward as can be.
> 
> Right, but the problem is that this is a syscall, so people are free to
> explore it even without the vdso part. Now when you want to change it
> later then you are caught in the no-regression trap.
> 
> So making it extensible with backwards compability in place (add the
> unused flag field and check for 0) will allow you to expand without
> breaking users.

Okay, so it sounds like you're also in camp-struct. I guess let's do it
then. This opens up a few questions, but I think we can get them sorted.
Right now this version of the patch has this signature:

  void *vgetrandom_alloc(unsigned int *num, unsigned int *size_per_each,
                         unsigned long addr, unsigned int flags);

The semantics are currently:

- [in] unsigned int num - desired number of states
- [in] unsigned long addr - reserved, nothing
- [in] unsigned int flags - reserved, nothing
- [out] unsigned int num - actual number of states
- [out] unsigned int size_per_each - size of each state
- [out] void* return value - the allocated thing

Following Aleksa's suggestion, we keep the `[out] void* return value` as
a return value, but move all the other into a struct:

    void *vgetrandom_alloc(struct vgetrandom_args *arg, size_t size);

So now the struct can become:

    struct vgetrandom_args {
        [in] u64 flags;
        [in/out] u32 num;
        [out] u32 size_per_each;
    }

Alternatively, this now opens the possibility to incorporate Eric's
suggestion of also returning the number of allocated bytes, which is
perhaps definitely to deal with, but I didn't do because I wanted
symmetry in the argument list. So doing that, now we have:

    struct vgetrandom_args {
        [in] u64 flags;
        [in/out] u32 num;
        [out] u32 size_per_each;
        [out] u64 bytes_allocated;
    }

Does that seem reasonable? There's a little bit of mixing of ins and
outs within the struct, and the return value is still a return value,
rather than a `[out] void *ret` inside of the struct. But maybe that's
fine. Also I used u32 there for the two smaller arguments, but maybe
that's silly and we should go straight to u64?

Anyway, how does that look to you?

Jason
Jason A. Donenfeld June 28, 2024, 2:11 p.m. UTC | #3
On Fri, Jun 28, 2024 at 4:09 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> perhaps definitely to deal with,

What?! I meant "definitely easier to deal with".
Jason A. Donenfeld July 1, 2024, 11:53 a.m. UTC | #4
On Fri, Jun 28, 2024 at 04:09:01PM +0200, Jason A. Donenfeld wrote:
> fine. Also I used u32 there for the two smaller arguments, but maybe
> that's silly and we should go straight to u64?

Judging by `struct clone_args`, it looks like I've got to use
__aligned_u64 for every argument:

    struct clone_args {
        __aligned_u64 flags;
        __aligned_u64 pidfd;
        __aligned_u64 child_tid;
        __aligned_u64 parent_tid;
        __aligned_u64 exit_signal;
        __aligned_u64 stack;
        __aligned_u64 stack_size;
        __aligned_u64 tls;
        __aligned_u64 set_tid;
        __aligned_u64 set_tid_size;
        __aligned_u64 cgroup;
    };
    #define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
    #define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
    #define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
    
So okay, I'll do that, and will have an ARGS_SIZE_VER0 macro too.

Jason
Jason A. Donenfeld July 1, 2024, 1:59 p.m. UTC | #5
On Mon, Jul 1, 2024 at 1:53 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
>
> On Fri, Jun 28, 2024 at 04:09:01PM +0200, Jason A. Donenfeld wrote:
> > fine. Also I used u32 there for the two smaller arguments, but maybe
> > that's silly and we should go straight to u64?
>
> Judging by `struct clone_args`, it looks like I've got to use
> __aligned_u64 for every argument:
>
>     struct clone_args {
>         __aligned_u64 flags;
>         __aligned_u64 pidfd;
>         __aligned_u64 child_tid;
>         __aligned_u64 parent_tid;
>         __aligned_u64 exit_signal;
>         __aligned_u64 stack;
>         __aligned_u64 stack_size;
>         __aligned_u64 tls;
>         __aligned_u64 set_tid;
>         __aligned_u64 set_tid_size;
>         __aligned_u64 cgroup;
>     };
>     #define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
>     #define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
>     #define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
>
> So okay, I'll do that, and will have an ARGS_SIZE_VER0 macro too.

This is now covered by v19 of this patchset:
https://lore.kernel.org/lkml/20240701135801.3698-1-Jason@zx2c4.com/
diff mbox series

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index 8aa17e515ef3..8480c4c39915 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -18747,6 +18747,7 @@  T:	git https://git.kernel.org/pub/scm/linux/kernel/git/crng/random.git
 F:	Documentation/devicetree/bindings/rng/microsoft,vmgenid.yaml
 F:	drivers/char/random.c
 F:	drivers/virt/vmgenid.c
+F:	include/vdso/getrandom.h
 
 RAPIDIO SUBSYSTEM
 M:	Matt Porter <mporter@kernel.crashing.org>
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 2597cb43f438..ccb35f390c85 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1,6 +1,6 @@ 
 // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
 /*
- * Copyright (C) 2017-2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (C) 2017-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
  * Copyright Matt Mackall <mpm@selenic.com>, 2003, 2004, 2005
  * Copyright Theodore Ts'o, 1994, 1995, 1996, 1997, 1998, 1999. All rights reserved.
  *
@@ -8,6 +8,7 @@ 
  * into roughly six sections, each with a section header:
  *
  *   - Initialization and readiness waiting.
+ *   - vDSO support helpers.
  *   - Fast key erasure RNG, the "crng".
  *   - Entropy accumulation and extraction routines.
  *   - Entropy collection routines.
@@ -39,6 +40,7 @@ 
 #include <linux/blkdev.h>
 #include <linux/interrupt.h>
 #include <linux/mm.h>
+#include <linux/mman.h>
 #include <linux/nodemask.h>
 #include <linux/spinlock.h>
 #include <linux/kthread.h>
@@ -56,6 +58,9 @@ 
 #include <linux/sched/isolation.h>
 #include <crypto/chacha.h>
 #include <crypto/blake2s.h>
+#ifdef CONFIG_VDSO_GETRANDOM
+#include <vdso/getrandom.h>
+#endif
 #include <asm/archrandom.h>
 #include <asm/processor.h>
 #include <asm/irq.h>
@@ -169,6 +174,134 @@  int __cold execute_with_initialized_rng(struct notifier_block *nb)
 				__func__, (void *)_RET_IP_, crng_init)
 
 
+
+/********************************************************************
+ *
+ * vDSO support helpers.
+ *
+ * The actual vDSO function is defined over in lib/vdso/getrandom.c,
+ * but this section contains the kernel-mode helpers to support that.
+ *
+ ********************************************************************/
+
+#ifdef CONFIG_VDSO_GETRANDOM
+/**
+ * sys_vgetrandom_alloc - Allocate opaque states for use with vDSO getrandom().
+ *
+ * @num:	   On input, a pointer to a suggested hint of how many states to
+ * 		   allocate, and on return the number of states actually allocated.
+ *
+ * @size_per_each: On input, must be zero. On return, the size of each state allocated,
+ * 		   so that the caller can split up the returned allocation into
+ * 		   individual states.
+ *
+ * @addr:	   Reserved, must be zero.
+ *
+ * @flags:	   Reserved, must be zero.
+ *
+ * The getrandom() vDSO function in userspace requires an opaque state, which
+ * this function allocates by mapping a certain number of special pages into
+ * the calling process. It takes a hint as to the number of opaque states
+ * desired, and provides the caller with the number of opaque states actually
+ * allocated, the size of each one in bytes, and the address of the first
+ * state, which may be split up into @num states of @size_per_each bytes each,
+ * by adding @size_per_each to the returned first state @num times, while
+ * ensuring that no single state straddles a page boundary.
+ *
+ * Returns the address of the first state in the allocation on success, or a
+ * negative error value on failure.
+ *
+ * The returned address of the first state may be passed to munmap(2) with a
+ * length of `DIV_ROUND_UP(num, PAGE_SIZE / size_per_each) * PAGE_SIZE`, in
+ * order to deallocate the memory, after which it is invalid to pass it to vDSO
+ * getrandom().
+ *
+ * States allocated by this function must not be dereferenced, written, read,
+ * or otherwise manipulated. The *only* supported operations are:
+ *   - Splitting up the states in intervals of @size_per_each, no more than
+ *     @num times from the first state, while ensuring that no single state
+ *     straddles a page boundary.
+ *   - Passing a state to the getrandom() vDSO function's @opaque_state
+ *     parameter, but not passing the same state at the same time to two such
+ *     calls.
+ *   - Passing the first state and the total length to munmap(2), as described
+ *     above.
+ * All other uses are undefined behavior, which is subject to change or removal.
+ */
+SYSCALL_DEFINE4(vgetrandom_alloc, unsigned int __user *, num,
+		unsigned int __user *, size_per_each, unsigned long, addr,
+		unsigned int, flags)
+{
+	size_t state_size, alloc_size, num_states;
+	unsigned long pages_addr, populate;
+	unsigned int num_hint;
+	vm_flags_t vm_flags;
+	int ret;
+
+	/*
+	 * @flags and @addr are currently unused, so in order to reserve them
+	 * for the future, force them to be set to zero by current callers.
+	 */
+	if (flags || addr)
+		return -EINVAL;
+
+	/*
+	 * Also enforce that *size_per_each is zero on input, in case this becomes
+	 * useful later on.
+	 */
+	if (get_user(num_hint, size_per_each))
+		return -EFAULT;
+	if (num_hint)
+		return -EINVAL;
+
+	if (get_user(num_hint, num))
+		return -EFAULT;
+
+	state_size = sizeof(struct vgetrandom_state);
+	num_states = clamp_t(size_t, num_hint, 1, (SIZE_MAX & PAGE_MASK) / state_size);
+	alloc_size = PAGE_ALIGN(num_states * state_size);
+	/*
+	 * States cannot straddle page boundaries, so calculate the number of
+	 * states that can fit inside of a page without being split, and then
+	 * multiply that out by the number of pages allocated.
+	 */
+	num_states = (PAGE_SIZE / state_size) * (alloc_size / PAGE_SIZE);
+
+	vm_flags =
+		/*
+		 * Don't allow state to be written to swap, to preserve forward secrecy.
+		 * But also don't mlock it or pre-reserve it, and allow it to
+		 * be discarded under memory pressure. If no memory is available, returns
+		 * zeros rather than segfaulting.
+		 */
+		VM_DROPPABLE | VM_NORESERVE |
+
+		/* Don't allow the state to survive forks, to prevent random number re-use. */
+		VM_WIPEONFORK |
+
+		/* Don't write random state into coredumps. */
+		VM_DONTDUMP;
+
+	if (mmap_write_lock_killable(current->mm))
+		return -EINTR;
+	pages_addr = do_mmap(NULL, 0, alloc_size, PROT_READ | PROT_WRITE,
+			     MAP_PRIVATE | MAP_ANONYMOUS, vm_flags, 0, &populate, NULL);
+	mmap_write_unlock(current->mm);
+	if (IS_ERR_VALUE(pages_addr))
+		return pages_addr;
+
+	ret = -EFAULT;
+	if (put_user(num_states, num) || put_user(state_size, size_per_each))
+		goto err_unmap;
+
+	return pages_addr;
+
+err_unmap:
+	vm_munmap(pages_addr, alloc_size);
+	return ret;
+}
+#endif
+
 /*********************************************************************
  *
  * Fast key erasure RNG, the "crng".
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 9104952d323d..56368ea4f510 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -906,6 +906,9 @@  asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
 			    void __user *uargs);
 asmlinkage long sys_getrandom(char __user *buf, size_t count,
 			      unsigned int flags);
+asmlinkage long sys_vgetrandom_alloc(unsigned int __user *num,
+				     unsigned int __user *size_per_each,
+				     unsigned long addr, unsigned int flags);
 asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags);
 asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
 asmlinkage long sys_execveat(int dfd, const char __user *filename,
diff --git a/include/vdso/getrandom.h b/include/vdso/getrandom.h
new file mode 100644
index 000000000000..69037519d20b
--- /dev/null
+++ b/include/vdso/getrandom.h
@@ -0,0 +1,16 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#ifndef _VDSO_GETRANDOM_H
+#define _VDSO_GETRANDOM_H
+
+/**
+ * struct vgetrandom_state - State used by vDSO getrandom() and allocated by vgetrandom_alloc().
+ *
+ * Currently empty, as the vDSO getrandom() function has not yet been implemented.
+ */
+struct vgetrandom_state { int placeholder; };
+
+#endif /* _VDSO_GETRANDOM_H */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index d7eee421d4bc..6b17fadb0f59 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -272,6 +272,9 @@  COND_SYSCALL(pkey_free);
 /* memfd_secret */
 COND_SYSCALL(memfd_secret);
 
+/* random */
+COND_SYSCALL(vgetrandom_alloc);
+
 /*
  * Architecture specific weak syscall entries.
  */
diff --git a/lib/vdso/Kconfig b/lib/vdso/Kconfig
index c46c2300517c..99661b731834 100644
--- a/lib/vdso/Kconfig
+++ b/lib/vdso/Kconfig
@@ -38,3 +38,9 @@  config GENERIC_VDSO_OVERFLOW_PROTECT
 	  in the hotpath.
 
 endif
+
+config VDSO_GETRANDOM
+	bool
+	select NEED_VM_DROPPABLE
+	help
+	  Selected by architectures that support vDSO getrandom().