diff mbox series

[v10,1/4] random: add vgetrandom_alloc() syscall

Message ID 20221129210639.42233-2-Jason@zx2c4.com
State New
Headers show
Series implement getrandom() in vDSO | expand

Commit Message

Jason A. Donenfeld Nov. 29, 2022, 9:06 p.m. UTC
The vDSO getrandom() works over an opaque per-thread state of an
unexported size, which must be marked as MADV_WIPEONFORK and be
mlock()'d for proper operation. Over time, the nuances of these
allocations may change or grow or even differ based on architectural
features.

The syscall has the signature:

  void *vgetrandom_alloc([inout] unsigned int *num,
                         [out] unsigned int *size_per_each,
                         unsigned int flags);

This takes the desired number of opaque states in `num`, and returns a
pointer to an array of opaque states, the number actually allocated back
in `num`, and the size in bytes of each one in `size_per_each`, enabling
a libc to slice up the returned array into a state per each thread. (The
`flags` argument is always zero for now.) Libc is expected to allocate a
chunk of these on first use, and then dole them out to threads as
they're created, allocating more when needed. The following commit shows
an example of this, being used in conjunction with the getrandom() vDSO
function.

We very intentionally do *not* leave state allocation for vDSO
getrandom() up to userspace itself, but rather provide this new syscall
for such allocations. vDSO getrandom() must not store its state in just
any old memory address, but rather just ones that the kernel specially
allocates for it, leaving the particularities of those allocations up to
the kernel.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 MAINTAINERS              |  1 +
 drivers/char/random.c    | 75 ++++++++++++++++++++++++++++++++++++++++
 include/linux/syscalls.h |  3 ++
 include/vdso/getrandom.h | 24 +++++++++++++
 kernel/sys_ni.c          |  3 ++
 lib/vdso/Kconfig         |  7 ++++
 6 files changed, 113 insertions(+)
 create mode 100644 include/vdso/getrandom.h

Comments

Thomas Gleixner Nov. 29, 2022, 10:02 p.m. UTC | #1
Jason!

On Tue, Nov 29 2022 at 22:06, Jason A. Donenfeld wrote:
> +
> +/********************************************************************
> + *
> + * vDSO support helpers.
> + *
> + * The actual vDSO function is defined over in lib/vdso/getrandom.c,
> + * but this section contains the kernel-mode helpers to support that.
> + *
> + ********************************************************************/
> +
> +#ifdef CONFIG_VGETRANDOM_ALLOC_SYSCALL
> +/**
> + * vgetrandom_alloc - allocate opaque states for use with vDSO getrandom().
> + *
> + * @num: on input, a pointer to a suggested hint of how many states to
> + * allocate, and on output the number of states actually allocated.
> + *
> + * @size_per_each: the size of each state allocated, so that the caller can
> + * split up the returned allocation into individual states.
> + *
> + * @flags: currently always zero.

NIT!

I personally prefer and ask for it in stuff I maintain:

 * @num:		On input, a pointer to a suggested hint of how many states to
 *			allocate, and on output the number of states actually allocated.
 *
 * @size_per_each: 	The size of each state allocated, so that the caller can
 * 			split up the returned allocation into individual states.
 *
 * @flags: 		Currently always zero.

But your turf :)

> + *
> + * The getrandom() vDSO function in userspace requires an opaque state, which
> + * this function allocates by mapping a certain number of special pages into
> + * the calling process. It takes a hint as to the number of opaque states
> + * desired, and provides the caller with the number of opaque states actually
> + * allocated, the size of each one in bytes, and the address of the first
> + * state.

make W=1 rightfully complains about:

> +

drivers/char/random.c:182: warning: bad line: 

> + * Returns a pointer to the first state in the allocation.

I have serious doubts that this statement is correct.

Think about this comment and documentation as a boiler plate for the
mandatory man page for a new syscall (hint...)

> + *
> + */

and W=1 also complains rightfully here:

> +SYSCALL_DEFINE3(vgetrandom_alloc, unsigned int __user *, num,
> +		unsigned int __user *, size_per_each, unsigned int, flags)

drivers/char/random.c:188: warning: expecting prototype for vgetrandom_alloc(). Prototype was for sys_vgetrandom_alloc() instead

> +{
> diff --git a/include/vdso/getrandom.h b/include/vdso/getrandom.h
> new file mode 100644
> index 000000000000..5f04c8bf4bd4
> --- /dev/null
> +++ b/include/vdso/getrandom.h
> @@ -0,0 +1,24 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
> + */
> +
> +#ifndef _VDSO_GETRANDOM_H
> +#define _VDSO_GETRANDOM_H
> +
> +#include <crypto/chacha.h>
> +
> +struct vgetrandom_state {
> +	union {
> +		struct {
> +			u8 batch[CHACHA_BLOCK_SIZE * 3 / 2];
> +			u32 key[CHACHA_KEY_SIZE / sizeof(u32)];
> +		};
> +		u8 batch_key[CHACHA_BLOCK_SIZE * 2];
> +	};
> +	unsigned long generation;
> +	u8 pos;
> +	bool in_use;
> +};

Again, please make this properly tabular:

struct vgetrandom_state {
	union {
		struct {
			u8	batch[CHACHA_BLOCK_SIZE * 3 / 2];
			u32	key[CHACHA_KEY_SIZE / sizeof(u32)];
		};
		u8	batch_key[CHACHA_BLOCK_SIZE * 2];
	};
	unsigned long	generation;
	u8		pos;
	bool		in_use;
};

Plus some kernel doc which explains what this is about.

Thanks,

        tglx
Jason A. Donenfeld Nov. 30, 2022, 12:59 a.m. UTC | #2
Hi Thomas,

Thanks again for the big review. Comments inline below.

On Tue, Nov 29, 2022 at 11:02:29PM +0100, Thomas Gleixner wrote:
> > +/**
> > + * vgetrandom_alloc - allocate opaque states for use with vDSO getrandom().
> > + *
> > + * @num: on input, a pointer to a suggested hint of how many states to
> > + * allocate, and on output the number of states actually allocated.
> > + *
> > + * @size_per_each: the size of each state allocated, so that the caller can
> > + * split up the returned allocation into individual states.
> > + *
> > + * @flags: currently always zero.
> 
> NIT!
> 
> I personally prefer and ask for it in stuff I maintain:
> 
>  * @num:		On input, a pointer to a suggested hint of how many states to
>  *			allocate, and on output the number of states actually allocated.
>  *
>  * @size_per_each: 	The size of each state allocated, so that the caller can
>  * 			split up the returned allocation into individual states.
>  *
>  * @flags: 		Currently always zero.
> 
> But your turf :)

Hm. Caps and punctuation seem mostly missing in kernel/time/, though it
is that way in some places, so I'll do it with caps and punctuation.
Presumably that's the "newer" style you prefer, though I didn't look at
the dates in git-blame to confirm that supposition.

> 
> > + *
> > + * The getrandom() vDSO function in userspace requires an opaque state, which
> > + * this function allocates by mapping a certain number of special pages into
> > + * the calling process. It takes a hint as to the number of opaque states
> > + * desired, and provides the caller with the number of opaque states actually
> > + * allocated, the size of each one in bytes, and the address of the first
> > + * state.
> 
> make W=1 rightfully complains about:
> 
> > +
> 
> drivers/char/random.c:182: warning: bad line: 
> 
> > + * Returns a pointer to the first state in the allocation.
> 
> I have serious doubts that this statement is correct.

"Returns the address of the first state in the allocation" is better I
guess.

> and W=1 also complains rightfully here:
> 
> > +SYSCALL_DEFINE3(vgetrandom_alloc, unsigned int __user *, num,
> > +		unsigned int __user *, size_per_each, unsigned int, flags)
> 
> drivers/char/random.c:188: warning: expecting prototype for vgetrandom_alloc(). Prototype was for sys_vgetrandom_alloc() instead

Squinted at a lot of headers before realizing that's a kernel-doc
warning. Fixed, thanks.

> > +#ifndef _VDSO_GETRANDOM_H
> > +#define _VDSO_GETRANDOM_H
> > +
> > +#include <crypto/chacha.h>
> > +
> > +struct vgetrandom_state {
> > +	union {
> > +		struct {
> > +			u8 batch[CHACHA_BLOCK_SIZE * 3 / 2];
> > +			u32 key[CHACHA_KEY_SIZE / sizeof(u32)];
> > +		};
> > +		u8 batch_key[CHACHA_BLOCK_SIZE * 2];
> > +	};
> > +	unsigned long generation;
> > +	u8 pos;
> > +	bool in_use;
> > +};
> 
> Again, please make this properly tabular:
> 
> struct vgetrandom_state {
> 	union {
> 		struct {
> 			u8	batch[CHACHA_BLOCK_SIZE * 3 / 2];
> 			u32	key[CHACHA_KEY_SIZE / sizeof(u32)];
> 		};
> 		u8	batch_key[CHACHA_BLOCK_SIZE * 2];
> 	};
> 	unsigned long	generation;
> 	u8		pos;
> 	bool		in_use;
> };
> 
> Plus some kernel doc which explains what this is about.

Will do. Though, I'm going to move this to the vDSO commit, and for the
syscall commit, which needs the struct to merely exist, I'll have no
members in it. This should make review a bit easier.

Jason
Thomas Gleixner Nov. 30, 2022, 1:37 a.m. UTC | #3
On Wed, Nov 30 2022 at 01:59, Jason A. Donenfeld wrote:
> On Tue, Nov 29, 2022 at 11:02:29PM +0100, Thomas Gleixner wrote:
>> > + * Returns a pointer to the first state in the allocation.
>> 
>> I have serious doubts that this statement is correct.
>
> "Returns the address of the first state in the allocation" is better I
> guess.

Does not even come close to correct.

As my previous hint of 'using this as template for the (hint:missing)
man page' did not work well, may I suggest that you look at the various
return statements in that function and validate whether your proposed
return value documentation is valid for all of them?

Thanks,

        tglx
Jason A. Donenfeld Nov. 30, 2022, 1:42 a.m. UTC | #4
On Wed, Nov 30, 2022 at 02:37:32AM +0100, Thomas Gleixner wrote:
> On Wed, Nov 30 2022 at 01:59, Jason A. Donenfeld wrote:
> > On Tue, Nov 29, 2022 at 11:02:29PM +0100, Thomas Gleixner wrote:
> >> > + * Returns a pointer to the first state in the allocation.
> >> 
> >> I have serious doubts that this statement is correct.
> >
> > "Returns the address of the first state in the allocation" is better I
> > guess.
> 
> Does not even come close to correct.
> 
> As my previous hint of 'using this as template for the (hint:missing)
> man page' did not work well, may I suggest that you look at the various
> return statements in that function and validate whether your proposed
> return value documentation is valid for all of them?

Ahh, the error values and such. Righto. Will do. I'll match the style of
similar functions.

Jason
Florian Weimer Nov. 30, 2022, 10:51 a.m. UTC | #5
* Jason A. Donenfeld:

> +#ifdef CONFIG_VGETRANDOM_ALLOC_SYSCALL
> +/**
> + * vgetrandom_alloc - allocate opaque states for use with vDSO getrandom().
> + *
> + * @num: on input, a pointer to a suggested hint of how many states to
> + * allocate, and on output the number of states actually allocated.

Should userspace call this system call again if it needs more states?
The interface description doesn't make this clear.

> + * @size_per_each: the size of each state allocated, so that the caller can
> + * split up the returned allocation into individual states.
> + *
> + * @flags: currently always zero.
> + *
> + * The getrandom() vDSO function in userspace requires an opaque state, which
> + * this function allocates by mapping a certain number of special pages into
> + * the calling process. It takes a hint as to the number of opaque states
> + * desired, and provides the caller with the number of opaque states actually
> + * allocated, the size of each one in bytes, and the address of the first
> + * state.
> +
> + * Returns a pointer to the first state in the allocation.
> + *
> + */

How do we deallocate this memory?  Must it remain permanently allocated?

Can userspace use the memory for something else if it's not passed to
getrandom?  The separate system call strongly suggests that the
allocation is completely owned by the kernel, but there isn't
documentation here how the allocation life-cycle is supposed to look
like.  In particular, it is not clear if vgetrandom_alloc or getrandom
could retain a reference to the allocation in a future implementation of
these interfaces.

Some users might want to zap the memory for extra hardening after use,
and it's not clear if that's allowed, either.

> +SYSCALL_DEFINE3(vgetrandom_alloc, unsigned int __user *, num,
> +		unsigned int __user *, size_per_each, unsigned int, flags)
> +{

ABI-wise, that should work.

> +	const size_t state_size = sizeof(struct vgetrandom_state);
> +	size_t alloc_size, num_states;
> +	unsigned long pages_addr;
> +	unsigned int num_hint;
> +	int ret;
> +
> +	if (flags)
> +		return -EINVAL;
> +
> +	if (get_user(num_hint, num))
> +		return -EFAULT;
> +
> +	num_states = clamp_t(size_t, num_hint, 1, (SIZE_MAX & PAGE_MASK) / state_size);
> +	alloc_size = PAGE_ALIGN(num_states * state_size);

Doesn't this waste space for one state if state_size happens to be a
power of 2?  Why do this SIZE_MAX & PAGE_MASK thing at all?  Shouldn't
it be PAGE_SIZE / state_size?

> +	if (put_user(alloc_size / state_size, num) || put_user(state_size, size_per_each))
> +		return -EFAULT;
> +
> +	pages_addr = vm_mmap(NULL, 0, alloc_size, PROT_READ | PROT_WRITE,
> +			     MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, 0);

I think Rasmus has already raised questions about MAP_LOCKED.

I think the kernel cannot rely on it because userspace could call
munlock on the allocation.

> +	if (IS_ERR_VALUE(pages_addr))
> +		return pages_addr;
> +
> +	ret = do_madvise(current->mm, pages_addr, alloc_size, MADV_WIPEONFORK);
> +	if (ret < 0)
> +		goto err_unmap;
> +
> +	return pages_addr;
> +
> +err_unmap:
> +	vm_munmap(pages_addr, alloc_size);
> +	return ret;
> +}
> +#endif

If there's no registration of the allocation, it's not clear why we need
a separate system call for this.  From a documentation perspective, it
may be easier to describe proper use of the getrandom vDSO call if
ownership resides with userspace.  But it will constrain future
evolution of the implementation because you can't add registration
(retaining a reference to the passed-in area in getrandom) after the
fact.  But I'm not sure if this is possible with the current interface,
either.  Userspace has to make some assumptions about the life-cycle to
avoid a memory leak on thread exit.

Thanks,
Florian
Jason A. Donenfeld Nov. 30, 2022, 3:39 p.m. UTC | #6
Hi Florian,

On Wed, Nov 30, 2022 at 11:51:59AM +0100, Florian Weimer wrote:
> * Jason A. Donenfeld:
> 
> > +#ifdef CONFIG_VGETRANDOM_ALLOC_SYSCALL
> > +/**
> > + * vgetrandom_alloc - allocate opaque states for use with vDSO getrandom().
> > + *
> > + * @num: on input, a pointer to a suggested hint of how many states to
> > + * allocate, and on output the number of states actually allocated.
> 
> Should userspace call this system call again if it needs more states?
> The interface description doesn't make this clear.

Yes. And indeed that's what Adhemerval's patch does.

> 
> > + * @size_per_each: the size of each state allocated, so that the caller can
> > + * split up the returned allocation into individual states.
> > + *
> > + * @flags: currently always zero.
> > + *
> > + * The getrandom() vDSO function in userspace requires an opaque state, which
> > + * this function allocates by mapping a certain number of special pages into
> > + * the calling process. It takes a hint as to the number of opaque states
> > + * desired, and provides the caller with the number of opaque states actually
> > + * allocated, the size of each one in bytes, and the address of the first
> > + * state.
> > +
> > + * Returns a pointer to the first state in the allocation.
> > + *
> > + */
> 
> How do we deallocate this memory?  Must it remain permanently allocated?

It can be deallocated with munmap.

> Can userspace use the memory for something else if it's not passed to
> getrandom?

I suspect the documentation answer here is, "no", even if technically it
might happen to work on this kernel or that kernel. I suppose this could
even be quasi-enforced by xoring the top bits with some vdso
compile-time constant, so you can't rely on being able to dereference
it yourself.

> The separate system call strongly suggests that the
> allocation is completely owned by the kernel, but there isn't
> documentation here how the allocation life-cycle is supposed to look
> like.  In particular, it is not clear if vgetrandom_alloc or getrandom
> could retain a reference to the allocation in a future implementation of
> these interfaces.
> 
> Some users might want to zap the memory for extra hardening after use,
> and it's not clear if that's allowed, either.

I don't think zapping that memory is supported, or even a sensible thing
to do. In the first place, I don't think we should suggest that the user
can dereference that pointer, at all. In that sense, maybe it's best to
call it a "handle" or something similar (a "HANDLE"! a "HWND"? a "HRNG"?
just kidding). In the second place, the fast erasure aspect of this
means that such hardening would have no effect -- the key is overwritten
after using for forward secrecy, anyway, and batched bytes are zeroed.
(There is a corner case that might make it interesting to wipe in the
parent, not just the child, on fork, but that's sort of a separate
matter and would ideally be handled by kernel space anyway.)

> If there's no registration of the allocation, it's not clear why we need
> a separate system call for this.  From a documentation perspective, it
> may be easier to describe proper use of the getrandom vDSO call if
> ownership resides with userspace.

No, absolutely not, for the multiple reasons already listed in the
commit messages and cover letter and previous emails. But you seem
aware of this:

> But it will constrain future
> evolution of the implementation because you can't add registration
> (retaining a reference to the passed-in area in getrandom) after the
> fact.  But I'm not sure if this is possible with the current interface,
> either.  Userspace has to make some assumptions about the life-cycle to
> avoid a memory leak on thread exit.

It sounds like this is sort of a different angle on Rasmus' earlier
comment about how munmap leaks implementation details. Maybe there's
something to that after all? Or not? I see two approaches:

1) Keep munmap as the allocation function. If later on we do fancy
   registration and in-kernel state tracking, or add fancy protection
   flags, or whatever else, munmap should be able to identify these
   pages and carry out whatever special treatment is necessary.

2) Convert vgetrandom_alloc() into a clone3-style syscall, as Christian
   suggested earlier, which might allow for a bit more overloading
   capability. That would be a struct that looks like:

      struct vgetrandom_alloc_args {
	  __aligned_u64 flags;
          __aligned_u64 states;
	  __aligned_u64 num;
	  __aligned_u64 size_of_each;
      }

  - If flags is VGRA_ALLOCATE, states and size_of_each must be zero on
    input, while num is the hint, as is the case now. On output, states,
    size_of_each, and num are filled in.

  - If flags is VGRA_DEALLOCATE, states, size_of_each, and num must be as
    they were originally, and then it deallocates.

I suppose (2) would alleviate your concerns entirely, without future
uncertainty over what it'd be like to add special cases to munmap(). And
it'd add a bit more future proofing to the syscall, depending on what we
do.

So maybe I'm warming up to that approach a bit.

> > +	num_states = clamp_t(size_t, num_hint, 1, (SIZE_MAX & PAGE_MASK) / state_size);
> > +	alloc_size = PAGE_ALIGN(num_states * state_size);
> 
> Doesn't this waste space for one state if state_size happens to be a
> power of 2?  Why do this SIZE_MAX & PAGE_MASK thing at all?  Shouldn't
> it be PAGE_SIZE / state_size?

The first line is a clamp. That fixes num_hint between 1 and the largest
number that when multiplied and rounded up won't overflow.

So, if state_size is a power of two, let's say 256, and there's only one
state, here's what that looks like:

    num_states = clamp(1, 1, (0xffffffff & (~(4096 - 1))) / 256 = 16777200) = 1
    alloc_size = PAGE_ALIGN(1 * 256) = 4096

So that seems like it's working as intended, right? Or if not, maybe
it'd help to write out the digits you're concerned about?

> > +	if (put_user(alloc_size / state_size, num) || put_user(state_size, size_per_each))
> > +		return -EFAULT;
> > +
> > +	pages_addr = vm_mmap(NULL, 0, alloc_size, PROT_READ | PROT_WRITE,
> > +			     MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, 0);
> 
> I think Rasmus has already raised questions about MAP_LOCKED.
> 
> I think the kernel cannot rely on it because userspace could call
> munlock on the allocation.

Then they're caught holding the bag? This doesn't seem much different
from userspace shooting themselves in general, like writing garbage into
the allocated states and then trying to use them. If this is something
you really, really are concerned about, then maybe my cheesy dumb xor
thing mentioned above would be a low effort mitigation here.

Jason
Jason A. Donenfeld Nov. 30, 2022, 4:38 p.m. UTC | #7
On Wed, Nov 30, 2022 at 04:39:55PM +0100, Jason A. Donenfeld wrote:
> 2) Convert vgetrandom_alloc() into a clone3-style syscall, as Christian
>    suggested earlier, which might allow for a bit more overloading
>    capability. That would be a struct that looks like:
> 
>       struct vgetrandom_alloc_args {
> 	  __aligned_u64 flags;
>           __aligned_u64 states;
> 	  __aligned_u64 num;
> 	  __aligned_u64 size_of_each;
>       }
> 
>   - If flags is VGRA_ALLOCATE, states and size_of_each must be zero on
>     input, while num is the hint, as is the case now. On output, states,
>     size_of_each, and num are filled in.
> 
>   - If flags is VGRA_DEALLOCATE, states, size_of_each, and num must be as
>     they were originally, and then it deallocates.
> 
> I suppose (2) would alleviate your concerns entirely, without future
> uncertainty over what it'd be like to add special cases to munmap(). And
> it'd add a bit more future proofing to the syscall, depending on what we
> do.
> 
> So maybe I'm warming up to that approach a bit.

So I just did a little quick implementation to see what it'd feel like,
and actually, it's quite simple, and might address a lot of concerns all
at once. What do you think of the below? Documentation and such still
needs work obviously, but the bones should be there.

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 4341c6a91207..dae6095b937d 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -189,44 +189,53 @@ int __cold execute_with_initialized_rng(struct notifier_block *nb)
 /**
  * sys_vgetrandom_alloc - Allocate opaque states for use with vDSO getrandom().
  *
- * @num:	   On input, a pointer to a suggested hint of how many states to
- * 		   allocate, and on output the number of states actually allocated.
- *
- * @size_per_each: The size of each state allocated, so that the caller can
- *		   split up the returned allocation into individual states.
- *
- * @flags:	   Currently always zero.
+ * @uargs:	A vgetrandom_alloc_args which may be updated on return.
+ * 		allocate, and on output the number of states actually allocated.
+ * @usize:	The size of @uargs, which determines the version of the struct used.
  *
  * The getrandom() vDSO function in userspace requires an opaque state, which
  * this function allocates by mapping a certain number of special pages into
  * the calling process. It takes a hint as to the number of opaque states
  * desired, and provides the caller with the number of opaque states actually
  * allocated, the size of each one in bytes, and the address of the first
- * state.
+ * state. Alternatively, if the VGRA_DEALLOCATE flag is specified, the provided
+ * states parameter is unmapped.
  *
- * Returns the address of the first state in the allocation on success, or a
- * negative error value on failure.
+ * Returns 0 on success and an error value otherwise.
  */
-SYSCALL_DEFINE3(vgetrandom_alloc, unsigned int __user *, num,
-		unsigned int __user *, size_per_each, unsigned int, flags)
+SYSCALL_DEFINE2(vgetrandom_alloc, struct vgetrandom_alloc_args __user *, uargs, size_t, usize)
 {
 	const size_t state_size = sizeof(struct vgetrandom_state);
+	const size_t max_states = (SIZE_MAX & PAGE_MASK) / state_size;
+	struct vgetrandom_alloc_args args;
 	size_t alloc_size, num_states;
 	unsigned long pages_addr;
-	unsigned int num_hint;
 	int ret;

-	if (flags)
+	if (usize > PAGE_SIZE)
+		return -E2BIG;
+	if (usize < VGETRANDOM_ALLOC_ARGS_SIZE_VER0)
 		return -EINVAL;
+	ret = copy_struct_from_user(&args, sizeof(args), uargs, usize);
+	if (ret)
+		return ret;

-	if (get_user(num_hint, num))
-		return -EFAULT;
+	/* Currently only VGRA_DEALLOCATE is defined. */
+	if (args.flags & ~VGRA_DEALLOCATE)
+		return -EINVAL;

-	num_states = clamp_t(size_t, num_hint, 1, (SIZE_MAX & PAGE_MASK) / state_size);
-	alloc_size = PAGE_ALIGN(num_states * state_size);
+	if (args.flags & VGRA_DEALLOCATE) {
+		if (args.size_per_each != state_size || args.num > max_states || !args.states)
+			return -EINVAL;
+		return vm_munmap(args.states, args.num * state_size);
+	}

-	if (put_user(alloc_size / state_size, num) || put_user(state_size, size_per_each))
-		return -EFAULT;
+	/* These don't make sense as input values if allocating, so reject them. */
+	if (args.size_per_each || args.states)
+		return -EINVAL;
+
+	num_states = clamp_t(size_t, args.num, 1, max_states);
+	alloc_size = PAGE_ALIGN(num_states * state_size);

 	pages_addr = vm_mmap(NULL, 0, alloc_size, PROT_READ | PROT_WRITE,
 			     MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, 0);
@@ -237,7 +246,14 @@ SYSCALL_DEFINE3(vgetrandom_alloc, unsigned int __user *, num,
 	if (ret < 0)
 		goto err_unmap;

-	return pages_addr;
+	args.num = num_states;
+	args.size_per_each = state_size;
+	args.states = pages_addr;
+
+	ret = -EFAULT;
+	if (copy_to_user(uargs, &args, sizeof(args)))
+		goto err_unmap;
+	return 0;

 err_unmap:
 	vm_munmap(pages_addr, alloc_size);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 7741dc94f10c..de4338e26db0 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -72,6 +72,7 @@ struct open_how;
 struct mount_attr;
 struct landlock_ruleset_attr;
 enum landlock_rule_type;
+struct vgetrandom_alloc_args;

 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -1006,9 +1007,8 @@ asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
 			    void __user *uargs);
 asmlinkage long sys_getrandom(char __user *buf, size_t count,
 			      unsigned int flags);
-asmlinkage long sys_vgetrandom_alloc(unsigned int __user *num,
-				     unsigned int __user *size_per_each,
-				     unsigned int flags);
+asmlinkage long sys_vgetrandom_alloc(struct vgetrandom_alloc_args __user *uargs,
+				     size_t size);
 asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags);
 asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
 asmlinkage long sys_execveat(int dfd, const char __user *filename,
diff --git a/include/uapi/linux/random.h b/include/uapi/linux/random.h
index e744c23582eb..49911ea2c343 100644
--- a/include/uapi/linux/random.h
+++ b/include/uapi/linux/random.h
@@ -55,4 +55,30 @@ struct rand_pool_info {
 #define GRND_RANDOM	0x0002
 #define GRND_INSECURE	0x0004

+/*
+ * Flags for vgetrandom_alloc(2)
+ *
+ * VGRA_DEALLOCATE	Deallocate supplied states.
+ */
+#define VGRA_DEALLOCATE	0x0001ULL
+
+/**
+ * struct vgetrandom_alloc_args - Arguments for the vgetrandom_alloc(2) syscall.
+ *
+ * @flags:	   Zero or more VGRA_* flags.
+ * @states:	   Zero on input if allocating, and filled in on successful
+ *		   return. An existing allocation, if deallocating.
+ * @num:	   A hint as to the desired number of states, if allocating. The
+ *		   number of existing states in @states, if deallocating
+ * @size_per_each: The size of each state in @states.
+ */
+struct vgetrandom_alloc_args {
+	__aligned_u64 flags;
+	__aligned_u64 states;
+	__aligned_u64 num;
+	__aligned_u64 size_per_each;
+};
+
+#define VGETRANDOM_ALLOC_ARGS_SIZE_VER0 32 /* sizeof first published struct */
+
 #endif /* _UAPI_LINUX_RANDOM_H */
David Laight Nov. 30, 2022, 10:39 p.m. UTC | #8
From: Thomas Gleixner
> Sent: 29 November 2022 22:02
> 
> Jason!
> 
> On Tue, Nov 29 2022 at 22:06, Jason A. Donenfeld wrote:
> > +
> > +/********************************************************************
> > + *
> > + * vDSO support helpers.
> > + *
> > + * The actual vDSO function is defined over in lib/vdso/getrandom.c,
> > + * but this section contains the kernel-mode helpers to support that.
> > + *
> > + ********************************************************************/
> > +
> > +#ifdef CONFIG_VGETRANDOM_ALLOC_SYSCALL
> > +/**
> > + * vgetrandom_alloc - allocate opaque states for use with vDSO getrandom().
> > + *
> > + * @num: on input, a pointer to a suggested hint of how many states to
> > + * allocate, and on output the number of states actually allocated.
> > + *
> > + * @size_per_each: the size of each state allocated, so that the caller can
> > + * split up the returned allocation into individual states.
> > + *
> > + * @flags: currently always zero.
> 
> NIT!
> 
> I personally prefer and ask for it in stuff I maintain:
> 
>  * @num:		On input, a pointer to a suggested hint of how many states to
>  *			allocate, and on output the number of states actually allocated.
>  *
>  * @size_per_each: 	The size of each state allocated, so that the caller can
>  * 			split up the returned allocation into individual states.
>  *
>  * @flags: 		Currently always zero.
> 
> But your turf :)
> 
> > + *
> > + * The getrandom() vDSO function in userspace requires an opaque state, which
> > + * this function allocates by mapping a certain number of special pages into
> > + * the calling process. It takes a hint as to the number of opaque states
> > + * desired, and provides the caller with the number of opaque states actually
> > + * allocated, the size of each one in bytes, and the address of the first
> > + * state.
> 
> make W=1 rightfully complains about:
> 
> > +
> 
> drivers/char/random.c:182: warning: bad line:
> 
> > + * Returns a pointer to the first state in the allocation.
> 
> I have serious doubts that this statement is correct.
> 
> Think about this comment and documentation as a boiler plate for the
> mandatory man page for a new syscall (hint...)
> 
> > + *
> > + */
> 
> and W=1 also complains rightfully here:
> 
> > +SYSCALL_DEFINE3(vgetrandom_alloc, unsigned int __user *, num,
> > +		unsigned int __user *, size_per_each, unsigned int, flags)
> 
> drivers/char/random.c:188: warning: expecting prototype for vgetrandom_alloc(). Prototype was for
> sys_vgetrandom_alloc() instead
> 
> > +{
> > diff --git a/include/vdso/getrandom.h b/include/vdso/getrandom.h
> > new file mode 100644
> > index 000000000000..5f04c8bf4bd4
> > --- /dev/null
> > +++ b/include/vdso/getrandom.h
> > @@ -0,0 +1,24 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +/*
> > + * Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
> > + */
> > +
> > +#ifndef _VDSO_GETRANDOM_H
> > +#define _VDSO_GETRANDOM_H
> > +
> > +#include <crypto/chacha.h>
> > +
> > +struct vgetrandom_state {
> > +	union {
> > +		struct {
> > +			u8 batch[CHACHA_BLOCK_SIZE * 3 / 2];
> > +			u32 key[CHACHA_KEY_SIZE / sizeof(u32)];
> > +		};
> > +		u8 batch_key[CHACHA_BLOCK_SIZE * 2];
> > +	};
> > +	unsigned long generation;
> > +	u8 pos;
> > +	bool in_use;
> > +};
> 
> Again, please make this properly tabular:
> 
> struct vgetrandom_state {
> 	union {
> 		struct {
> 			u8	batch[CHACHA_BLOCK_SIZE * 3 / 2];
> 			u32	key[CHACHA_KEY_SIZE / sizeof(u32)];
> 		};
> 		u8	batch_key[CHACHA_BLOCK_SIZE * 2];
> 	};
> 	unsigned long	generation;
> 	u8		pos;
> 	bool		in_use;
> };
> 
> Plus some kernel doc which explains what this is about.

That structure looks horrid - especially for something shared
between entities.
The 'unsigned long' should be either u32 or u64.
There is 'hidden' padding at the end.
If 'pos' is an index into something (longer name would be
better) then there is no reason to squeeze the value into
1 byte - it doesn't save anything and might make things bigger.

(I think Jason might have blocked my emails, he doesn't like
critisicism/feedback.)

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Jason A. Donenfeld Dec. 1, 2022, 12:14 a.m. UTC | #9
On Wed, Nov 30, 2022 at 10:39:38PM +0000, David Laight wrote:
> > struct vgetrandom_state {
> > 	union {
> > 		struct {
> > 			u8	batch[CHACHA_BLOCK_SIZE * 3 / 2];
> > 			u32	key[CHACHA_KEY_SIZE / sizeof(u32)];
> > 		};
> > 		u8	batch_key[CHACHA_BLOCK_SIZE * 2];
> > 	};
> > 	unsigned long	generation;
> > 	u8		pos;
> > 	bool		in_use;
> > };
> > 
> > Plus some kernel doc which explains what this is about.
> 
> That structure looks horrid - especially for something shared
> between entities.
> The 'unsigned long' should be either u32 or u64.

This struct isn't shared. It's used only by user mode code.
There may well be other issues with that long, though.

Jason
Jason A. Donenfeld Dec. 1, 2022, 2:16 a.m. UTC | #10
On Wed, Nov 30, 2022 at 04:39:55PM +0100, Jason A. Donenfeld wrote:
> > Can userspace use the memory for something else if it's not passed to
> > getrandom?
> 
> I suspect the documentation answer here is, "no", even if technically it
> might happen to work on this kernel or that kernel. I suppose this could
> even be quasi-enforced by xoring the top bits with some vdso
> compile-time constant, so you can't rely on being able to dereference
> it yourself.
> [...]
> Then they're caught holding the bag? This doesn't seem much different
> from userspace shooting themselves in general, like writing garbage into
> the allocated states and then trying to use them. If this is something
> you really, really are concerned about, then maybe my cheesy dumb xor
> thing mentioned above would be a low effort mitigation here.

I implemented a sample of this, below. I think this is a bit silly,
though, and making this fully robust could take some effort. Overall, I
don't think we should do this.

However, the more I think about the args thing from the last email,
the more I like *that* idea. So I think I'll roll with that.

But this cheesy pointer obfuscation thing here, meh. But here's what it
could look like anyway:

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 2aaeb48d11be..7aff45165ce5 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -228,7 +228,7 @@ SYSCALL_DEFINE2(vgetrandom_alloc, struct vgetrandom_alloc_args __user *, uargs,
 	if (args.flags & VGRA_DEALLOCATE) {
 		if (args.size_per_each != state_size || args.num > max_states || !args.states)
 			return -EINVAL;
-		return vm_munmap(args.states, args.num * state_size);
+		return vm_munmap(args.states ^ VGETRANDOM_STATE_HI_TAINT, args.num * state_size);
 	}

 	/* These don't make sense as input values if allocating, so reject them. */
@@ -249,7 +249,7 @@ SYSCALL_DEFINE2(vgetrandom_alloc, struct vgetrandom_alloc_args __user *, uargs,

 	args.num = num_states;
 	args.size_per_each = state_size;
-	args.states = pages_addr;
+	args.states = pages_addr ^ VGETRANDOM_STATE_HI_TAINT;

 	ret = -EFAULT;
 	if (copy_to_user(uargs, &args, sizeof(args)))
diff --git a/include/vdso/getrandom.h b/include/vdso/getrandom.h
index cb624799a8e7..9a6aaf4d99d4 100644
--- a/include/vdso/getrandom.h
+++ b/include/vdso/getrandom.h
@@ -8,6 +8,7 @@

 #include <crypto/chacha.h>
 #include <vdso/limits.h>
+#include <linux/version.h>

 /**
  * struct vgetrandom_state - State used by vDSO getrandom() and allocated by vgetrandom_alloc().
@@ -41,4 +42,10 @@ struct vgetrandom_state {
 	bool 			in_use;
 };

+/* Be annoying by changing frequently enough. */
+#define VGETRANDOM_STATE_HI_TAINT ((unsigned long)(((LINUX_VERSION_CODE >> 16) + \
+				    (LINUX_VERSION_CODE >>  8) + (LINUX_VERSION_CODE >>  0) + \
+				    __GNUC__ + __GNUC_MINOR__ + __GNUC_PATCHLEVEL__) \
+				   & 0xff) << (BITS_PER_LONG - 8))
+
 #endif /* _VDSO_GETRANDOM_H */
diff --git a/lib/vdso/getrandom.c b/lib/vdso/getrandom.c
index 9ca624756432..14cbd349186c 100644
--- a/lib/vdso/getrandom.c
+++ b/lib/vdso/getrandom.c
@@ -57,7 +57,7 @@ __cvdso_getrandom_data(const struct vdso_rng_data *rng_info, void *buffer, size_
 		       unsigned int flags, void *opaque_state)
 {
 	ssize_t ret = min_t(size_t, INT_MAX & PAGE_MASK /* = MAX_RW_COUNT */, len);
-	struct vgetrandom_state *state = opaque_state;
+	struct vgetrandom_state *state = (void *)((unsigned long)opaque_state ^ VGETRANDOM_STATE_HI_TAINT);
 	size_t batch_len, nblocks, orig_len = len;
 	unsigned long current_generation;
 	void *orig_buffer = buffer;
Jason A. Donenfeld Dec. 2, 2022, 2:38 p.m. UTC | #11
On Wed, Nov 30, 2022 at 05:38:13PM +0100, Jason A. Donenfeld wrote:
> On Wed, Nov 30, 2022 at 04:39:55PM +0100, Jason A. Donenfeld wrote:
> > 2) Convert vgetrandom_alloc() into a clone3-style syscall, as Christian
> >    suggested earlier, which might allow for a bit more overloading
> >    capability. That would be a struct that looks like:
> > 
> >       struct vgetrandom_alloc_args {
> > 	  __aligned_u64 flags;
> >           __aligned_u64 states;
> > 	  __aligned_u64 num;
> > 	  __aligned_u64 size_of_each;
> >       }
> > 
> >   - If flags is VGRA_ALLOCATE, states and size_of_each must be zero on
> >     input, while num is the hint, as is the case now. On output, states,
> >     size_of_each, and num are filled in.
> > 
> >   - If flags is VGRA_DEALLOCATE, states, size_of_each, and num must be as
> >     they were originally, and then it deallocates.
> > 
> > I suppose (2) would alleviate your concerns entirely, without future
> > uncertainty over what it'd be like to add special cases to munmap(). And
> > it'd add a bit more future proofing to the syscall, depending on what we
> > do.
> > 
> > So maybe I'm warming up to that approach a bit.
> 
> So I just did a little quick implementation to see what it'd feel like,
> and actually, it's quite simple, and might address a lot of concerns all
> at once. What do you think of the below? Documentation and such still
> needs work obviously, but the bones should be there.

Well, despite writing into the ether here, I continue to chase my tail
around in circles over this. After Adhemerval expressed a sort of "meh"
opinion to me on IRC around doing the clone3-like thing, I went down a
mm rabbit hole and started looking at all the various ways memory is
allocated in userspace and under what conditions and for what and why.
Turns out there are a few drivers doing interesting things in this
space.

The long and short of it is that:
- All addresses involve maps and page tables.
- Allocating is mapping, deallocating is unmapping, and there's no way
  around that.
- Memory that's "special" usually comes with special attributes or
  operations on its vma.

So, this makes me think that `munmap` is the fine *and correct* API for
deallocation. It's what everything else uses, even "special" things. And
it doesn't constrain us in the future in case this gets "registered"
somehow, as Florian described it, because it's still attached to
current->mm and will still always go through the same mapping APIs
anyway.

In light of that, I'm going to stick with the original API design, and
not do the clone3() args struct thing and the VGRA_DEALLOCATE flag.
However, I think it'd be a good idea to add an additional parameter of
"unsigned long addr", which is enforced/reserved to be always 0 for now.
This might prove useful for something together with the currently unused
flags argument, sometime in the future.

Jason
Florian Weimer Dec. 2, 2022, 5:17 p.m. UTC | #12
* Jason A. Donenfeld:

> I don't think zapping that memory is supported, or even a sensible thing
> to do. In the first place, I don't think we should suggest that the user
> can dereference that pointer, at all. In that sense, maybe it's best to
> call it a "handle" or something similar (a "HANDLE"! a "HWND"? a "HRNG"?

Surely the caller has to carve up the allocation, so the returned
pointer is not opaque at all.  From Adhemerval's glibc patch:

      grnd_allocator.cap = new_cap;
      grnd_allocator.states = new_states;

      for (size_t i = 0; i < num; ++i)
	{
	  grnd_allocator.states[i] = new_block;
	  new_block += size_per_each;
	}
      grnd_allocator.len = num;
    }

That's the opposite of a handle, really.

>> But it will constrain future
>> evolution of the implementation because you can't add registration
>> (retaining a reference to the passed-in area in getrandom) after the
>> fact.  But I'm not sure if this is possible with the current interface,
>> either.  Userspace has to make some assumptions about the life-cycle to
>> avoid a memory leak on thread exit.
>
> It sounds like this is sort of a different angle on Rasmus' earlier
> comment about how munmap leaks implementation details. Maybe there's
> something to that after all? Or not? I see two approaches:
>
> 1) Keep munmap as the allocation function. If later on we do fancy
>    registration and in-kernel state tracking, or add fancy protection
>    flags, or whatever else, munmap should be able to identify these
>    pages and carry out whatever special treatment is necessary.

munmap is fine, but the interface needs to say how to use it, and what
length to pass.

>> > +	num_states = clamp_t(size_t, num_hint, 1, (SIZE_MAX & PAGE_MASK) / state_size);
>> > +	alloc_size = PAGE_ALIGN(num_states * state_size);
>> 
>> Doesn't this waste space for one state if state_size happens to be a
>> power of 2?  Why do this SIZE_MAX & PAGE_MASK thing at all?  Shouldn't
>> it be PAGE_SIZE / state_size?
>
> The first line is a clamp. That fixes num_hint between 1 and the largest
> number that when multiplied and rounded up won't overflow.
>
> So, if state_size is a power of two, let's say 256, and there's only one
> state, here's what that looks like:
>
>     num_states = clamp(1, 1, (0xffffffff & (~(4096 - 1))) / 256 = 16777200) = 1
>     alloc_size = PAGE_ALIGN(1 * 256) = 4096
>
> So that seems like it's working as intended, right? Or if not, maybe
> it'd help to write out the digits you're concerned about?

I think I was just confused.

>> > +	if (put_user(alloc_size / state_size, num) || put_user(state_size, size_per_each))
>> > +		return -EFAULT;
>> > +
>> > +	pages_addr = vm_mmap(NULL, 0, alloc_size, PROT_READ | PROT_WRITE,
>> > +			     MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, 0);
>> 
>> I think Rasmus has already raised questions about MAP_LOCKED.
>> 
>> I think the kernel cannot rely on it because userspace could call
>> munlock on the allocation.
>
> Then they're caught holding the bag? This doesn't seem much different
> from userspace shooting themselves in general, like writing garbage into
> the allocated states and then trying to use them. If this is something
> you really, really are concerned about, then maybe my cheesy dumb xor
> thing mentioned above would be a low effort mitigation here.

So the MAP_LOCKED is just there to prevent leakage to swap?

Thanks,
Florian
Jason A. Donenfeld Dec. 2, 2022, 6:29 p.m. UTC | #13
Hi Florian,

On Fri, Dec 02, 2022 at 06:17:17PM +0100, Florian Weimer wrote:
> * Jason A. Donenfeld:
> 
> > I don't think zapping that memory is supported, or even a sensible thing
> > to do. In the first place, I don't think we should suggest that the user
> > can dereference that pointer, at all. In that sense, maybe it's best to
> > call it a "handle" or something similar (a "HANDLE"! a "HWND"? a "HRNG"?
> 
> Surely the caller has to carve up the allocation, so the returned
> pointer is not opaque at all.  From Adhemerval's glibc patch:
> 
>       grnd_allocator.cap = new_cap;
>       grnd_allocator.states = new_states;
> 
>       for (size_t i = 0; i < num; ++i)
> 	{
> 	  grnd_allocator.states[i] = new_block;
> 	  new_block += size_per_each;
> 	}
>       grnd_allocator.len = num;
>     }
> 
> That's the opposite of a handle, really.

Right. (And the same code is in the commit message example too.)

> 
> >> But it will constrain future
> >> evolution of the implementation because you can't add registration
> >> (retaining a reference to the passed-in area in getrandom) after the
> >> fact.  But I'm not sure if this is possible with the current interface,
> >> either.  Userspace has to make some assumptions about the life-cycle to
> >> avoid a memory leak on thread exit.
> >
> > It sounds like this is sort of a different angle on Rasmus' earlier
> > comment about how munmap leaks implementation details. Maybe there's
> > something to that after all? Or not? I see two approaches:
> >
> > 1) Keep munmap as the allocation function. If later on we do fancy
> >    registration and in-kernel state tracking, or add fancy protection
> >    flags, or whatever else, munmap should be able to identify these
> >    pages and carry out whatever special treatment is necessary.
> 
> munmap is fine, but the interface needs to say how to use it, and what
> length to pass.

Glad we're on the same page. Indeed I've now documented this for my
in-progress v11. A blurb like:

+ * sys_vgetrandom_alloc - Allocate opaque states for use with vDSO getrandom().
+ *
+ * @num:          On input, a pointer to a suggested hint of how many states to
+ *                allocate, and on return the number of states actually allocated.
+ *
+ * @size_per_each: On input, must be zero. On return, the size of each state allocated,
+ *                so that the caller can split up the returned allocation into
+ *                individual states.
+ *
+ * @addr:         Reserved, must be zero.
+ *
+ * @flags:        Reserved, must be zero.
+ *
+ * The getrandom() vDSO function in userspace requires an opaque state, which
+ * this function allocates by mapping a certain number of special pages into
+ * the calling process. It takes a hint as to the number of opaque states
+ * desired, and provides the caller with the number of opaque states actually
+ * allocated, the size of each one in bytes, and the address of the first
+ * state, which may be split up into @num states of @size_per_each bytes each,
+ * by adding @size_per_each to the returned first state @num times.
+ *
+ * Returns the address of the first state in the allocation on success, or a
+ * negative error value on failure.
+ *
+ * The returned address of the first state may be passed to munmap(2) with a
+ * length of `(size_t)num * (size_t)size_per_each`, in order to deallocate the
+ * memory, after which it is invalid to pass it to vDSO getrandom().

What do you think of that text?

> > Then they're caught holding the bag? This doesn't seem much different
> > from userspace shooting themselves in general, like writing garbage into
> > the allocated states and then trying to use them. If this is something
> > you really, really are concerned about, then maybe my cheesy dumb xor
> > thing mentioned above would be a low effort mitigation here.
> 
> So the MAP_LOCKED is just there to prevent leakage to swap?

Right. I can combine that with MLOCK_ONFAULT and NORESERVED to avoid
having to commit the memory immediately. I've got this in my tree for
v11.

In case you're curious to see the WIP, it's in here:
https://git.zx2c4.com/linux-rng/log/?h=vdso

Jason
diff mbox series

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index 256f03904987..3894f947a507 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17287,6 +17287,7 @@  T:	git https://git.kernel.org/pub/scm/linux/kernel/git/crng/random.git
 S:	Maintained
 F:	drivers/char/random.c
 F:	drivers/virt/vmgenid.c
+F:	include/vdso/getrandom.h
 
 RAPIDIO SUBSYSTEM
 M:	Matt Porter <mporter@kernel.crashing.org>
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 67558b95d531..b81d67f3ebab 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -8,6 +8,7 @@ 
  * into roughly six sections, each with a section header:
  *
  *   - Initialization and readiness waiting.
+ *   - vDSO support helpers.
  *   - Fast key erasure RNG, the "crng".
  *   - Entropy accumulation and extraction routines.
  *   - Entropy collection routines.
@@ -39,6 +40,7 @@ 
 #include <linux/blkdev.h>
 #include <linux/interrupt.h>
 #include <linux/mm.h>
+#include <linux/mman.h>
 #include <linux/nodemask.h>
 #include <linux/spinlock.h>
 #include <linux/kthread.h>
@@ -55,6 +57,9 @@ 
 #include <linux/siphash.h>
 #include <crypto/chacha.h>
 #include <crypto/blake2s.h>
+#ifdef CONFIG_VGETRANDOM_ALLOC_SYSCALL
+#include <vdso/getrandom.h>
+#endif
 #include <asm/processor.h>
 #include <asm/irq.h>
 #include <asm/irq_regs.h>
@@ -167,6 +172,76 @@  int __cold execute_with_initialized_rng(struct notifier_block *nb)
 				__func__, (void *)_RET_IP_, crng_init)
 
 
+
+/********************************************************************
+ *
+ * vDSO support helpers.
+ *
+ * The actual vDSO function is defined over in lib/vdso/getrandom.c,
+ * but this section contains the kernel-mode helpers to support that.
+ *
+ ********************************************************************/
+
+#ifdef CONFIG_VGETRANDOM_ALLOC_SYSCALL
+/**
+ * vgetrandom_alloc - allocate opaque states for use with vDSO getrandom().
+ *
+ * @num: on input, a pointer to a suggested hint of how many states to
+ * allocate, and on output the number of states actually allocated.
+ *
+ * @size_per_each: the size of each state allocated, so that the caller can
+ * split up the returned allocation into individual states.
+ *
+ * @flags: currently always zero.
+ *
+ * The getrandom() vDSO function in userspace requires an opaque state, which
+ * this function allocates by mapping a certain number of special pages into
+ * the calling process. It takes a hint as to the number of opaque states
+ * desired, and provides the caller with the number of opaque states actually
+ * allocated, the size of each one in bytes, and the address of the first
+ * state.
+
+ * Returns a pointer to the first state in the allocation.
+ *
+ */
+SYSCALL_DEFINE3(vgetrandom_alloc, unsigned int __user *, num,
+		unsigned int __user *, size_per_each, unsigned int, flags)
+{
+	const size_t state_size = sizeof(struct vgetrandom_state);
+	size_t alloc_size, num_states;
+	unsigned long pages_addr;
+	unsigned int num_hint;
+	int ret;
+
+	if (flags)
+		return -EINVAL;
+
+	if (get_user(num_hint, num))
+		return -EFAULT;
+
+	num_states = clamp_t(size_t, num_hint, 1, (SIZE_MAX & PAGE_MASK) / state_size);
+	alloc_size = PAGE_ALIGN(num_states * state_size);
+
+	if (put_user(alloc_size / state_size, num) || put_user(state_size, size_per_each))
+		return -EFAULT;
+
+	pages_addr = vm_mmap(NULL, 0, alloc_size, PROT_READ | PROT_WRITE,
+			     MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, 0);
+	if (IS_ERR_VALUE(pages_addr))
+		return pages_addr;
+
+	ret = do_madvise(current->mm, pages_addr, alloc_size, MADV_WIPEONFORK);
+	if (ret < 0)
+		goto err_unmap;
+
+	return pages_addr;
+
+err_unmap:
+	vm_munmap(pages_addr, alloc_size);
+	return ret;
+}
+#endif
+
 /*********************************************************************
  *
  * Fast key erasure RNG, the "crng".
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a34b0f9a9972..7741dc94f10c 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1006,6 +1006,9 @@  asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
 			    void __user *uargs);
 asmlinkage long sys_getrandom(char __user *buf, size_t count,
 			      unsigned int flags);
+asmlinkage long sys_vgetrandom_alloc(unsigned int __user *num,
+				     unsigned int __user *size_per_each,
+				     unsigned int flags);
 asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags);
 asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
 asmlinkage long sys_execveat(int dfd, const char __user *filename,
diff --git a/include/vdso/getrandom.h b/include/vdso/getrandom.h
new file mode 100644
index 000000000000..5f04c8bf4bd4
--- /dev/null
+++ b/include/vdso/getrandom.h
@@ -0,0 +1,24 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#ifndef _VDSO_GETRANDOM_H
+#define _VDSO_GETRANDOM_H
+
+#include <crypto/chacha.h>
+
+struct vgetrandom_state {
+	union {
+		struct {
+			u8 batch[CHACHA_BLOCK_SIZE * 3 / 2];
+			u32 key[CHACHA_KEY_SIZE / sizeof(u32)];
+		};
+		u8 batch_key[CHACHA_BLOCK_SIZE * 2];
+	};
+	unsigned long generation;
+	u8 pos;
+	bool in_use;
+};
+
+#endif /* _VDSO_GETRANDOM_H */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 860b2dcf3ac4..f28196cb919b 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -360,6 +360,9 @@  COND_SYSCALL(pkey_free);
 /* memfd_secret */
 COND_SYSCALL(memfd_secret);
 
+/* random */
+COND_SYSCALL(vgetrandom_alloc);
+
 /*
  * Architecture specific weak syscall entries.
  */
diff --git a/lib/vdso/Kconfig b/lib/vdso/Kconfig
index d883ac299508..b22584f8da03 100644
--- a/lib/vdso/Kconfig
+++ b/lib/vdso/Kconfig
@@ -31,3 +31,10 @@  config GENERIC_VDSO_TIME_NS
 	  VDSO
 
 endif
+
+config VGETRANDOM_ALLOC_SYSCALL
+	bool
+	select ADVISE_SYSCALLS
+	help
+	  Selected by the getrandom() vDSO function, which requires this
+	  for state allocation.