diff mbox series

[v7,1/3] random: add vgetrandom_alloc() syscall

Message ID 20221124165536.1631325-2-Jason@zx2c4.com
State New
Headers show
Series [v7,1/3] random: add vgetrandom_alloc() syscall | expand

Commit Message

Jason A. Donenfeld Nov. 24, 2022, 4:55 p.m. UTC
The vDSO getrandom() works over an opaque per-thread state of an
unexported size, which must be marked as MADV_WIPEONFORK and be
mlock()'d for proper operation. Over time, the nuances of these
allocations may change or grow or even differ based on architectural
features.

The syscall has the signature:

  void *vgetrandom_alloc([inout] unsigned int *num,
                         [out] unsigned int *size_per_each,
                         unsigned int flags);

This takes the desired number of opaque states in `num`, and returns a
pointer to an array of opaque states, the number actually allocated back
in `num`, and the size in bytes of each one in `size_per_each`, enabling
a libc to slice up the returned array into a state per each thread. (The
`flags` argument is always zero for now.) Libc is expected to allocate a
chunk of these on first use, and then dole them out to threads as
they're created, allocating more when needed. The following commit shows
an example of this, being used in conjunction with the getrandom() vDSO
function.

We very intentionally do *not* leave state allocation for vDSO
getrandom() up to userspace itself, but rather provide this new syscall
for such allocations. vDSO getrandom() must not store its state in just
any old memory address, but rather just ones that the kernel specially
allocates for it, leaving the particularities of those allocations up to
the kernel.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 MAINTAINERS                             |  1 +
 arch/x86/Kconfig                        |  1 +
 arch/x86/entry/syscalls/syscall_64.tbl  |  1 +
 arch/x86/include/asm/unistd.h           |  1 +
 drivers/char/random.c                   | 59 +++++++++++++++++++++++++
 include/uapi/asm-generic/unistd.h       |  7 ++-
 kernel/sys_ni.c                         |  3 ++
 lib/vdso/getrandom.h                    | 23 ++++++++++
 scripts/checksyscalls.sh                |  4 ++
 tools/include/uapi/asm-generic/unistd.h |  7 ++-
 10 files changed, 105 insertions(+), 2 deletions(-)
 create mode 100644 lib/vdso/getrandom.h

Comments

Jason A. Donenfeld Nov. 27, 2022, 8:18 p.m. UTC | #1
Hi Thomas,

Thanks a lot for the thorough review, here, and in the other two emails.
I appreciate you taking the time to look at it, and my apologies for
parts that are unclear or sloppy or otherwise unpolished. I'll try to
make v8 a lot better.

Comments inline below:

On Fri, Nov 25, 2022 at 09:45:31PM +0100, Thomas Gleixner wrote:
> On Thu, Nov 24 2022 at 17:55, Jason A. Donenfeld wrote:
> > ---
> >  MAINTAINERS                             |  1 +
> >  arch/x86/Kconfig                        |  1 +
> >  arch/x86/entry/syscalls/syscall_64.tbl  |  1 +
> >  arch/x86/include/asm/unistd.h           |  1 +
> >  drivers/char/random.c                   | 59 +++++++++++++++++++++++++
> >  include/uapi/asm-generic/unistd.h       |  7 ++-
> >  kernel/sys_ni.c                         |  3 ++
> >  lib/vdso/getrandom.h                    | 23 ++++++++++
> >  scripts/checksyscalls.sh                |  4 ++
> >  tools/include/uapi/asm-generic/unistd.h |  7 ++-
> >  10 files changed, 105 insertions(+), 2 deletions(-)
> >  create mode 100644 lib/vdso/getrandom.h
> 
> I think I asked for this before:
> 
> Please split these things properly up. Provide the syscall and then wire
> it up.

Before I split it into "syscall, generic vdso, x86 vdso", as that's how
I interpreted your email. Next, I'll split it up into "generic syscall,
generic vdso, x86 vdso & syscall", since enabling the syscall without
the vdso function, or vice-versa, doesn't make sense, and having that
last step be all at once there will provide an easy thing for other
archs to look at.

> > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> > index 67745ceab0db..331e21ba961a 100644
> > --- a/arch/x86/Kconfig
> > +++ b/arch/x86/Kconfig
> > @@ -59,6 +59,7 @@ config X86
> >  	#
> >  	select ACPI_LEGACY_TABLES_LOOKUP	if ACPI
> >  	select ACPI_SYSTEM_POWER_STATES_SUPPORT	if ACPI
> > +	select ADVISE_SYSCALLS			if X86_64
> 
> Why is this x86_64 specific?
> 
> > --- a/arch/x86/include/asm/unistd.h
> > +++ b/arch/x86/include/asm/unistd.h
> > @@ -27,6 +27,7 @@
> >  #  define __ARCH_WANT_COMPAT_SYS_PWRITEV64
> >  #  define __ARCH_WANT_COMPAT_SYS_PREADV64V2
> >  #  define __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
> > +#  define __ARCH_WANT_VGETRANDOM_ALLOC
> 
> So instead of this define, why can't you do:
> 
> config VGETRADOM_ALLOC
>        bool
>        select ADVISE_SYSCALLS
> 
> and then have
> 
> config GENERIC_VDSO_RANDOM_WHATEVER
>        bool
>        select VGETRANDOM_ALLOC
> 
> This gives a clear Kconfig dependency instead of the random
> ADVISE_SYSCALLS select.

That's much better indeed. I was trying to straddle the two conventions
of `#define __ARCH_...` for syscalls and a Kconfig for vDSO functions,
but doing it all together as you've suggested is nicer.

I'll try to figure this out, though so far futzing around suggests there
might have to be both, because of unistd.h being a userspace header.
That is, include/uapi/asm-generic/unistd.h typically needs a `#if
__ARCH_WANT..., #define ...` in it. I'll give it a spin and you'll see
for v8. At the very least it should get rid of the more awkward
`select ADVISE_SYSCALLS if X86_64` part, and will better separate the
arch code from non-arch code.

> 
> >--- a/drivers/char/random.c
> > +++ b/drivers/char/random.c
> 
> > +#include "../../lib/vdso/getrandom.h"
> 
> Seriously?
> 
> include/vdso/ exists for a reason.

Er, yes, thanks.

> 
> > +#ifdef __ARCH_WANT_VGETRANDOM_ALLOC
> > +/*
> > + * The vgetrandom() function in userspace requires an opaque state, which this
> > + * function provides to userspace, by mapping a certain number of special pages
> > + * into the calling process. It takes a hint as to the number of opaque states
> > + * desired, and returns the number of opaque states actually allocated, the
> > + * size of each one in bytes, and the address of the first state.
> 
> As this is a syscall which can be invoked outside of the VDSO, can you
> please provide proper kernel-doc which explains the arguments, the
> functionality and the return value?

Yes, will do.

> 
> > + */
> > +SYSCALL_DEFINE3(vgetrandom_alloc, unsigned int __user *, num,
> > +		unsigned int __user *, size_per_each, unsigned int, flags)
> > +{
> > +	size_t alloc_size, num_states;
> > +	unsigned long pages_addr;
> > +	unsigned int num_hint;
> > +	int ret;
> > +
> > +	if (flags)
> > +		return -EINVAL;
> > +
> > +	if (get_user(num_hint, num))
> > +		return -EFAULT;
> > +
> > +	num_states = clamp_t(size_t, num_hint, 1, (SIZE_MAX & PAGE_MASK) / sizeof(struct vgetrandom_state));
> > +	alloc_size = PAGE_ALIGN(num_states * sizeof(struct vgetrandom_state));
> > +
> > +	if (put_user(alloc_size / sizeof(struct vgetrandom_state), num) ||
> > +	    put_user(sizeof(struct vgetrandom_state), size_per_each))
> > +		return -EFAULT;
> 
> That's a total of four sizeof(struct vgetrandom_state) usage sites.
> 
>        size_t state_size = sizeof(struct vgetrandom_state);
> 
> perhaps?

Not my style -- I like to have the constant expression at the usage site
so I don't have to remember the variable -- but I'm fine going with your
suggestion, so I'll do that for v8.

Jason
Jason A. Donenfeld Nov. 28, 2022, 5:17 p.m. UTC | #2
Hi Arnd,

On Mon, Nov 28, 2022 at 02:54:39PM +0100, Arnd Bergmann wrote:
> On Sun, Nov 27, 2022, at 21:18, Jason A. Donenfeld wrote:
> >> 
> >> config GENERIC_VDSO_RANDOM_WHATEVER
> >>        bool
> >>        select VGETRANDOM_ALLOC
> >> 
> >> This gives a clear Kconfig dependency instead of the random
> >> ADVISE_SYSCALLS select.
> >
> > That's much better indeed. I was trying to straddle the two conventions
> > of `#define __ARCH_...` for syscalls and a Kconfig for vDSO functions,
> > but doing it all together as you've suggested is nicer.
> >
> > I'll try to figure this out, though so far futzing around suggests there
> > might have to be both, because of unistd.h being a userspace header.
> > That is, include/uapi/asm-generic/unistd.h typically needs a `#if
> > __ARCH_WANT..., #define ...` in it. I'll give it a spin and you'll see
> > for v8. At the very least it should get rid of the more awkward
> > `select ADVISE_SYSCALLS if X86_64` part, and will better separate the
> > arch code from non-arch code.
> 
> I think you should not need an __ARCH_WANT_SYS_* symbol for this,
> the only place we actually need them for is the asm-generic/unistd.h
> header which is still used on a couple of architectures (I have
> an experimental series for replacing it with a generic syscall.tbl
> file, but it's not ready for 6.2). In most cases, the __ARCH_WANT_SYS_*
> symbols are only used for syscalls that are part of the table for
> old architectures but get skipped on newer targets that always had
> a replacement syscalls (e.g. getrlimit getting replaced by prlimit64)
> 
> I think we should just reserve the syscall number for all architectures
> right away and #define the __NR_* macro. libc will generally need
> a runtime check anyway, and defining it now avoids the problem of
> the tables getting out of sync.
> 
> The Kconfig symbol is fine in this case.

Oh, great, okay. I'll get rid of the __ARCH stuff entirely then. I
jumped the gun and posted v8 earlier today, but I'll include this in a
v9, whenever it makes sense to send that. So when reading v8, just
assume all he __ARCH_WANT_SYS_* business has been removed.

Jason
diff mbox series

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index 256f03904987..843dd6a49538 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17287,6 +17287,7 @@  T:	git https://git.kernel.org/pub/scm/linux/kernel/git/crng/random.git
 S:	Maintained
 F:	drivers/char/random.c
 F:	drivers/virt/vmgenid.c
+F:	lib/vdso/getrandom.h
 
 RAPIDIO SUBSYSTEM
 M:	Matt Porter <mporter@kernel.crashing.org>
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 67745ceab0db..331e21ba961a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -59,6 +59,7 @@  config X86
 	#
 	select ACPI_LEGACY_TABLES_LOOKUP	if ACPI
 	select ACPI_SYSTEM_POWER_STATES_SUPPORT	if ACPI
+	select ADVISE_SYSCALLS			if X86_64
 	select ARCH_32BIT_OFF_T			if X86_32
 	select ARCH_CLOCKSOURCE_INIT
 	select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index c84d12608cd2..0186f173f0e8 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -372,6 +372,7 @@ 
 448	common	process_mrelease	sys_process_mrelease
 449	common	futex_waitv		sys_futex_waitv
 450	common	set_mempolicy_home_node	sys_set_mempolicy_home_node
+451	common	vgetrandom_alloc	sys_vgetrandom_alloc
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
index 761173ccc33c..1bf509eaeff1 100644
--- a/arch/x86/include/asm/unistd.h
+++ b/arch/x86/include/asm/unistd.h
@@ -27,6 +27,7 @@ 
 #  define __ARCH_WANT_COMPAT_SYS_PWRITEV64
 #  define __ARCH_WANT_COMPAT_SYS_PREADV64V2
 #  define __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
+#  define __ARCH_WANT_VGETRANDOM_ALLOC
 #  define X32_NR_syscalls (__NR_x32_syscalls)
 #  define IA32_NR_syscalls (__NR_ia32_syscalls)
 
diff --git a/drivers/char/random.c b/drivers/char/random.c
index a2a18bd3d7d7..71db7b787a60 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -8,6 +8,7 @@ 
  * into roughly six sections, each with a section header:
  *
  *   - Initialization and readiness waiting.
+ *   - vDSO support helpers.
  *   - Fast key erasure RNG, the "crng".
  *   - Entropy accumulation and extraction routines.
  *   - Entropy collection routines.
@@ -39,6 +40,7 @@ 
 #include <linux/blkdev.h>
 #include <linux/interrupt.h>
 #include <linux/mm.h>
+#include <linux/mman.h>
 #include <linux/nodemask.h>
 #include <linux/spinlock.h>
 #include <linux/kthread.h>
@@ -59,6 +61,7 @@ 
 #include <asm/irq.h>
 #include <asm/irq_regs.h>
 #include <asm/io.h>
+#include "../../lib/vdso/getrandom.h"
 
 /*********************************************************************
  *
@@ -167,6 +170,62 @@  int __cold execute_with_initialized_rng(struct notifier_block *nb)
 				__func__, (void *)_RET_IP_, crng_init)
 
 
+
+/********************************************************************
+ *
+ * vDSO support helpers.
+ *
+ * The actual vDSO function is defined over in lib/vdso/getrandom.c,
+ * but this section contains the kernel-mode helpers to support that.
+ *
+ ********************************************************************/
+
+#ifdef __ARCH_WANT_VGETRANDOM_ALLOC
+/*
+ * The vgetrandom() function in userspace requires an opaque state, which this
+ * function provides to userspace, by mapping a certain number of special pages
+ * into the calling process. It takes a hint as to the number of opaque states
+ * desired, and returns the number of opaque states actually allocated, the
+ * size of each one in bytes, and the address of the first state.
+ */
+SYSCALL_DEFINE3(vgetrandom_alloc, unsigned int __user *, num,
+		unsigned int __user *, size_per_each, unsigned int, flags)
+{
+	size_t alloc_size, num_states;
+	unsigned long pages_addr;
+	unsigned int num_hint;
+	int ret;
+
+	if (flags)
+		return -EINVAL;
+
+	if (get_user(num_hint, num))
+		return -EFAULT;
+
+	num_states = clamp_t(size_t, num_hint, 1, (SIZE_MAX & PAGE_MASK) / sizeof(struct vgetrandom_state));
+	alloc_size = PAGE_ALIGN(num_states * sizeof(struct vgetrandom_state));
+
+	if (put_user(alloc_size / sizeof(struct vgetrandom_state), num) ||
+	    put_user(sizeof(struct vgetrandom_state), size_per_each))
+		return -EFAULT;
+
+	pages_addr = vm_mmap(NULL, 0, alloc_size, PROT_READ | PROT_WRITE,
+			     MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, 0);
+	if (IS_ERR_VALUE(pages_addr))
+		return pages_addr;
+
+	ret = do_madvise(current->mm, pages_addr, alloc_size, MADV_WIPEONFORK);
+	if (ret < 0)
+		goto err_unmap;
+
+	return pages_addr;
+
+err_unmap:
+	vm_munmap(pages_addr, alloc_size);
+	return ret;
+}
+#endif
+
 /*********************************************************************
  *
  * Fast key erasure RNG, the "crng".
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 45fa180cc56a..77b6debe7e18 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -886,8 +886,13 @@  __SYSCALL(__NR_futex_waitv, sys_futex_waitv)
 #define __NR_set_mempolicy_home_node 450
 __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
 
+#ifdef __ARCH_WANT_VGETRANDOM_ALLOC
+#define __NR_vgetrandom_alloc 451
+__SYSCALL(__NR_vgetrandom_alloc, sys_vgetrandom_alloc)
+#endif
+
 #undef __NR_syscalls
-#define __NR_syscalls 451
+#define __NR_syscalls 452
 
 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 860b2dcf3ac4..f28196cb919b 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -360,6 +360,9 @@  COND_SYSCALL(pkey_free);
 /* memfd_secret */
 COND_SYSCALL(memfd_secret);
 
+/* random */
+COND_SYSCALL(vgetrandom_alloc);
+
 /*
  * Architecture specific weak syscall entries.
  */
diff --git a/lib/vdso/getrandom.h b/lib/vdso/getrandom.h
new file mode 100644
index 000000000000..c7f727db2aaa
--- /dev/null
+++ b/lib/vdso/getrandom.h
@@ -0,0 +1,23 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#ifndef _VDSO_LIB_GETRANDOM_H
+#define _VDSO_LIB_GETRANDOM_H
+
+#include <crypto/chacha.h>
+
+struct vgetrandom_state {
+	union {
+		struct {
+			u8 batch[CHACHA_BLOCK_SIZE * 3 / 2];
+			u32 key[CHACHA_KEY_SIZE / sizeof(u32)];
+		};
+		u8 batch_key[CHACHA_BLOCK_SIZE * 2];
+	};
+	unsigned long generation;
+	u8 pos;
+};
+
+#endif /* _VDSO_LIB_GETRANDOM_H */
diff --git a/scripts/checksyscalls.sh b/scripts/checksyscalls.sh
index f33e61aca93d..7f7928c6487f 100755
--- a/scripts/checksyscalls.sh
+++ b/scripts/checksyscalls.sh
@@ -44,6 +44,10 @@  cat << EOF
 #define __IGNORE_memfd_secret
 #endif
 
+#ifndef __ARCH_WANT_VGETRANDOM_ALLOC
+#define __IGNORE_vgetrandom_alloc
+#endif
+
 /* Missing flags argument */
 #define __IGNORE_renameat	/* renameat2 */
 
diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h
index 45fa180cc56a..77b6debe7e18 100644
--- a/tools/include/uapi/asm-generic/unistd.h
+++ b/tools/include/uapi/asm-generic/unistd.h
@@ -886,8 +886,13 @@  __SYSCALL(__NR_futex_waitv, sys_futex_waitv)
 #define __NR_set_mempolicy_home_node 450
 __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
 
+#ifdef __ARCH_WANT_VGETRANDOM_ALLOC
+#define __NR_vgetrandom_alloc 451
+__SYSCALL(__NR_vgetrandom_alloc, sys_vgetrandom_alloc)
+#endif
+
 #undef __NR_syscalls
-#define __NR_syscalls 451
+#define __NR_syscalls 452
 
 /*
  * 32 bit systems traditionally used different