@@ -43,8 +43,24 @@ int gcs_alloc_thread_stack(struct task_struct *tsk,
{
unsigned long addr, size;
- if (!system_supports_gcs())
+ if (!system_supports_gcs()) {
+ if (args->shadow_stack_pointer)
+ return -EINVAL;
+
+ return 0;
+ }
+
+ /*
+ * If the user specified a GCS then use it, otherwise fall
+ * back to a default allocation strategy. Validation is done
+ * in arch_shstk_validate_clone().
+ */
+ if (args->shadow_stack_pointer) {
+ tsk->thread.gcs_base = 0;
+ tsk->thread.gcs_size = 0;
+ tsk->thread.gcspr_el0 = args->shadow_stack_pointer;
return 0;
+ }
if (!task_gcs_el0_enabled(tsk))
return 0;
@@ -68,6 +84,42 @@ int gcs_alloc_thread_stack(struct task_struct *tsk,
return 0;
}
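+/*
+ * Check that the GCS cap token at @user_addr holds the cap value
+ * expected for that address and, if so, atomically replace it with
+ * zero to consume it.
+ */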
+static bool gcs_consume_token(struct vm_area_struct *vma, struct page *page,
+ unsigned long user_addr)
+{
+ u64 expected = GCS_CAP(user_addr);
+ u64 *token = page_address(page) + offset_in_page(user_addr);
+
+ if (!cmpxchg_to_user_page(vma, page, user_addr, token, expected, 0))
+ return false;
+ set_page_dirty_lock(page);
+
+ return true;
+}
+
+int arch_shstk_validate_clone(struct task_struct *tsk,
+ struct vm_area_struct *vma,
+ struct page *page,
+ struct kernel_clone_args *args)
+{
+ unsigned long gcspr_el0;
+ int ret = 0;
+
+ /* Ensure that a token written as a result of a pivot is visible */
+ gcsb_dsync();
+
+ gcspr_el0 = args->shadow_stack_pointer;
+ if (!gcs_consume_token(vma, page, gcspr_el0))
+ return -EINVAL;
+
+ tsk->thread.gcspr_el0 = gcspr_el0 + sizeof(u64);
+
+ /* Ensure that our token consumption is visible */
+ gcsb_dsync();
+
+ return ret;
+}
+
SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags)
{
unsigned long alloc_size;
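For orientation, here is a hypothetical userspace sketch (not part of the patch) of the layout gcs_consume_token() expects on arm64: shadow_stack_pointer is the address of the cap itself, for example the token that map_shadow_stack() writes into the top eight bytes of a new stack when called with only SHADOW_STACK_SET_TOKEN; after the child consumes it, GCSPR_EL0 points just above the cleared cap.

#include <unistd.h>
#include <sys/syscall.h>
#include <asm/mman.h>		/* SHADOW_STACK_SET_TOKEN */

/* Hypothetical helper illustrating the expected arm64 layout only */
static unsigned long alloc_gcs_for_clone3(unsigned long size)
{
	long gcs = syscall(__NR_map_shadow_stack, 0UL, size,
			   SHADOW_STACK_SET_TOKEN);

	if (gcs == -1)
		return 0;

	/* clone_args.shadow_stack_pointer is the address of the cap */
	return (unsigned long)gcs + size - 8;
}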
@@ -6,6 +6,7 @@
#include <linux/types.h>
struct task_struct;
+struct kernel_clone_args;
struct ksignal;
#ifdef CONFIG_X86_USER_SHADOW_STACK
@@ -16,8 +17,8 @@ struct thread_shstk {
long shstk_prctl(struct task_struct *task, int option, unsigned long arg2);
void reset_thread_features(void);
-unsigned long shstk_alloc_thread_stack(struct task_struct *p, unsigned long clone_flags,
- unsigned long stack_size);
+unsigned long shstk_alloc_thread_stack(struct task_struct *p,
+ const struct kernel_clone_args *args);
void shstk_free(struct task_struct *p);
int setup_signal_shadow_stack(struct ksignal *ksig);
int restore_signal_shadow_stack(void);
@@ -28,8 +29,10 @@ static inline long shstk_prctl(struct task_struct *task, int option,
unsigned long arg2) { return -EINVAL; }
static inline void reset_thread_features(void) {}
static inline unsigned long shstk_alloc_thread_stack(struct task_struct *p,
- unsigned long clone_flags,
- unsigned long stack_size) { return 0; }
+ const struct kernel_clone_args *args)
+{
+ return 0;
+}
static inline void shstk_free(struct task_struct *p) {}
static inline int setup_signal_shadow_stack(struct ksignal *ksig) { return 0; }
static inline int restore_signal_shadow_stack(void) { return 0; }
@@ -207,7 +207,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
* is disabled, new_ssp will remain 0, and fpu_clone() will know not to
* update it.
*/
- new_ssp = shstk_alloc_thread_stack(p, clone_flags, args->stack_size);
+ new_ssp = shstk_alloc_thread_stack(p, args);
if (IS_ERR_VALUE(new_ssp))
return PTR_ERR((void *)new_ssp);
@@ -191,18 +191,65 @@ void reset_thread_features(void)
current->thread.features_locked = 0;
}
-unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long clone_flags,
- unsigned long stack_size)
+int arch_shstk_validate_clone(struct task_struct *tsk,
+ struct vm_area_struct *vma,
+ struct page *page,
+ struct kernel_clone_args *args)
+{
+ unsigned long addr, ssp;
+ u64 expected;
+ void *maddr;
+ int offset;
+
+ if (!features_enabled(ARCH_SHSTK_SHSTK))
+ return 0;
+
+ /*
+ * SSP is aligned, so the reserved bits and the mode bit are zero;
+ * just mark the token 64-bit.
+ */
+ ssp = args->shadow_stack_pointer;
+ addr = ssp - SS_FRAME_SIZE;
+ expected = ssp | BIT(0);
+ offset = offset_in_page(addr);
+
+ maddr = kmap_local_page(page);
+ if (!cmpxchg_to_user_page(vma, page, addr, (unsigned long *)(maddr + offset),
+ expected, 0)) {
+ kunmap_local(maddr);
+ return -EINVAL;
+ }
+ kunmap_local(maddr);
+
+ set_page_dirty_lock(page);
+
+ return 0;
+}
+
+unsigned long shstk_alloc_thread_stack(struct task_struct *tsk,
+ const struct kernel_clone_args *args)
{
struct thread_shstk *shstk = &tsk->thread.shstk;
+ unsigned long clone_flags = args->flags;
unsigned long addr, size;
/*
* If shadow stack is not enabled on the new thread, skip any
- * switch to a new shadow stack.
+ * implicit switch to a new shadow stack and reject attempts to
+ * explicitly specify one.
*/
- if (!features_enabled(ARCH_SHSTK_SHSTK))
+ if (!features_enabled(ARCH_SHSTK_SHSTK)) {
+ if (args->shadow_stack_pointer)
+ return (unsigned long)ERR_PTR(-EINVAL);
+
return 0;
+ }
+
+ /*
+ * If the user specified a shadow stack then use it, otherwise
+ * fall back to a default allocation strategy. Validation is
+ * done in arch_shstk_validate_clone().
+ */
+ if (args->shadow_stack_pointer) {
+ shstk->base = 0;
+ shstk->size = 0;
+ return args->shadow_stack_pointer;
+ }
/*
* For CLONE_VFORK the child will share the parents shadow stack.
@@ -222,7 +269,7 @@ unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long cl
if (!(clone_flags & CLONE_VM))
return 0;
- size = adjust_shstk_size(stack_size);
+ size = adjust_shstk_size(args->stack_size);
addr = alloc_shstk(0, size, 0, false);
if (IS_ERR_VALUE(addr))
return addr;
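The equivalent hypothetical sketch for x86 (again purely illustrative, same headers as the arm64 sketch above): here shadow_stack_pointer is the new SSP, and arch_shstk_validate_clone() expects the restore token one frame below it to hold that SSP with bit 0 set, which is what map_shadow_stack() with SHADOW_STACK_SET_TOKEN leaves in the top eight bytes of the allocation.

/* Hypothetical helper illustrating the expected x86 layout only */
static unsigned long alloc_shstk_for_clone3(unsigned long size)
{
	long ss = syscall(__NR_map_shadow_stack, 0UL, size,
			  SHADOW_STACK_SET_TOKEN);

	if (ss == -1)
		return 0;

	/* Token at (ss + size - 8) holds (ss + size) | 1 */
	return (unsigned long)ss + size;
}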
@@ -124,4 +124,15 @@ static inline void flush_cache_vunmap(unsigned long start, unsigned long end)
} while (0)
#endif
+#ifndef cmpxchg_to_user_page
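+/*
+ * Generic fallback: compare-and-exchange a value in a user page via its
+ * kernel mapping, with the same cache maintenance as the generic
+ * copy_to_user_page(). Architectures with stronger requirements can
+ * provide their own implementation.
+ */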
+#define cmpxchg_to_user_page(vma, page, vaddr, ptr, old, new) \
+({ \
+ bool ret; \
+ \
+ ret = try_cmpxchg(ptr, &old, new); \
+ flush_icache_user_page(vma, page, vaddr, sizeof(*ptr)); \
+ ret; \
+})
+#endif
+
#endif /* _ASM_GENERIC_CACHEFLUSH_H */
@@ -16,6 +16,7 @@ struct task_struct;
struct rusage;
union thread_union;
struct css_set;
+struct vm_area_struct;
/* All the bits taken by the old clone syscall. */
#define CLONE_LEGACY_FLAGS 0xffffffffULL
@@ -43,6 +44,7 @@ struct kernel_clone_args {
void *fn_arg;
struct cgroup *cgrp;
struct css_set *cset;
+ unsigned long shadow_stack_pointer;
};
/*
@@ -236,4 +238,19 @@ static inline void task_unlock(struct task_struct *p)
DEFINE_GUARD(task_lock, struct task_struct *, task_lock(_T), task_unlock(_T))
+#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK
+int arch_shstk_validate_clone(struct task_struct *p,
+ struct vm_area_struct *vma,
+ struct page *page,
+ struct kernel_clone_args *args);
+#else
+static inline int arch_shstk_validate_clone(struct task_struct *p,
+ struct vm_area_struct *vma,
+ struct page *page,
+ struct kernel_clone_args *args)
+{
+ return 0;
+}
+#endif
+
#endif /* _LINUX_SCHED_TASK_H */
@@ -84,6 +84,8 @@
* kernel's limit of nested PID namespaces.
* @cgroup: If CLONE_INTO_CGROUP is specified set this to
* a file descriptor for the cgroup.
+ * @shadow_stack_pointer: Value to use for the shadow stack pointer in
+ * the child process.
*
* The structure is versioned by size and thus extensible.
* New struct members must go at the end of the struct and
@@ -101,12 +103,14 @@ struct clone_args {
__aligned_u64 set_tid;
__aligned_u64 set_tid_size;
__aligned_u64 cgroup;
+ __aligned_u64 shadow_stack_pointer;
};
#endif
-#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
-#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
-#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
+#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
+#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
+#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
+#define CLONE_ARGS_SIZE_VER3 96 /* sizeof fourth published struct */
/*
* Scheduling policies
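To show how the new field ties together with the per-architecture sketches above, a hedged userspace example follows (hypothetical wrapper, not part of the patch; assumes <linux/sched.h>, <sched.h>, <stdint.h>, <unistd.h> and <sys/syscall.h>):

/*
 * Hypothetical wrapper: thread_stack is an ordinary stack allocation and
 * shadow_stack_pointer was produced as in the earlier sketches, so the
 * token is already in place for the kernel to consume.
 */
static pid_t spawn_with_shadow_stack(void *thread_stack, size_t stack_size,
				     unsigned long shadow_stack_pointer)
{
	struct clone_args args = {
		.flags			= CLONE_VM,	/* thread flags abbreviated */
		.stack			= (__u64)(uintptr_t)thread_stack,
		.stack_size		= stack_size,
		.shadow_stack_pointer	= shadow_stack_pointer,
	};

	/*
	 * Kernels that predate the field fail with E2BIG (non-zero bytes
	 * beyond their known struct size); kernels without shadow stack
	 * support fail clone3_shadow_stack_valid() with EINVAL.
	 */
	return syscall(__NR_clone3, &args, sizeof(args));
}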
@@ -2128,6 +2128,51 @@ static void rv_task_fork(struct task_struct *p)
#define rv_task_fork(p) do {} while (0)
#endif
+static int shstk_validate_clone(struct task_struct *p,
+ struct kernel_clone_args *args)
+{
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ struct page *page;
+ unsigned long addr;
+ int ret;
+
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_USER_SHADOW_STACK))
+ return 0;
+
+ if (!args->shadow_stack_pointer)
+ return 0;
+
+ mm = get_task_mm(p);
+ if (!mm)
+ return -EFAULT;
+
+ mmap_read_lock(mm);
+
+ addr = untagged_addr_remote(mm, args->shadow_stack_pointer);
+ page = get_user_page_vma_remote(mm, addr, FOLL_FORCE | FOLL_WRITE,
+ &vma);
+ if (IS_ERR(page)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (!(vma->vm_flags & VM_SHADOW_STACK) ||
+ !(vma->vm_flags & VM_WRITE)) {
+ ret = -EFAULT;
+ goto out_page;
+ }
+
+ ret = arch_shstk_validate_clone(p, vma, page, args);
+
+out_page:
+ put_page(page);
+out:
+ mmap_read_unlock(mm);
+ mmput(mm);
+ return ret;
+}
+
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
@@ -2402,6 +2447,9 @@ __latent_entropy struct task_struct *copy_process(
if (retval)
goto bad_fork_cleanup_namespaces;
retval = copy_thread(p, args);
+ if (retval)
+ goto bad_fork_cleanup_io;
+ retval = shstk_validate_clone(p, args);
if (retval)
goto bad_fork_cleanup_io;
@@ -2965,7 +3013,9 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
CLONE_ARGS_SIZE_VER1);
BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
CLONE_ARGS_SIZE_VER2);
- BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);
+ BUILD_BUG_ON(offsetofend(struct clone_args, shadow_stack_pointer) !=
+ CLONE_ARGS_SIZE_VER3);
+ BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER3);
if (unlikely(usize > PAGE_SIZE))
return -E2BIG;
@@ -2998,16 +3048,17 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
return -EINVAL;
*kargs = (struct kernel_clone_args){
- .flags = args.flags,
- .pidfd = u64_to_user_ptr(args.pidfd),
- .child_tid = u64_to_user_ptr(args.child_tid),
- .parent_tid = u64_to_user_ptr(args.parent_tid),
- .exit_signal = args.exit_signal,
- .stack = args.stack,
- .stack_size = args.stack_size,
- .tls = args.tls,
- .set_tid_size = args.set_tid_size,
- .cgroup = args.cgroup,
+ .flags = args.flags,
+ .pidfd = u64_to_user_ptr(args.pidfd),
+ .child_tid = u64_to_user_ptr(args.child_tid),
+ .parent_tid = u64_to_user_ptr(args.parent_tid),
+ .exit_signal = args.exit_signal,
+ .stack = args.stack,
+ .stack_size = args.stack_size,
+ .tls = args.tls,
+ .set_tid_size = args.set_tid_size,
+ .cgroup = args.cgroup,
+ .shadow_stack_pointer = args.shadow_stack_pointer,
};
if (args.set_tid &&
@@ -3048,6 +3099,27 @@ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
return true;
}
+/**
+ * clone3_shadow_stack_valid - check clone3() shadow stack arguments
+ * @kargs: kernel clone args
+ *
+ * Verify that a shadow stack is only specified when supported and that
+ * the supplied pointer is suitably aligned.
+ */
+static inline bool clone3_shadow_stack_valid(struct kernel_clone_args *kargs)
+{
+ if (!kargs->shadow_stack_pointer)
+ return true;
+
+ if (!IS_ALIGNED(kargs->shadow_stack_pointer, sizeof(void *)))
+ return false;
+
+ /*
+ * Whether shadow stacks are actually supported on this
+ * machine is left to the architecture code to check.
+ */
+ return IS_ENABLED(CONFIG_ARCH_HAS_USER_SHADOW_STACK);
+}
+
static bool clone3_args_valid(struct kernel_clone_args *kargs)
{
/* Verify that no unknown flags are passed along. */
@@ -3070,7 +3142,7 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
kargs->exit_signal)
return false;
- if (!clone3_stack_valid(kargs))
+ if (!clone3_stack_valid(kargs) || !clone3_shadow_stack_valid(kargs))
return false;
return true;