diff mbox series

[1/4] mm/vmalloc: allow arch-specific vmalloc_node overrides

Message ID 20240220203256.31153-2-mbland@motorola.com
State New
Headers show
Series arm64: mm: support dynamic vmalloc/pmd configuration | expand

Commit Message

Maxwell Bland Feb. 20, 2024, 8:32 p.m. UTC
Present non-uniform use of __vmalloc_node and __vmalloc_node_range makes
enforcing appropriate code and data seperation untenable on certain
microarchitectures, as VMALLOC_START and VMALLOC_END are monolithic
while the use of the vmalloc interface is non-monolithic: in particular,
appropriate randomness in ASLR makes it such that code regions must fall
in some region between VMALLOC_START and VMALLOC_end, but this
necessitates that code pages are intermingled with data pages, meaning
code-specific protections, such as arm64's PXNTable, cannot be
performantly runtime enforced.

The solution to this problem allows architectures to override the
vmalloc wrapper functions by enforcing that the rest of the kernel does
not reimplement __vmalloc_node by using __vmalloc_node_range with the
same parameters as __vmalloc_node or provides a __weak tag to those
functions using __vmalloc_node_range with parameters repeating those of
__vmalloc_node.

Two benefits of this approach are (1) greater flexibility to each
architecture for handling of virtual memory while not compromising the
kernel's vmalloc logic and (2) more uniform use of the __vmalloc_node
interface, reserving the more specialized __vmalloc_node_range for more
specialized cases, such as kasan's shadow memory.

Signed-off-by: Maxwell Bland <mbland@motorola.com>
---
 arch/arm/kernel/irq.c               |  2 +-
 arch/arm64/include/asm/vmap_stack.h |  2 +-
 arch/arm64/kernel/efi.c             |  2 +-
 arch/powerpc/kernel/irq.c           |  2 +-
 arch/riscv/include/asm/irq_stack.h  |  2 +-
 arch/s390/hypfs/hypfs_diag.c        |  2 +-
 arch/s390/kernel/setup.c            |  6 ++---
 arch/s390/kernel/sthyi.c            |  2 +-
 include/linux/vmalloc.h             | 15 ++++++++++-
 kernel/bpf/syscall.c                |  4 +--
 kernel/fork.c                       |  4 +--
 kernel/scs.c                        |  3 +--
 lib/objpool.c                       |  2 +-
 lib/test_vmalloc.c                  |  6 ++---
 mm/util.c                           |  3 +--
 mm/vmalloc.c                        | 39 +++++++++++------------------
 16 files changed, 47 insertions(+), 49 deletions(-)

Comments

Christophe Leroy Feb. 21, 2024, 6:59 a.m. UTC | #1
Le 20/02/2024 à 21:32, Maxwell Bland a écrit :
> [Vous ne recevez pas souvent de courriers de mbland@motorola.com. Découvrez pourquoi ceci est important à https://aka.ms/LearnAboutSenderIdentification ]
> 
> Present non-uniform use of __vmalloc_node and __vmalloc_node_range makes
> enforcing appropriate code and data seperation untenable on certain
> microarchitectures, as VMALLOC_START and VMALLOC_END are monolithic
> while the use of the vmalloc interface is non-monolithic: in particular,
> appropriate randomness in ASLR makes it such that code regions must fall
> in some region between VMALLOC_START and VMALLOC_end, but this
> necessitates that code pages are intermingled with data pages, meaning
> code-specific protections, such as arm64's PXNTable, cannot be
> performantly runtime enforced.
> 
> The solution to this problem allows architectures to override the
> vmalloc wrapper functions by enforcing that the rest of the kernel does
> not reimplement __vmalloc_node by using __vmalloc_node_range with the
> same parameters as __vmalloc_node or provides a __weak tag to those
> functions using __vmalloc_node_range with parameters repeating those of
> __vmalloc_node.
> 
> Two benefits of this approach are (1) greater flexibility to each
> architecture for handling of virtual memory while not compromising the
> kernel's vmalloc logic and (2) more uniform use of the __vmalloc_node
> interface, reserving the more specialized __vmalloc_node_range for more
> specialized cases, such as kasan's shadow memory.

I'm not sure I understand the message. What I understand is that you 
allow architectures to override vmalloc_node().

In the code you add __weak for that. But you also add the flags to the 
parameters and I can't understand why when reading the above description.

Christophe
diff mbox series

Patch

diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c
index fe28fc1f759d..109f4f363621 100644
--- a/arch/arm/kernel/irq.c
+++ b/arch/arm/kernel/irq.c
@@ -61,7 +61,7 @@  static void __init init_irq_stacks(void)
 						       THREAD_SIZE_ORDER);
 		else
 			stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN,
-					       THREADINFO_GFP, NUMA_NO_NODE,
+					       THREADINFO_GFP, 0, NUMA_NO_NODE,
 					       __builtin_return_address(0));
 
 		if (WARN_ON(!stack))
diff --git a/arch/arm64/include/asm/vmap_stack.h b/arch/arm64/include/asm/vmap_stack.h
index 20873099c035..57a7eaa720d5 100644
--- a/arch/arm64/include/asm/vmap_stack.h
+++ b/arch/arm64/include/asm/vmap_stack.h
@@ -21,7 +21,7 @@  static inline unsigned long *arch_alloc_vmap_stack(size_t stack_size, int node)
 
 	BUILD_BUG_ON(!IS_ENABLED(CONFIG_VMAP_STACK));
 
-	p = __vmalloc_node(stack_size, THREAD_ALIGN, THREADINFO_GFP, node,
+	p = __vmalloc_node(stack_size, THREAD_ALIGN, THREADINFO_GFP, 0, node,
 			__builtin_return_address(0));
 	return kasan_reset_tag(p);
 }
diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index 0228001347be..48efa31a9161 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -205,7 +205,7 @@  static int __init arm64_efi_rt_init(void)
 		return 0;
 
 	p = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, GFP_KERNEL,
-			   NUMA_NO_NODE, &&l);
+			   0, NUMA_NO_NODE, &&l);
 l:	if (!p) {
 		pr_warn("Failed to allocate EFI runtime stack\n");
 		clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 6f7d4edaa0bc..ceb7ea07ca28 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -308,7 +308,7 @@  DEFINE_INTERRUPT_HANDLER_ASYNC(do_IRQ)
 static void *__init alloc_vm_stack(void)
 {
 	return __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, THREADINFO_GFP,
-			      NUMA_NO_NODE, (void *)_RET_IP_);
+			      0, NUMA_NO_NODE, (void *)_RET_IP_);
 }
 
 static void __init vmap_irqstack_init(void)
diff --git a/arch/riscv/include/asm/irq_stack.h b/arch/riscv/include/asm/irq_stack.h
index 6441ded3b0cf..d2410735bde0 100644
--- a/arch/riscv/include/asm/irq_stack.h
+++ b/arch/riscv/include/asm/irq_stack.h
@@ -24,7 +24,7 @@  static inline unsigned long *arch_alloc_vmap_stack(size_t stack_size, int node)
 {
 	void *p;
 
-	p = __vmalloc_node(stack_size, THREAD_ALIGN, THREADINFO_GFP, node,
+	p = __vmalloc_node(stack_size, THREAD_ALIGN, THREADINFO_GFP, 0, node,
 			__builtin_return_address(0));
 	return kasan_reset_tag(p);
 }
diff --git a/arch/s390/hypfs/hypfs_diag.c b/arch/s390/hypfs/hypfs_diag.c
index 279b7bba4d43..16359d854288 100644
--- a/arch/s390/hypfs/hypfs_diag.c
+++ b/arch/s390/hypfs/hypfs_diag.c
@@ -70,7 +70,7 @@  void *diag204_get_buffer(enum diag204_format fmt, int *pages)
 			return ERR_PTR(-EOPNOTSUPP);
 	}
 	diag204_buf = __vmalloc_node(array_size(*pages, PAGE_SIZE),
-				     PAGE_SIZE, GFP_KERNEL, NUMA_NO_NODE,
+				     PAGE_SIZE, GFP_KERNEL, 0, NUMA_NO_NODE,
 				     __builtin_return_address(0));
 	if (!diag204_buf)
 		return ERR_PTR(-ENOMEM);
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index d1f3b56e7afc..2c25b4e9f20a 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -254,7 +254,7 @@  static void __init conmode_default(void)
 		cpcmd("QUERY TERM", query_buffer, 1024, NULL);
 		ptr = strstr(query_buffer, "CONMODE");
 		/*
-		 * Set the conmode to 3215 so that the device recognition 
+		 * Set the conmode to 3215 so that the device recognition
 		 * will set the cu_type of the console to 3215. If the
 		 * conmode is 3270 and we don't set it back then both
 		 * 3215 and the 3270 driver will try to access the console
@@ -314,7 +314,7 @@  static inline void setup_zfcpdump(void) {}
 
  /*
  * Reboot, halt and power_off stubs. They just call _machine_restart,
- * _machine_halt or _machine_power_off. 
+ * _machine_halt or _machine_power_off.
  */
 
 void machine_restart(char *command)
@@ -364,7 +364,7 @@  unsigned long stack_alloc(void)
 	void *ret;
 
 	ret = __vmalloc_node(THREAD_SIZE, THREAD_SIZE, THREADINFO_GFP,
-			     NUMA_NO_NODE, __builtin_return_address(0));
+			     0, NUMA_NO_NODE, __builtin_return_address(0));
 	kmemleak_not_leak(ret);
 	return (unsigned long)ret;
 #else
diff --git a/arch/s390/kernel/sthyi.c b/arch/s390/kernel/sthyi.c
index 30bb20461db4..5bf239bcdae9 100644
--- a/arch/s390/kernel/sthyi.c
+++ b/arch/s390/kernel/sthyi.c
@@ -318,7 +318,7 @@  static void fill_diag(struct sthyi_sctns *sctns)
 		return;
 
 	diag204_buf = __vmalloc_node(array_size(pages, PAGE_SIZE),
-				     PAGE_SIZE, GFP_KERNEL, NUMA_NO_NODE,
+				     PAGE_SIZE, GFP_KERNEL, 0, NUMA_NO_NODE,
 				     __builtin_return_address(0));
 	if (!diag204_buf)
 		return;
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index c720be70c8dd..f13bd711ad7d 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -150,7 +150,8 @@  extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
 			pgprot_t prot, unsigned long vm_flags, int node,
 			const void *caller) __alloc_size(1);
 void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
-		int node, const void *caller) __alloc_size(1);
+		unsigned long vm_flags, int node, const void *caller)
+		__alloc_size(1);
 void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
 
 extern void *__vmalloc_array(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2);
@@ -295,4 +296,16 @@  bool vmalloc_dump_obj(void *object);
 static inline bool vmalloc_dump_obj(void *object) { return false; }
 #endif
 
+#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
+#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
+#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
+#define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
+#else
+/*
+ * 64b systems should always have either DMA or DMA32 zones. For others
+ * GFP_DMA32 should do the right thing and use the normal zone.
+ */
+#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
+#endif
+
 #endif /* _LINUX_VMALLOC_H */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a1f18681721c..79c11307ff40 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -303,8 +303,8 @@  static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
 			return area;
 	}
 
-	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
-			gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL,
+	return __vmalloc_node(size, align,
+			gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL,
 			flags, numa_node, __builtin_return_address(0));
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 0d944e92a43f..800bb1c76000 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -304,10 +304,8 @@  static int alloc_thread_stack_node(struct task_struct *tsk, int node)
 	 * so memcg accounting is performed manually on assigning/releasing
 	 * stacks to tasks. Drop __GFP_ACCOUNT.
 	 */
-	stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
-				     VMALLOC_START, VMALLOC_END,
+	stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN,
 				     THREADINFO_GFP & ~__GFP_ACCOUNT,
-				     PAGE_KERNEL,
 				     0, node, __builtin_return_address(0));
 	if (!stack)
 		return -ENOMEM;
diff --git a/kernel/scs.c b/kernel/scs.c
index d7809affe740..5b89fb08a392 100644
--- a/kernel/scs.c
+++ b/kernel/scs.c
@@ -43,8 +43,7 @@  static void *__scs_alloc(int node)
 		}
 	}
 
-	s = __vmalloc_node_range(SCS_SIZE, 1, VMALLOC_START, VMALLOC_END,
-				    GFP_SCS, PAGE_KERNEL, 0, node,
+	s = __vmalloc_node(SCS_SIZE, 1, GFP_SCS, 0, node,
 				    __builtin_return_address(0));
 
 out:
diff --git a/lib/objpool.c b/lib/objpool.c
index cfdc02420884..f0acd421a652 100644
--- a/lib/objpool.c
+++ b/lib/objpool.c
@@ -80,7 +80,7 @@  objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs,
 			slot = kmalloc_node(size, pool->gfp, cpu_to_node(i));
 		else
 			slot = __vmalloc_node(size, sizeof(void *), pool->gfp,
-				cpu_to_node(i), __builtin_return_address(0));
+				0, cpu_to_node(i), __builtin_return_address(0));
 		if (!slot)
 			return -ENOMEM;
 		memset(slot, 0, size);
diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c
index 3718d9886407..6bde73f892f9 100644
--- a/lib/test_vmalloc.c
+++ b/lib/test_vmalloc.c
@@ -97,7 +97,7 @@  static int random_size_align_alloc_test(void)
 		size = ((rnd % 10) + 1) * PAGE_SIZE;
 
 		ptr = __vmalloc_node(size, align, GFP_KERNEL | __GFP_ZERO, 0,
-				__builtin_return_address(0));
+				0, __builtin_return_address(0));
 		if (!ptr)
 			return -1;
 
@@ -120,7 +120,7 @@  static int align_shift_alloc_test(void)
 		align = ((unsigned long) 1) << i;
 
 		ptr = __vmalloc_node(PAGE_SIZE, align, GFP_KERNEL|__GFP_ZERO, 0,
-				__builtin_return_address(0));
+				0, __builtin_return_address(0));
 		if (!ptr)
 			return -1;
 
@@ -138,7 +138,7 @@  static int fix_align_alloc_test(void)
 	for (i = 0; i < test_loop_count; i++) {
 		ptr = __vmalloc_node(5 * PAGE_SIZE, THREAD_ALIGN << 1,
 				GFP_KERNEL | __GFP_ZERO, 0,
-				__builtin_return_address(0));
+				0, __builtin_return_address(0));
 		if (!ptr)
 			return -1;
 
diff --git a/mm/util.c b/mm/util.c
index 5a6a9802583b..c6b7111215e2 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -639,8 +639,7 @@  void *kvmalloc_node(size_t size, gfp_t flags, int node)
 	 * about the resulting pointer, and cannot play
 	 * protection games.
 	 */
-	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
-			flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
+	return __vmalloc_node(size, 1, flags, VM_ALLOW_HUGE_VMAP,
 			node, __builtin_return_address(0));
 }
 EXPORT_SYMBOL(kvmalloc_node);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d12a17fc0c17..18ece28e79d3 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3119,7 +3119,7 @@  static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 
 	/* Please note that the recursion is strictly bounded. */
 	if (array_size > PAGE_SIZE) {
-		area->pages = __vmalloc_node(array_size, 1, nested_gfp, node,
+		area->pages = __vmalloc_node(array_size, 1, nested_gfp, 0, node,
 					area->caller);
 	} else {
 		area->pages = kmalloc_node(array_size, nested_gfp, node);
@@ -3379,11 +3379,12 @@  void *__vmalloc_node_range(unsigned long size, unsigned long align,
  *
  * Return: pointer to the allocated memory or %NULL on error
  */
-void *__vmalloc_node(unsigned long size, unsigned long align,
-			    gfp_t gfp_mask, int node, const void *caller)
+__weak void *__vmalloc_node(unsigned long size, unsigned long align,
+			    gfp_t gfp_mask, unsigned long vm_flags, int node,
+			    const void *caller)
 {
 	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
-				gfp_mask, PAGE_KERNEL, 0, node, caller);
+				gfp_mask, PAGE_KERNEL, vm_flags, node, caller);
 }
 /*
  * This is only for performance analysis of vmalloc and stress purpose.
@@ -3396,7 +3397,7 @@  EXPORT_SYMBOL_GPL(__vmalloc_node);
 
 void *__vmalloc(unsigned long size, gfp_t gfp_mask)
 {
-	return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE,
+	return __vmalloc_node(size, 1, gfp_mask, 0, NUMA_NO_NODE,
 				__builtin_return_address(0));
 }
 EXPORT_SYMBOL(__vmalloc);
@@ -3415,7 +3416,7 @@  EXPORT_SYMBOL(__vmalloc);
  */
 void *vmalloc(unsigned long size)
 {
-	return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE,
+	return __vmalloc_node(size, 1, GFP_KERNEL, 0, NUMA_NO_NODE,
 				__builtin_return_address(0));
 }
 EXPORT_SYMBOL(vmalloc);
@@ -3432,7 +3433,7 @@  EXPORT_SYMBOL(vmalloc);
  *
  * Return: pointer to the allocated memory or %NULL on error
  */
-void *vmalloc_huge(unsigned long size, gfp_t gfp_mask)
+__weak void *vmalloc_huge(unsigned long size, gfp_t gfp_mask)
 {
 	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
 				    gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
@@ -3455,7 +3456,7 @@  EXPORT_SYMBOL_GPL(vmalloc_huge);
  */
 void *vzalloc(unsigned long size)
 {
-	return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
+	return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, 0, NUMA_NO_NODE,
 				__builtin_return_address(0));
 }
 EXPORT_SYMBOL(vzalloc);
@@ -3469,7 +3470,7 @@  EXPORT_SYMBOL(vzalloc);
  *
  * Return: pointer to the allocated memory or %NULL on error
  */
-void *vmalloc_user(unsigned long size)
+__weak void *vmalloc_user(unsigned long size)
 {
 	return __vmalloc_node_range(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
 				    GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
@@ -3493,7 +3494,7 @@  EXPORT_SYMBOL(vmalloc_user);
  */
 void *vmalloc_node(unsigned long size, int node)
 {
-	return __vmalloc_node(size, 1, GFP_KERNEL, node,
+	return __vmalloc_node(size, 1, GFP_KERNEL, 0, node,
 			__builtin_return_address(0));
 }
 EXPORT_SYMBOL(vmalloc_node);
@@ -3511,23 +3512,11 @@  EXPORT_SYMBOL(vmalloc_node);
  */
 void *vzalloc_node(unsigned long size, int node)
 {
-	return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node,
+	return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, 0, node,
 				__builtin_return_address(0));
 }
 EXPORT_SYMBOL(vzalloc_node);
 
-#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
-#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
-#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
-#define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
-#else
-/*
- * 64b systems should always have either DMA or DMA32 zones. For others
- * GFP_DMA32 should do the right thing and use the normal zone.
- */
-#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
-#endif
-
 /**
  * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
  * @size:	allocation size
@@ -3539,7 +3528,7 @@  EXPORT_SYMBOL(vzalloc_node);
  */
 void *vmalloc_32(unsigned long size)
 {
-	return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
+	return __vmalloc_node(size, 1, GFP_VMALLOC32, 0, NUMA_NO_NODE,
 			__builtin_return_address(0));
 }
 EXPORT_SYMBOL(vmalloc_32);
@@ -3553,7 +3542,7 @@  EXPORT_SYMBOL(vmalloc_32);
  *
  * Return: pointer to the allocated memory or %NULL on error
  */
-void *vmalloc_32_user(unsigned long size)
+__weak void *vmalloc_32_user(unsigned long size)
 {
 	return __vmalloc_node_range(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
 				    GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,