
[v6,2/3] LoongArch: vDSO: Wire up getrandom() vDSO implementation

Message ID 20240901061315.15693-3-xry111@xry111.site
State Superseded
Series LoongArch: Implement getrandom() in vDSO

Commit Message

Xi Ruoyao Sept. 1, 2024, 6:13 a.m. UTC
Hook up the generic vDSO implementation to the LoongArch vDSO data page
by providing the required __arch_chacha20_blocks_nostack,
__arch_get_k_vdso_rng_data, and getrandom_syscall implementations.

Signed-off-by: Xi Ruoyao <xry111@xry111.site>
---
 arch/loongarch/Kconfig                      |   1 +
 arch/loongarch/include/asm/vdso/getrandom.h |  38 +++
 arch/loongarch/include/asm/vdso/vdso.h      |   6 +
 arch/loongarch/include/asm/vdso/vsyscall.h  |   8 +
 arch/loongarch/kernel/vdso.c                |   1 +
 arch/loongarch/vdso/Makefile                |   7 +-
 arch/loongarch/vdso/vdso.lds.S              |   1 +
 arch/loongarch/vdso/vgetrandom-chacha.S     | 242 ++++++++++++++++++++
 arch/loongarch/vdso/vgetrandom.c            |  10 +
 9 files changed, 313 insertions(+), 1 deletion(-)
 create mode 100644 arch/loongarch/include/asm/vdso/getrandom.h
 create mode 100644 arch/loongarch/vdso/vgetrandom-chacha.S
 create mode 100644 arch/loongarch/vdso/vgetrandom.c

Comments

Christophe Leroy Sept. 19, 2024, 7:08 a.m. UTC | #1
Hi Xi,

On 01/09/2024 at 08:13, Xi Ruoyao wrote:
> Hook up the generic vDSO implementation to the LoongArch vDSO data page
> by providing the required __arch_chacha20_blocks_nostack,
> __arch_get_k_vdso_rng_data, and getrandom_syscall implementations.
> 
> Signed-off-by: Xi Ruoyao <xry111@xry111.site>
> ---

...

> diff --git a/arch/loongarch/vdso/vgetrandom-chacha.S b/arch/loongarch/vdso/vgetrandom-chacha.S
> new file mode 100644
> index 000000000000..7e86a50f6e85
> --- /dev/null
> +++ b/arch/loongarch/vdso/vgetrandom-chacha.S
> @@ -0,0 +1,242 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2024 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
> + */
> +
> +#include <asm/asm.h>
> +#include <asm/regdef.h>
> +#include <linux/linkage.h>
> +
> +.text
> +
> +/* ChaCha quarter-round */
> +.macro	QR	a b c d
> +	add.w		\a, \a, \b
> +	xor		\d, \d, \a
> +	rotri.w		\d, \d, 16
> +
> +	add.w		\c, \c, \d
> +	xor		\b, \b, \c
> +	rotri.w		\b, \b, 20
> +
> +	add.w		\a, \a, \b
> +	xor		\d, \d, \a
> +	rotri.w		\d, \d, 24
> +
> +	add.w		\c, \c, \d
> +	xor		\b, \b, \c
> +	rotri.w		\b, \b, 25
> +.endm
> +

I know nothing about LoongArch assembly and execution performance, but I
see that GCC groups operations by 4 when building
reference_chacha20_blocks() from vdso_test_chacha; see the disassembly below.

Shouldn't you do the same and group the rounds by 4, just like I did on
powerpc?
(https://github.com/torvalds/linux/blob/master/arch/powerpc/kernel/vdso/vgetrandom-chacha.S)

0000000000000134 <.L3>:
  134:	001061d8 	add.w       	$s1, $t2, $s1
  138:	0015c312 	xor         	$t6, $s1, $t4
  13c:	26000070 	ldptr.d     	$t4, $sp, 0
  140:	001036d6 	add.w       	$fp, $fp, $t1
  144:	001065f9 	add.w       	$s2, $t3, $s2
  148:	0010335a 	add.w       	$s3, $s3, $t0
  14c:	00159ad3 	xor         	$t7, $fp, $a2
  150:	0015c344 	xor         	$a0, $s3, $t4
  154:	0015c731 	xor         	$t5, $s2, $t5
  158:	004cc273 	rotri.w     	$t7, $t7, 0x10
  15c:	004cc252 	rotri.w     	$t6, $t6, 0x10
  160:	004cc231 	rotri.w     	$t5, $t5, 0x10
  164:	004cc084 	rotri.w     	$a0, $a0, 0x10
  168:	00104766 	add.w       	$a2, $s4, $t5
  16c:	00102088 	add.w       	$a4, $a0, $a4
  170:	00102669 	add.w       	$a5, $t7, $a5
  174:	001048e7 	add.w       	$a3, $a3, $t6
  178:	0015b530 	xor         	$t4, $a5, $t1
  17c:	0015b10c 	xor         	$t0, $a4, $t0
  180:	0015b8ee 	xor         	$t2, $a3, $t2
  184:	0015bccf 	xor         	$t3, $a2, $t3
  188:	004cd18d 	rotri.w     	$t1, $t0, 0x14
  18c:	004cd210 	rotri.w     	$t4, $t4, 0x14
  190:	004cd1ce 	rotri.w     	$t2, $t2, 0x14
  194:	004cd1ef 	rotri.w     	$t3, $t3, 0x14
  198:	001042d6 	add.w       	$fp, $fp, $t4
  19c:	00103b18 	add.w       	$s1, $s1, $t2
  1a0:	00103f39 	add.w       	$s2, $s2, $t3
  1a4:	0010375a 	add.w       	$s3, $s3, $t1
  1a8:	0015ced3 	xor         	$t7, $fp, $t7
  1ac:	0015cb12 	xor         	$t6, $s1, $t6
  1b0:	0015c731 	xor         	$t5, $s2, $t5
  1b4:	00159344 	xor         	$a0, $s3, $a0
  1b8:	004ce274 	rotri.w     	$t8, $t7, 0x18
  1bc:	004ce084 	rotri.w     	$a0, $a0, 0x18
  1c0:	004ce253 	rotri.w     	$t7, $t6, 0x18
  1c4:	004ce232 	rotri.w     	$t6, $t5, 0x18
  1c8:	00105129 	add.w       	$a5, $a5, $t8
  1cc:	00101111 	add.w       	$t5, $a4, $a0
  1d0:	00104ce7 	add.w       	$a3, $a3, $t7
  1d4:	001048c6 	add.w       	$a2, $a2, $t6
  1d8:	0015c130 	xor         	$t4, $a5, $t4
  1dc:	0015b8ee 	xor         	$t2, $a3, $t2
  1e0:	0015bccf 	xor         	$t3, $a2, $t3
  1e4:	0015b62d 	xor         	$t1, $t5, $t1
  1e8:	004ce610 	rotri.w     	$t4, $t4, 0x19
  1ec:	004ce5ce 	rotri.w     	$t2, $t2, 0x19
  1f0:	004ce5ef 	rotri.w     	$t3, $t3, 0x19
  1f4:	004ce5ad 	rotri.w     	$t1, $t1, 0x19
  1f8:	00103ad6 	add.w       	$fp, $fp, $t2
  1fc:	00103f18 	add.w       	$s1, $s1, $t3
  200:	00103739 	add.w       	$s2, $s2, $t1
  204:	0010435a 	add.w       	$s3, $s3, $t4
  208:	001592c4 	xor         	$a0, $fp, $a0
  20c:	0015d314 	xor         	$t8, $s1, $t8
  210:	0015cf33 	xor         	$t7, $s2, $t7
  214:	0015cb52 	xor         	$t6, $s3, $t6
  218:	004cc084 	rotri.w     	$a0, $a0, 0x10
  21c:	004cc294 	rotri.w     	$t8, $t8, 0x10
  220:	004cc273 	rotri.w     	$t7, $t7, 0x10
  224:	004cc252 	rotri.w     	$t6, $t6, 0x10
  228:	001010dc 	add.w       	$s5, $a2, $a0
  22c:	0010523d 	add.w       	$s6, $t5, $t8
  230:	00104d3e 	add.w       	$s7, $a5, $t7
  234:	001048ff 	add.w       	$s8, $a3, $t6
  238:	0015c3ec 	xor         	$t0, $s8, $t4
  23c:	0015bb8e 	xor         	$t2, $s5, $t2
  240:	0015bfaf 	xor         	$t3, $s6, $t3
  244:	0015b7cd 	xor         	$t1, $s7, $t1
  248:	004cd1ad 	rotri.w     	$t1, $t1, 0x14
  24c:	004cd18c 	rotri.w     	$t0, $t0, 0x14
  250:	004cd1ce 	rotri.w     	$t2, $t2, 0x14
  254:	004cd1ef 	rotri.w     	$t3, $t3, 0x14
  258:	00103ad7 	add.w       	$s0, $fp, $t2
  25c:	00103f0a 	add.w       	$a6, $s1, $t3
  260:	0010372b 	add.w       	$a7, $s2, $t1
  264:	00103341 	add.w       	$ra, $s3, $t0
  268:	001592e4 	xor         	$a0, $s0, $a0
  26c:	0015d154 	xor         	$t8, $a6, $t8
  270:	0015cd73 	xor         	$t7, $a7, $t7
  274:	0015c832 	xor         	$t6, $ra, $t6
  278:	004ce084 	rotri.w     	$a0, $a0, 0x18
  27c:	004ce294 	rotri.w     	$t8, $t8, 0x18
  280:	004ce273 	rotri.w     	$t7, $t7, 0x18
  284:	004ce252 	rotri.w     	$t6, $t6, 0x18
  288:	0010139c 	add.w       	$s5, $s5, $a0
  28c:	001053bd 	add.w       	$s6, $s6, $t8
  290:	00104fde 	add.w       	$s7, $s7, $t7
  294:	00104bff 	add.w       	$s8, $s8, $t6
  298:	0015b7d1 	xor         	$t5, $s7, $t1
  29c:	0015bb8e 	xor         	$t2, $s5, $t2
  2a0:	0015b3ed 	xor         	$t1, $s8, $t0
  2a4:	0015bfaf 	xor         	$t3, $s6, $t3
  2a8:	0040808c 	slli.w      	$t0, $a0, 0x0
  2ac:	004ce631 	rotri.w     	$t5, $t5, 0x19
  2b0:	004ce5ce 	rotri.w     	$t2, $t2, 0x19
  2b4:	004ce5ef 	rotri.w     	$t3, $t3, 0x19
  2b8:	004ce5ad 	rotri.w     	$t1, $t1, 0x19
  2bc:	2700006c 	stptr.d     	$t0, $sp, 0
  2c0:	02bffca5 	addi.w      	$a1, $a1, -1(0xfff)
  2c4:	0040822c 	slli.w      	$t0, $t5, 0x0
  2c8:	004082f6 	slli.w      	$fp, $s0, 0x0
  2cc:	0040839b 	slli.w      	$s4, $s5, 0x0
  2d0:	004081ce 	slli.w      	$t2, $t2, 0x0
  2d4:	00408158 	slli.w      	$s1, $a6, 0x0
  2d8:	00408286 	slli.w      	$a2, $t8, 0x0
  2dc:	004083a8 	slli.w      	$a4, $s6, 0x0
  2e0:	004081ef 	slli.w      	$t3, $t3, 0x0
  2e4:	00408179 	slli.w      	$s2, $a7, 0x0
  2e8:	00408270 	slli.w      	$t4, $t7, 0x0
  2ec:	004083c9 	slli.w      	$a5, $s7, 0x0
  2f0:	0040803a 	slli.w      	$s3, $ra, 0x0
  2f4:	00408251 	slli.w      	$t5, $t6, 0x0
  2f8:	004083e7 	slli.w      	$a3, $s8, 0x0
  2fc:	004081ad 	slli.w      	$t1, $t1, 0x0
  300:	47fe34bf 	bnez        	$a1, -460(0x7ffe34)	# 134 <.L3>

Christophe
Xi Ruoyao Sept. 19, 2024, 8:31 a.m. UTC | #2
On Thu, 2024-09-19 at 09:08 +0200, Christophe Leroy wrote:
> I know nothing about LoongArch assembly and execution performance, but I
> see that GCC groups operations by 4 when building
> reference_chacha20_blocks() from vdso_test_chacha; see the disassembly below.
> 
> Shouldn't you do the same and group the rounds by 4, just like I did on
> powerpc?
> (https://github.com/torvalds/linux/blob/master/arch/powerpc/kernel/vdso/vgetrandom-chacha.S)

Maybe.  In theory the scheduling would improve performance.  I'll
measure whether it makes an observable difference in practice.
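
For reference, the kind of grouping Christophe describes (issuing each step of
four independent quarter-rounds back to back, as the powerpc quarterround4
macro does) could look roughly like the sketch below on LoongArch.  The QR4
macro and its parameter names are illustrative only, not part of the submitted
patch, and it assumes the state0..state15 register aliases that the patch
defines.

/*
 * Sketch only: a 4-way interleaved variant of the patch's QR macro.
 * Each step is issued for all four quarter-rounds before moving on to
 * the next, so the four independent dependency chains sit next to each
 * other and a multi-issue core can overlap them.
 */
.macro	QR4	qa0 qb0 qc0 qd0 qa1 qb1 qc1 qd1 qa2 qb2 qc2 qd2 qa3 qb3 qc3 qd3
	/* a += b; d ^= a; d = rotr(d, 16), for all four quarter-rounds */
	add.w		\qa0, \qa0, \qb0
	add.w		\qa1, \qa1, \qb1
	add.w		\qa2, \qa2, \qb2
	add.w		\qa3, \qa3, \qb3
	xor		\qd0, \qd0, \qa0
	xor		\qd1, \qd1, \qa1
	xor		\qd2, \qd2, \qa2
	xor		\qd3, \qd3, \qa3
	rotri.w		\qd0, \qd0, 16
	rotri.w		\qd1, \qd1, 16
	rotri.w		\qd2, \qd2, 16
	rotri.w		\qd3, \qd3, 16

	/* c += d; b ^= c; b = rotr(b, 20) */
	add.w		\qc0, \qc0, \qd0
	add.w		\qc1, \qc1, \qd1
	add.w		\qc2, \qc2, \qd2
	add.w		\qc3, \qc3, \qd3
	xor		\qb0, \qb0, \qc0
	xor		\qb1, \qb1, \qc1
	xor		\qb2, \qb2, \qc2
	xor		\qb3, \qb3, \qc3
	rotri.w		\qb0, \qb0, 20
	rotri.w		\qb1, \qb1, 20
	rotri.w		\qb2, \qb2, 20
	rotri.w		\qb3, \qb3, 20

	/* a += b; d ^= a; d = rotr(d, 24) */
	add.w		\qa0, \qa0, \qb0
	add.w		\qa1, \qa1, \qb1
	add.w		\qa2, \qa2, \qb2
	add.w		\qa3, \qa3, \qb3
	xor		\qd0, \qd0, \qa0
	xor		\qd1, \qd1, \qa1
	xor		\qd2, \qd2, \qa2
	xor		\qd3, \qd3, \qa3
	rotri.w		\qd0, \qd0, 24
	rotri.w		\qd1, \qd1, 24
	rotri.w		\qd2, \qd2, 24
	rotri.w		\qd3, \qd3, 24

	/* c += d; b ^= c; b = rotr(b, 25) */
	add.w		\qc0, \qc0, \qd0
	add.w		\qc1, \qc1, \qd1
	add.w		\qc2, \qc2, \qd2
	add.w		\qc3, \qc3, \qd3
	xor		\qb0, \qb0, \qc0
	xor		\qb1, \qb1, \qc1
	xor		\qb2, \qb2, \qc2
	xor		\qb3, \qb3, \qc3
	rotri.w		\qb0, \qb0, 25
	rotri.w		\qb1, \qb1, 25
	rotri.w		\qb2, \qb2, 25
	rotri.w		\qb3, \qb3, 25
.endm

	/* odd round: the four column quarter-rounds in one invocation */
	QR4	state0, state4, state8, state12, state1, state5, state9, state13, state2, state6, state10, state14, state3, state7, state11, state15
	/* even round: the four diagonal quarter-rounds */
	QR4	state0, state5, state10, state15, state1, state6, state11, state12, state2, state7, state8, state13, state3, state4, state9, state14

The four quarter-rounds within a round touch disjoint registers, so this
reordering produces bit-identical output; whether it actually runs faster on
real cores is what the measurement Xi mentions would show.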

Patch

diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 70f169210b52..14821c2aba5b 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -190,6 +190,7 @@  config LOONGARCH
 	select TRACE_IRQFLAGS_SUPPORT
 	select USE_PERCPU_NUMA_NODE_ID
 	select USER_STACKTRACE_SUPPORT
+	select VDSO_GETRANDOM
 	select ZONE_DMA32
 
 config 32BIT
diff --git a/arch/loongarch/include/asm/vdso/getrandom.h b/arch/loongarch/include/asm/vdso/getrandom.h
new file mode 100644
index 000000000000..f2d17daec1e2
--- /dev/null
+++ b/arch/loongarch/include/asm/vdso/getrandom.h
@@ -0,0 +1,38 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2024 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
+ */
+#ifndef __ASM_VDSO_GETRANDOM_H
+#define __ASM_VDSO_GETRANDOM_H
+
+#ifndef __ASSEMBLY__
+
+#include <asm/unistd.h>
+#include <asm/vdso/vdso.h>
+
+static __always_inline ssize_t getrandom_syscall(void *_buffer, size_t _len, unsigned int _flags)
+{
+	register long ret asm("a0");
+	register long nr asm("a7") = __NR_getrandom;
+	register void *buffer asm("a0") = _buffer;
+	register size_t len asm("a1") = _len;
+	register unsigned int flags asm("a2") = _flags;
+
+	asm volatile(
+	"      syscall 0\n"
+	: "+r" (ret)
+	: "r" (nr), "r" (buffer), "r" (len), "r" (flags)
+	: "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t8",
+	  "memory");
+
+	return ret;
+}
+
+static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void)
+{
+	return (const struct vdso_rng_data *)(get_vdso_data() + VVAR_LOONGARCH_PAGES_START * PAGE_SIZE + offsetof(struct loongarch_vdso_data, rng_data));
+}
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __ASM_VDSO_GETRANDOM_H */
diff --git a/arch/loongarch/include/asm/vdso/vdso.h b/arch/loongarch/include/asm/vdso/vdso.h
index 5a12309d9fb5..e31ac7474513 100644
--- a/arch/loongarch/include/asm/vdso/vdso.h
+++ b/arch/loongarch/include/asm/vdso/vdso.h
@@ -4,6 +4,9 @@ 
  * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
  */
 
+#ifndef _ASM_VDSO_VDSO_H
+#define _ASM_VDSO_VDSO_H
+
 #ifndef __ASSEMBLY__
 
 #include <asm/asm.h>
@@ -16,6 +19,7 @@  struct vdso_pcpu_data {
 
 struct loongarch_vdso_data {
 	struct vdso_pcpu_data pdata[NR_CPUS];
+	struct vdso_rng_data rng_data;
 };
 
 /*
@@ -63,3 +67,5 @@  static inline unsigned long get_vdso_data(void)
 }
 
 #endif /* __ASSEMBLY__ */
+
+#endif
diff --git a/arch/loongarch/include/asm/vdso/vsyscall.h b/arch/loongarch/include/asm/vdso/vsyscall.h
index 5de615383a22..b1273ce6f140 100644
--- a/arch/loongarch/include/asm/vdso/vsyscall.h
+++ b/arch/loongarch/include/asm/vdso/vsyscall.h
@@ -8,6 +8,7 @@ 
 #include <vdso/datapage.h>
 
 extern struct vdso_data *vdso_data;
+extern struct vdso_rng_data *vdso_rng_data;
 
 /*
  * Update the vDSO data page to keep in sync with kernel timekeeping.
@@ -19,6 +20,13 @@  struct vdso_data *__loongarch_get_k_vdso_data(void)
 }
 #define __arch_get_k_vdso_data __loongarch_get_k_vdso_data
 
+static __always_inline
+struct vdso_rng_data *__loongarch_get_k_vdso_rng_data(void)
+{
+	return vdso_rng_data;
+}
+#define __arch_get_k_vdso_rng_data __loongarch_get_k_vdso_rng_data
+
 /* The asm-generic header needs to be included after the definitions above */
 #include <asm-generic/vdso/vsyscall.h>
 
diff --git a/arch/loongarch/kernel/vdso.c b/arch/loongarch/kernel/vdso.c
index 90dfccb41c14..f6fcc52aefae 100644
--- a/arch/loongarch/kernel/vdso.c
+++ b/arch/loongarch/kernel/vdso.c
@@ -37,6 +37,7 @@  static union {
 static struct page *vdso_pages[] = { NULL };
 struct vdso_data *vdso_data = generic_vdso_data.data;
 struct vdso_pcpu_data *vdso_pdata = loongarch_vdso_data.vdata.pdata;
+struct vdso_rng_data *vdso_rng_data = &loongarch_vdso_data.vdata.rng_data;
 
 static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
 {
diff --git a/arch/loongarch/vdso/Makefile b/arch/loongarch/vdso/Makefile
index d724d46b07c8..40c1175823d6 100644
--- a/arch/loongarch/vdso/Makefile
+++ b/arch/loongarch/vdso/Makefile
@@ -4,7 +4,8 @@ 
 # Include the generic Makefile to check the built vdso.
 include $(srctree)/lib/vdso/Makefile
 
-obj-vdso-y := elf.o vgetcpu.o vgettimeofday.o sigreturn.o
+obj-vdso-y := elf.o vgetcpu.o vgettimeofday.o vgetrandom.o \
+              vgetrandom-chacha.o sigreturn.o
 
 # Common compiler flags between ABIs.
 ccflags-vdso := \
@@ -29,6 +30,10 @@  ifneq ($(c-gettimeofday-y),)
   CFLAGS_vgettimeofday.o += -include $(c-gettimeofday-y)
 endif
 
+ifneq ($(c-getrandom-y),)
+  CFLAGS_vgetrandom.o += -include $(c-getrandom-y)
+endif
+
 # VDSO linker flags.
 ldflags-y := -Bsymbolic --no-undefined -soname=linux-vdso.so.1 \
 	$(filter -E%,$(KBUILD_CFLAGS)) -nostdlib -shared \
diff --git a/arch/loongarch/vdso/vdso.lds.S b/arch/loongarch/vdso/vdso.lds.S
index 56ad855896de..6b441bde4026 100644
--- a/arch/loongarch/vdso/vdso.lds.S
+++ b/arch/loongarch/vdso/vdso.lds.S
@@ -62,6 +62,7 @@  VERSION
 		__vdso_clock_getres;
 		__vdso_clock_gettime;
 		__vdso_gettimeofday;
+		__vdso_getrandom;
 		__vdso_rt_sigreturn;
 	local: *;
 	};
diff --git a/arch/loongarch/vdso/vgetrandom-chacha.S b/arch/loongarch/vdso/vgetrandom-chacha.S
new file mode 100644
index 000000000000..7e86a50f6e85
--- /dev/null
+++ b/arch/loongarch/vdso/vgetrandom-chacha.S
@@ -0,0 +1,242 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
+ */
+
+#include <asm/asm.h>
+#include <asm/regdef.h>
+#include <linux/linkage.h>
+
+.text
+
+/* ChaCha quarter-round */
+.macro	QR	a b c d
+	add.w		\a, \a, \b
+	xor		\d, \d, \a
+	rotri.w		\d, \d, 16
+
+	add.w		\c, \c, \d
+	xor		\b, \b, \c
+	rotri.w		\b, \b, 20
+
+	add.w		\a, \a, \b
+	xor		\d, \d, \a
+	rotri.w		\d, \d, 24
+
+	add.w		\c, \c, \d
+	xor		\b, \b, \c
+	rotri.w		\b, \b, 25
+.endm
+
+/*
+ * Very basic LoongArch implementation of ChaCha20. Produces a given positive
+ * number of blocks of output with a nonce of 0, taking an input key and
+ * 8-byte counter. Importantly does not spill to the stack. Its arguments
+ * are:
+ *
+ *	a0: output bytes
+ *	a1: 32-byte key input
+ *	a2: 8-byte counter input/output
+ *	a3: number of 64-byte blocks to write to output
+ */
+SYM_FUNC_START(__arch_chacha20_blocks_nostack)
+
+/* We don't need a frame pointer */
+#define s9		fp
+
+#define output		a0
+#define key		a1
+#define counter		a2
+#define nblocks		a3
+#define i		a4
+#define state0		s0
+#define state1		s1
+#define state2		s2
+#define state3		s3
+#define state4		s4
+#define state5		s5
+#define state6		s6
+#define state7		s7
+#define state8		s8
+#define state9		s9
+#define state10		a5
+#define state11		a6
+#define state12		a7
+#define state13		t0
+#define state14		t1
+#define state15		t2
+#define cnt_lo		t3
+#define cnt_hi		t4
+#define copy0		t5
+#define copy1		t6
+#define copy2		t7
+
+/* Reuse i as copy3 */
+#define copy3		i
+
+	/*
+	 * The ABI requires s0-s9 to be saved and sp to be 16-byte aligned.
+	 * This does not violate the stack-less requirement: no sensitive data
+	 * is spilled onto the stack.
+	 */
+	PTR_ADDI	sp, sp, (-SZREG * 10) & STACK_ALIGN
+	REG_S		s0, sp, 0
+	REG_S		s1, sp, SZREG
+	REG_S		s2, sp, SZREG * 2
+	REG_S		s3, sp, SZREG * 3
+	REG_S		s4, sp, SZREG * 4
+	REG_S		s5, sp, SZREG * 5
+	REG_S		s6, sp, SZREG * 6
+	REG_S		s7, sp, SZREG * 7
+	REG_S		s8, sp, SZREG * 8
+	REG_S		s9, sp, SZREG * 9
+
+	li.w		copy0, 0x61707865
+	li.w		copy1, 0x3320646e
+	li.w		copy2, 0x79622d32
+
+	ld.w		cnt_lo, counter, 0
+	ld.w		cnt_hi, counter, 4
+
+.Lblock:
+	/* state[0,1,2,3] = "expand 32-byte k" */
+	move		state0, copy0
+	move		state1, copy1
+	move		state2, copy2
+	li.w		state3, 0x6b206574
+
+	/* state[4,5,..,11] = key */
+	ld.w		state4, key, 0
+	ld.w		state5, key, 4
+	ld.w		state6, key, 8
+	ld.w		state7, key, 12
+	ld.w		state8, key, 16
+	ld.w		state9, key, 20
+	ld.w		state10, key, 24
+	ld.w		state11, key, 28
+
+	/* state[12,13] = counter */
+	move		state12, cnt_lo
+	move		state13, cnt_hi
+
+	/* state[14,15] = 0 */
+	move		state14, zero
+	move		state15, zero
+
+	li.w		i, 10
+.Lpermute:
+	/* odd round */
+	QR		state0, state4, state8, state12
+	QR		state1, state5, state9, state13
+	QR		state2, state6, state10, state14
+	QR		state3, state7, state11, state15
+
+	/* even round */
+	QR		state0, state5, state10, state15
+	QR		state1, state6, state11, state12
+	QR		state2, state7, state8, state13
+	QR		state3, state4, state9, state14
+
+	addi.w		i, i, -1
+	bnez		i, .Lpermute
+
+	/*
+	 * copy[3] = "te k", materialize it here because copy[3] shares the
+	 * same register with i, which just became dead.
+	 */
+	li.w		copy3, 0x6b206574
+
+	/* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
+	add.w		state0, state0, copy0
+	add.w		state1, state1, copy1
+	add.w		state2, state2, copy2
+	add.w		state3, state3, copy3
+	st.w		state0, output, 0
+	st.w		state1, output, 4
+	st.w		state2, output, 8
+	st.w		state3, output, 12
+
+	/* from now on state[0,1,2,3] are scratch registers  */
+
+	/* state[0,1,2,3] = lo32(key) */
+	ld.w		state0, key, 0
+	ld.w		state1, key, 4
+	ld.w		state2, key, 8
+	ld.w		state3, key, 12
+
+	/* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
+	add.w		state4, state4, state0
+	add.w		state5, state5, state1
+	add.w		state6, state6, state2
+	add.w		state7, state7, state3
+	st.w		state4, output, 16
+	st.w		state5, output, 20
+	st.w		state6, output, 24
+	st.w		state7, output, 28
+
+	/* state[0,1,2,3] = hi32(key) */
+	ld.w		state0, key, 16
+	ld.w		state1, key, 20
+	ld.w		state2, key, 24
+	ld.w		state3, key, 28
+
+	/* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */
+	add.w		state8, state8, state0
+	add.w		state9, state9, state1
+	add.w		state10, state10, state2
+	add.w		state11, state11, state3
+	st.w		state8, output, 32
+	st.w		state9, output, 36
+	st.w		state10, output, 40
+	st.w		state11, output, 44
+
+	/* output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0] */
+	add.w		state12, state12, cnt_lo
+	add.w		state13, state13, cnt_hi
+	st.w		state12, output, 48
+	st.w		state13, output, 52
+	st.w		state14, output, 56
+	st.w		state15, output, 60
+
+	/* ++counter  */
+	addi.w		cnt_lo, cnt_lo, 1
+	sltui		state0, cnt_lo, 1
+	add.w		cnt_hi, cnt_hi, state0
+
+	/* output += 64 */
+	PTR_ADDI	output, output, 64
+	/* --nblocks */
+	PTR_ADDI	nblocks, nblocks, -1
+	bnez		nblocks, .Lblock
+
+	/* counter = [cnt_lo, cnt_hi] */
+	st.w		cnt_lo, counter, 0
+	st.w		cnt_hi, counter, 4
+
+	/*
+	 * Zero out the potentially sensitive regs, in case nothing uses these
+	 * again. Right now copy[0,1,2,3] just contains "expand 32-byte k" and
+	 * state[0,...,9] are s0-s9, which we'll restore in the epilogue anyway,
+	 * so we only need to zero state[10,...,15].
+	 */
+	move		state10, zero
+	move		state11, zero
+	move		state12, zero
+	move		state13, zero
+	move		state14, zero
+	move		state15, zero
+
+	REG_L		s0, sp, 0
+	REG_L		s1, sp, SZREG
+	REG_L		s2, sp, SZREG * 2
+	REG_L		s3, sp, SZREG * 3
+	REG_L		s4, sp, SZREG * 4
+	REG_L		s5, sp, SZREG * 5
+	REG_L		s6, sp, SZREG * 6
+	REG_L		s7, sp, SZREG * 7
+	REG_L		s8, sp, SZREG * 8
+	REG_L		s9, sp, SZREG * 9
+	PTR_ADDI	sp, sp, -((-SZREG * 10) & STACK_ALIGN)
+
+	jr		ra
+SYM_FUNC_END(__arch_chacha20_blocks_nostack)
diff --git a/arch/loongarch/vdso/vgetrandom.c b/arch/loongarch/vdso/vgetrandom.c
new file mode 100644
index 000000000000..d5f258ac4a36
--- /dev/null
+++ b/arch/loongarch/vdso/vgetrandom.c
@@ -0,0 +1,10 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2024 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
+ */
+#include <linux/types.h>
+
+ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len)
+{
+	return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len);
+}