
[resend,04/15] arm64: add support for kernel mode NEON in interrupt context

Message ID: 1398959381-8126-5-git-send-email-ard.biesheuvel@linaro.org
State: New

Commit Message

Ard Biesheuvel May 1, 2014, 3:49 p.m. UTC
This patch modifies kernel_neon_begin() and kernel_neon_end() so that
they may be called from any context. To address the case where only a
couple of registers are needed, kernel_neon_begin_partial(u32) is
introduced, which takes the number 'n' of bottom NEON q-registers
required as its parameter. To mark the end of such a partial section,
the regular kernel_neon_end() should be used.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/include/asm/fpsimd.h       | 15 ++++++++++++
 arch/arm64/include/asm/fpsimdmacros.h | 35 ++++++++++++++++++++++++++++
 arch/arm64/include/asm/neon.h         |  6 ++++-
 arch/arm64/kernel/entry-fpsimd.S      | 24 +++++++++++++++++++
 arch/arm64/kernel/fpsimd.c            | 44 ++++++++++++++++++++++++-----------
 5 files changed, 109 insertions(+), 15 deletions(-)
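
To illustrate the intended usage (the helper below is hypothetical and not
part of this patch), a caller that only clobbers the bottom q-registers
could now do something like:

	#include <asm/neon.h>
	#include <arm_neon.h>

	/* Hypothetical helper: copy 16 bytes through q0. */
	static void neon_copy16(u8 *dst, const u8 *src)
	{
		kernel_neon_begin_partial(1);	/* only q0 needs stacking */
		vst1q_u8(dst, vld1q_u8(src));
		kernel_neon_end();
	}

Callers that need the full register file keep using kernel_neon_begin(),
which this patch turns into kernel_neon_begin_partial(32).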

Comments

Catalin Marinas May 6, 2014, 4:49 p.m. UTC | #1
On Thu, May 01, 2014 at 04:49:36PM +0100, Ard Biesheuvel wrote:
> diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
> index 7a900142dbc8..05e1b24aca4c 100644
> --- a/arch/arm64/include/asm/fpsimd.h
> +++ b/arch/arm64/include/asm/fpsimd.h
> @@ -41,6 +41,17 @@ struct fpsimd_state {
>  	unsigned int cpu;
>  };
>  
> +/*
> + * Struct for stacking the bottom 'n' FP/SIMD registers.
> + */
> +struct fpsimd_partial_state {
> +	u32		num_regs;
> +	u32		fpsr;
> +	u32		fpcr;
> +	__uint128_t	vregs[32] __aligned(16);
> +} __aligned(16);

Do we need this explicit alignment here?

> diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h
> index bbec599c96bd..69e75134689d 100644
> --- a/arch/arm64/include/asm/fpsimdmacros.h
> +++ b/arch/arm64/include/asm/fpsimdmacros.h
> @@ -62,3 +62,38 @@
>  	ldr	w\tmpnr, [\state, #16 * 2 + 4]
>  	msr	fpcr, x\tmpnr
>  .endm
> +
> +.altmacro
> +.macro fpsimd_save_partial state, numnr, tmpnr1, tmpnr2
> +	mrs	x\tmpnr1, fpsr
> +	str	w\numnr, [\state]
> +	mrs	x\tmpnr2, fpcr
> +	stp	w\tmpnr1, w\tmpnr2, [\state, #4]
> +	adr	x\tmpnr1, 0f
> +	add	\state, \state, x\numnr, lsl #4
> +	sub	x\tmpnr1, x\tmpnr1, x\numnr, lsl #1
> +	br	x\tmpnr1
> +	.irp	qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0
> +	.irp	qb, %(qa + 1)
> +	stp	q\qa, q\qb, [\state, # -16 * \qa - 16]
> +	.endr
> +	.endr
> +0:
> +.endm
> +
> +.macro fpsimd_restore_partial state, tmpnr1, tmpnr2
> +	ldp	w\tmpnr1, w\tmpnr2, [\state, #4]
> +	msr	fpsr, x\tmpnr1
> +	msr	fpcr, x\tmpnr2
> +	adr	x\tmpnr1, 0f
> +	ldr	w\tmpnr2, [\state]
> +	add	\state, \state, x\tmpnr2, lsl #4
> +	sub	x\tmpnr1, x\tmpnr1, x\tmpnr2, lsl #1
> +	br	x\tmpnr1
> +	.irp	qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0
> +	.irp	qb, %(qa + 1)
> +	ldp	q\qa, q\qb, [\state, # -16 * \qa - 16]
> +	.endr
> +	.endr
> +0:
> +.endm

BTW, it may be better if num_regs is placed at the end of the structure,
especially since you use stp to store both fpsr and fpcr (though I
haven't rewritten the above to see how they look).
Ard Biesheuvel May 6, 2014, 5:09 p.m. UTC | #2
On 6 May 2014 18:49, Catalin Marinas <catalin.marinas@arm.com> wrote:
> On Thu, May 01, 2014 at 04:49:36PM +0100, Ard Biesheuvel wrote:
>> diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
>> index 7a900142dbc8..05e1b24aca4c 100644
>> --- a/arch/arm64/include/asm/fpsimd.h
>> +++ b/arch/arm64/include/asm/fpsimd.h
>> @@ -41,6 +41,17 @@ struct fpsimd_state {
>>       unsigned int cpu;
>>  };
>>
>> +/*
>> + * Struct for stacking the bottom 'n' FP/SIMD registers.
>> + */
>> +struct fpsimd_partial_state {
>> +     u32             num_regs;
>> +     u32             fpsr;
>> +     u32             fpcr;
>> +     __uint128_t     vregs[32] __aligned(16);
>> +} __aligned(16);
>
> Do we need this explicit alignment here?
>

Without it, the implied alignment might only be 8 bytes, but I haven't
checked carefully.
I will check, and drop the explicit attribute if 16 bytes turns out to be
the default anyway.
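
A throwaway user-space check (just an illustration, assuming an AArch64
compiler; not part of the patch) would settle it:

	#include <stdio.h>

	/* Same layout as fpsimd_partial_state, minus the attribute. */
	struct partial {
		unsigned int	num_regs;
		unsigned int	fpsr;
		unsigned int	fpcr;
		__uint128_t	vregs[32];
	};

	int main(void)
	{
		printf("alignof(__uint128_t)    = %zu\n", _Alignof(__uint128_t));
		printf("alignof(struct partial) = %zu\n", _Alignof(struct partial));
		return 0;
	}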

>> diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h
>> index bbec599c96bd..69e75134689d 100644
>> --- a/arch/arm64/include/asm/fpsimdmacros.h
>> +++ b/arch/arm64/include/asm/fpsimdmacros.h
>> @@ -62,3 +62,38 @@
>>       ldr     w\tmpnr, [\state, #16 * 2 + 4]
>>       msr     fpcr, x\tmpnr
>>  .endm
>> +
>> +.altmacro
>> +.macro fpsimd_save_partial state, numnr, tmpnr1, tmpnr2
>> +     mrs     x\tmpnr1, fpsr
>> +     str     w\numnr, [\state]
>> +     mrs     x\tmpnr2, fpcr
>> +     stp     w\tmpnr1, w\tmpnr2, [\state, #4]
>> +     adr     x\tmpnr1, 0f
>> +     add     \state, \state, x\numnr, lsl #4
>> +     sub     x\tmpnr1, x\tmpnr1, x\numnr, lsl #1
>> +     br      x\tmpnr1
>> +     .irp    qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0
>> +     .irp    qb, %(qa + 1)
>> +     stp     q\qa, q\qb, [\state, # -16 * \qa - 16]
>> +     .endr
>> +     .endr
>> +0:
>> +.endm
>> +
>> +.macro fpsimd_restore_partial state, tmpnr1, tmpnr2
>> +     ldp     w\tmpnr1, w\tmpnr2, [\state, #4]
>> +     msr     fpsr, x\tmpnr1
>> +     msr     fpcr, x\tmpnr2
>> +     adr     x\tmpnr1, 0f
>> +     ldr     w\tmpnr2, [\state]
>> +     add     \state, \state, x\tmpnr2, lsl #4
>> +     sub     x\tmpnr1, x\tmpnr1, x\tmpnr2, lsl #1
>> +     br      x\tmpnr1
>> +     .irp    qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0
>> +     .irp    qb, %(qa + 1)
>> +     ldp     q\qa, q\qb, [\state, # -16 * \qa - 16]
>> +     .endr
>> +     .endr
>> +0:
>> +.endm
>
> BTW, it may be better if num_regs is placed at the end of the structure,
> especially since you use stp to store both fpsr and fpcr (though I
> haven't rewritten the above to see how they look).
>

I suppose you mean in the middle, i.e., after fpsr and fpcr?
Yes that makes sense, I will change that.
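
For reference, the reordering under discussion would look something like
this (a sketch only, not the respun patch; the explicit alignment attribute
is left out pending the question above):

	struct fpsimd_partial_state {
		u32		fpsr;
		u32		fpcr;
		u32		num_regs;
		__uint128_t	vregs[32];
	};

With fpsr and fpcr first, the stp in fpsimd_save_partial can target offset 0
directly, and num_regs ends up at offset 8.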

Patch

diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index 7a900142dbc8..05e1b24aca4c 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -41,6 +41,17 @@  struct fpsimd_state {
 	unsigned int cpu;
 };
 
+/*
+ * Struct for stacking the bottom 'n' FP/SIMD registers.
+ */
+struct fpsimd_partial_state {
+	u32		num_regs;
+	u32		fpsr;
+	u32		fpcr;
+	__uint128_t	vregs[32] __aligned(16);
+} __aligned(16);
+
+
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /* Masks for extracting the FPSR and FPCR from the FPSCR */
 #define VFP_FPSCR_STAT_MASK	0xf800009f
@@ -66,6 +77,10 @@  extern void fpsimd_update_current_state(struct fpsimd_state *state);
 
 extern void fpsimd_flush_task_state(struct task_struct *target);
 
+extern void fpsimd_save_partial_state(struct fpsimd_partial_state *state,
+				      u32 num_regs);
+extern void fpsimd_load_partial_state(struct fpsimd_partial_state *state);
+
 #endif
 
 #endif
diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h
index bbec599c96bd..69e75134689d 100644
--- a/arch/arm64/include/asm/fpsimdmacros.h
+++ b/arch/arm64/include/asm/fpsimdmacros.h
@@ -62,3 +62,38 @@ 
 	ldr	w\tmpnr, [\state, #16 * 2 + 4]
 	msr	fpcr, x\tmpnr
 .endm
+
+.altmacro
+.macro fpsimd_save_partial state, numnr, tmpnr1, tmpnr2
+	mrs	x\tmpnr1, fpsr
+	str	w\numnr, [\state]
+	mrs	x\tmpnr2, fpcr
+	stp	w\tmpnr1, w\tmpnr2, [\state, #4]
+	adr	x\tmpnr1, 0f
+	add	\state, \state, x\numnr, lsl #4
+	sub	x\tmpnr1, x\tmpnr1, x\numnr, lsl #1
+	br	x\tmpnr1
+	.irp	qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0
+	.irp	qb, %(qa + 1)
+	stp	q\qa, q\qb, [\state, # -16 * \qa - 16]
+	.endr
+	.endr
+0:
+.endm
+
+.macro fpsimd_restore_partial state, tmpnr1, tmpnr2
+	ldp	w\tmpnr1, w\tmpnr2, [\state, #4]
+	msr	fpsr, x\tmpnr1
+	msr	fpcr, x\tmpnr2
+	adr	x\tmpnr1, 0f
+	ldr	w\tmpnr2, [\state]
+	add	\state, \state, x\tmpnr2, lsl #4
+	sub	x\tmpnr1, x\tmpnr1, x\tmpnr2, lsl #1
+	br	x\tmpnr1
+	.irp	qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0
+	.irp	qb, %(qa + 1)
+	ldp	q\qa, q\qb, [\state, # -16 * \qa - 16]
+	.endr
+	.endr
+0:
+.endm
diff --git a/arch/arm64/include/asm/neon.h b/arch/arm64/include/asm/neon.h
index b0cc58a97780..13ce4cc18e26 100644
--- a/arch/arm64/include/asm/neon.h
+++ b/arch/arm64/include/asm/neon.h
@@ -8,7 +8,11 @@ 
  * published by the Free Software Foundation.
  */
 
+#include <linux/types.h>
+
 #define cpu_has_neon()		(1)
 
-void kernel_neon_begin(void);
+#define kernel_neon_begin()	kernel_neon_begin_partial(32)
+
+void kernel_neon_begin_partial(u32 num_regs);
 void kernel_neon_end(void);
diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S
index 6a27cd6dbfa6..d358ccacfc00 100644
--- a/arch/arm64/kernel/entry-fpsimd.S
+++ b/arch/arm64/kernel/entry-fpsimd.S
@@ -41,3 +41,27 @@  ENTRY(fpsimd_load_state)
 	fpsimd_restore x0, 8
 	ret
 ENDPROC(fpsimd_load_state)
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+
+/*
+ * Save the bottom n FP registers.
+ *
+ * x0 - pointer to struct fpsimd_partial_state
+ */
+ENTRY(fpsimd_save_partial_state)
+	fpsimd_save_partial x0, 1, 8, 9
+	ret
+ENDPROC(fpsimd_save_partial_state)
+
+/*
+ * Load the bottom n FP registers.
+ *
+ * x0 - pointer to struct fpsimd_partial_state
+ */
+ENTRY(fpsimd_load_partial_state)
+	fpsimd_restore_partial x0, 8, 9
+	ret
+ENDPROC(fpsimd_load_partial_state)
+
+#endif
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 6cfbb4ef27d7..82ebc259a787 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -218,29 +218,45 @@  void fpsimd_flush_task_state(struct task_struct *t)
 
 #ifdef CONFIG_KERNEL_MODE_NEON
 
+static DEFINE_PER_CPU(struct fpsimd_partial_state, hardirq_fpsimdstate);
+static DEFINE_PER_CPU(struct fpsimd_partial_state, softirq_fpsimdstate);
+
 /*
  * Kernel-side NEON support functions
  */
-void kernel_neon_begin(void)
+void kernel_neon_begin_partial(u32 num_regs)
 {
-	/* Avoid using the NEON in interrupt context */
-	BUG_ON(in_interrupt());
-	preempt_disable();
+	if (in_interrupt()) {
+		struct fpsimd_partial_state *s = this_cpu_ptr(
+			in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate);
 
-	/*
-	 * Save the userland FPSIMD state if we have one and if we haven't done
-	 * so already. Clear fpsimd_last_state to indicate that there is no
-	 * longer userland FPSIMD state in the registers.
-	 */
-	if (current->mm && !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE))
-		fpsimd_save_state(&current->thread.fpsimd_state);
-	this_cpu_write(fpsimd_last_state, NULL);
+		BUG_ON(num_regs > 32);
+		fpsimd_save_partial_state(s, roundup(num_regs, 2));
+	} else {
+		/*
+		 * Save the userland FPSIMD state if we have one and if we
+		 * haven't done so already. Clear fpsimd_last_state to indicate
+		 * that there is no longer userland FPSIMD state in the
+		 * registers.
+		 */
+		preempt_disable();
+		if (current->mm &&
+		    !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE))
+			fpsimd_save_state(&current->thread.fpsimd_state);
+		this_cpu_write(fpsimd_last_state, NULL);
+	}
 }
-EXPORT_SYMBOL(kernel_neon_begin);
+EXPORT_SYMBOL(kernel_neon_begin_partial);
 
 void kernel_neon_end(void)
 {
-	preempt_enable();
+	if (in_interrupt()) {
+		struct fpsimd_partial_state *s = this_cpu_ptr(
+			in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate);
+		fpsimd_load_partial_state(s);
+	} else {
+		preempt_enable();
+	}
 }
 EXPORT_SYMBOL(kernel_neon_end);