
[v2,09/10] ARM: p2v: switch to MOVW for Thumb2 and ARM/LPAE

Message ID 20200921154117.757-10-ardb@kernel.org
State Accepted
Commit e8e00f5afb087912fb3edb225ee373aa6499bb79
Series [v2,01/10] ARM: p2v: fix handling of LPAE translation in BE mode

Commit Message

Ard Biesheuvel Sept. 21, 2020, 3:41 p.m. UTC
In preparation for reducing the phys-to-virt minimum relative alignment
from 16 MiB to 2 MiB, switch to patchable sequences involving MOVW
instructions that can more easily be manipulated to carry a 12-bit
immediate. Note that the non-LPAE ARM sequence is not updated: MOVW
may not be supported on non-LPAE platforms, and the sequence itself
can be updated more easily to apply the 12 bits of displacement.

For Thumb2, which has many more versions of opcodes, switch to a sequence
that can be patched by the same patching code for both versions. Note
that the Thumb2 opcodes for MOVW and MVN are unambiguous, and have no
rotation bits in their immediate fields, so there is no need to use
placeholder constants in the asm blocks.

While at it, drop the 'volatile' qualifiers from the asm blocks: the
code does not have any side effects that are invisible to the compiler,
so it is free to omit these sequences if the outputs are not used.

Suggested-by: Russell King <linux@armlinux.org.uk>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/arm/include/asm/memory.h |  44 ++++--
 arch/arm/kernel/phys2virt.S   | 147 +++++++++++++++-----
 2 files changed, 148 insertions(+), 43 deletions(-)
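
As background for the opcode patching discussed below: the Thumb2 MOVW
instruction spreads its 16-bit immediate across the two opcode halfwords as
imm4:i:imm3:imm8. The following C sketch is illustrative only (movw_set_imm16
and its parameters are not part of the patch); it shows how a 16-bit value
maps onto those fields:

#include <stdint.h>

/*
 * Merge a 16-bit immediate into the two halfwords of a Thumb2 MOVW
 * (encoding T3).  First halfword:  11110 i 100100 imm4
 *                 Second halfword: 0 imm3 Rd imm8
 * Illustrative sketch only -- not taken from the patch.
 */
static void movw_set_imm16(uint16_t *first_hw, uint16_t *second_hw, uint16_t val)
{
	uint16_t imm4 = (val >> 12) & 0xf;
	uint16_t i    = (val >> 11) & 0x1;
	uint16_t imm3 = (val >>  8) & 0x7;
	uint16_t imm8 =  val        & 0xff;

	*first_hw  = (*first_hw  & ~0x040f) | (i << 10) | imm4;
	*second_hw = (*second_hw & ~0x70ff) | (imm3 << 12) | imm8;
}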

Comments

Nicolas Pitre Sept. 21, 2020, 6:29 p.m. UTC | #1
On Mon, 21 Sep 2020, Ard Biesheuvel wrote:

> In preparation for reducing the phys-to-virt minimum relative alignment
> from 16 MiB to 2 MiB, switch to patchable sequences involving MOVW
> instructions that can more easily be manipulated to carry a 12-bit
> immediate. Note that the non-LPAE ARM sequence is not updated: MOVW
> may not be supported on non-LPAE platforms, and the sequence itself
> can be updated more easily to apply the 12 bits of displacement.

I'm still wondering what the advantage of a movw+lsl is over two adds.
I think I remember, but I'd have to go back to the ARM ARM to be sure.
Could you elaborate a bit more on the "more easily be manipulated"
part, please? No need to resend just for this.

> For Thumb2, which has many more versions of opcodes, switch to a sequence
> that can be patched by the same patching code for both versions. Note
> that the Thumb2 opcodes for MOVW and MVN are unambiguous, and have no
> rotation bits in their immediate fields, so there is no need to use
> placeholder constants in the asm blocks.
>
> While at it, drop the 'volatile' qualifiers from the asm blocks: the
> code does not have any side effects that are invisible to the compiler,
> so it is free to omit these sequences if the outputs are not used.

Indeed. Weird that the volatile was there in the first place.
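
The point being relied on: a non-volatile extended asm whose outputs all go
unused has no observable effect, so the compiler is free to drop it. A minimal
generic sketch of that behaviour (not code from the patch):

static unsigned long add_one(unsigned long x)
{
	unsigned long t;

	/* No 'volatile': if t ended up unused, the whole asm could be elided. */
	asm("add %0, %1, #1" : "=r" (t) : "r" (x));
	return t;
}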


Ard Biesheuvel Sept. 21, 2020, 6:45 p.m. UTC | #2
On Mon, 21 Sep 2020 at 20:29, Nicolas Pitre <nico@fluxnic.net> wrote:
>
> On Mon, 21 Sep 2020, Ard Biesheuvel wrote:
>
> > In preparation for reducing the phys-to-virt minimum relative alignment
> > from 16 MiB to 2 MiB, switch to patchable sequences involving MOVW
> > instructions that can more easily be manipulated to carry a 12-bit
> > immediate. Note that the non-LPAE ARM sequence is not updated: MOVW
> > may not be supported on non-LPAE platforms, and the sequence itself
> > can be updated more easily to apply the 12 bits of displacement.
>
> I'm still wondering what the advantage of a movw+lsl is over two adds.
> I think I remember, but I'd have to go back to the ARM ARM to be sure.
> Could you elaborate a bit more on the "more easily be manipulated"
> part, please? No need to resend just for this.
>

The reason for using movw+lsl for Thumb2 is that it allows us to use
the same patching code for all three sequences: every MOVW gets the
low order offset patched in, and every MVN the high order offset.
Also, as the immediate format is different between MOVW and ADD/SUB,
and not as easily patchable in the latter case, we'd have to keep four
different values to patch into the opcodes (one for movw, one for
mov/mvn, one for the first add/sub, and one for the second), and have
logic to distinguish between all those opcodes.
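
To make that concrete, here is a rough C model of the Thumb-2 fixup path
described above (illustration only, with invented names; the real logic is the
assembly in arch/arm/kernel/phys2virt.S, and BE8 byte swapping is ignored):

#include <stdbool.h>
#include <stdint.h>

/*
 * hw[0]/hw[1] are the two halfwords of the instruction to patch.
 * Bit 9 of the first halfword is set for MOVW and clear for MOV/MVN,
 * so a single test decides which offset byte lands in imm8.
 */
static void patch_thumb2_insn(uint16_t hw[2], uint8_t off_31_24,
			      uint8_t off_39_32, bool off_negative)
{
	bool is_movw = hw[0] & 0x0200;
	uint8_t imm8;

	if (is_movw) {
		imm8 = off_31_24;			/* low-order offset byte */
	} else {
		/* flip MOV <-> MVN via bit 5; MVN #0 encodes 0xffffffff */
		hw[0] = off_negative ? (hw[0] | 0x0020) : (hw[0] & ~0x0020);
		imm8 = off_negative ? 0 : off_39_32;
	}
	hw[1] = (hw[1] & 0x0f00) | imm8;		/* keep Rd, set imm8 */
}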

Linus Walleij Sept. 22, 2020, 9 a.m. UTC | #3
On Mon, Sep 21, 2020 at 5:41 PM Ard Biesheuvel <ardb@kernel.org> wrote:

> In preparation for reducing the phys-to-virt minimum relative alignment
> from 16 MiB to 2 MiB, switch to patchable sequences involving MOVW
> instructions that can more easily be manipulated to carry a 12-bit
> immediate. Note that the non-LPAE ARM sequence is not updated: MOVW
> may not be supported on non-LPAE platforms, and the sequence itself
> can be updated more easily to apply the 12 bits of displacement.
>
> For Thumb2, which has many more versions of opcodes, switch to a sequence
> that can be patched by the same patching code for both versions. Note
> that the Thumb2 opcodes for MOVW and MVN are unambiguous, and have no
> rotation bits in their immediate fields, so there is no need to use
> placeholder constants in the asm blocks.
>
> While at it, drop the 'volatile' qualifiers from the asm blocks: the
> code does not have any side effects that are invisible to the compiler,
> so it is free to omit these sequences if the outputs are not used.
>
> Suggested-by: Russell King <linux@armlinux.org.uk>
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>

Already the nice comments explaining what is going on
here make the patch highly valuable. It was opaque to
me until I read the comments in this patch. Now it is
just hard to understand instead of hopeless to understand
(for my limited intellect):
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>


Yours,
Linus Walleij

Patch

diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h
index 4121662dea5a..ccf55cef6ab9 100644
--- a/arch/arm/include/asm/memory.h
+++ b/arch/arm/include/asm/memory.h
@@ -183,6 +183,7 @@  extern const void *__pv_table_begin, *__pv_table_end;
 #define PHYS_OFFSET	((phys_addr_t)__pv_phys_pfn_offset << PAGE_SHIFT)
 #define PHYS_PFN_OFFSET	(__pv_phys_pfn_offset)
 
+#ifndef CONFIG_THUMB2_KERNEL
 #define __pv_stub(from,to,instr)			\
 	__asm__("@ __pv_stub\n"				\
 	"1:	" instr "	%0, %1, %2\n"		\
@@ -192,25 +193,45 @@  extern const void *__pv_table_begin, *__pv_table_end;
 	: "=r" (to)					\
 	: "r" (from), "I" (__PV_BITS_31_24))
 
-#define __pv_stub_mov_hi(t)				\
-	__asm__ volatile("@ __pv_stub_mov\n"		\
-	"1:	mov	%R0, %1\n"			\
+#define __pv_add_carry_stub(x, y)			\
+	__asm__("@ __pv_add_carry_stub\n"		\
+	"0:	movw	%R0, #0\n"			\
+	"	adds	%Q0, %1, %R0, lsl #24\n"	\
+	"1:	mov	%R0, %2\n"			\
+	"	adc	%R0, %R0, #0\n"			\
 	"	.pushsection .pv_table,\"a\"\n"		\
-	"	.long	1b - .\n"			\
+	"	.long	0b - ., 1b - .\n"		\
 	"	.popsection\n"				\
-	: "=r" (t)					\
-	: "I" (__PV_BITS_7_0))
+	: "=&r" (y)					\
+	: "r" (x), "I" (__PV_BITS_7_0)			\
+	: "cc")
+
+#else
+#define __pv_stub(from,to,instr)			\
+	__asm__("@ __pv_stub\n"				\
+	"0:	movw	%0, #0\n"			\
+	"	lsl	%0, #24\n"			\
+	"	" instr " %0, %1, %0\n"			\
+	"	.pushsection .pv_table,\"a\"\n"		\
+	"	.long	0b - .\n"			\
+	"	.popsection\n"				\
+	: "=&r" (to)					\
+	: "r" (from))
 
 #define __pv_add_carry_stub(x, y)			\
-	__asm__ volatile("@ __pv_add_carry_stub\n"	\
-	"1:	adds	%Q0, %1, %2\n"			\
+	__asm__("@ __pv_add_carry_stub\n"		\
+	"0:	movw	%R0, #0\n"			\
+	"	lsls	%R0, #24\n"			\
+	"	adds	%Q0, %1, %R0\n"			\
+	"1:	mvn	%R0, #0\n"			\
 	"	adc	%R0, %R0, #0\n"			\
 	"	.pushsection .pv_table,\"a\"\n"		\
-	"	.long	1b - .\n"			\
+	"	.long	0b - ., 1b - .\n"		\
 	"	.popsection\n"				\
-	: "+r" (y)					\
-	: "r" (x), "I" (__PV_BITS_31_24)		\
+	: "=&r" (y)					\
+	: "r" (x)					\
 	: "cc")
+#endif
 
 static inline phys_addr_t __virt_to_phys_nodebug(unsigned long x)
 {
@@ -219,7 +240,6 @@  static inline phys_addr_t __virt_to_phys_nodebug(unsigned long x)
 	if (sizeof(phys_addr_t) == 4) {
 		__pv_stub(x, t, "add");
 	} else {
-		__pv_stub_mov_hi(t);
 		__pv_add_carry_stub(x, t);
 	}
 	return t;
diff --git a/arch/arm/kernel/phys2virt.S b/arch/arm/kernel/phys2virt.S
index be8fb0d89877..a4e364689663 100644
--- a/arch/arm/kernel/phys2virt.S
+++ b/arch/arm/kernel/phys2virt.S
@@ -1,7 +1,7 @@ 
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
  *  Copyright (C) 1994-2002 Russell King
- *  Copyright (c) 2003 ARM Limited
+ *  Copyright (c) 2003, 2020 ARM Limited
  *  All Rights Reserved
  */
 
@@ -58,55 +58,140 @@  __fixup_a_pv_table:
 	mov	r6, r6, lsr #24
 	cmn	r0, #1
 #ifdef CONFIG_THUMB2_KERNEL
+	@
+	@ The Thumb-2 versions of the patchable sequences are
+	@
+	@ phys-to-virt:			movw	<reg>, #offset<31:24>
+	@				lsl	<reg>, #24
+	@				sub	<VA>, <PA>, <reg>
+	@
+	@ virt-to-phys (non-LPAE):	movw	<reg>, #offset<31:24>
+	@				lsl	<reg>, #24
+	@				add	<PA>, <VA>, <reg>
+	@
+	@ virt-to-phys (LPAE):		movw	<reg>, #offset<31:24>
+	@				lsl	<reg>, #24
+	@				adds	<PAlo>, <VA>, <reg>
+	@				mov	<PAhi>, #offset<39:32>
+	@				adc	<PAhi>, <PAhi>, #0
+	@
+	@ In the non-LPAE case, all patchable instructions are MOVW
+	@ instructions, where we need to patch in the offset into the
+	@ second halfword of the opcode (the 16-bit immediate is encoded
+	@ as imm4:i:imm3:imm8)
+	@
+	@       15       11 10  9           4 3    0  15  14  12 11 8 7    0
+	@      +-----------+---+-------------+------++---+------+----+------+
+	@ MOVW | 1 1 1 1 0 | i | 1 0 0 1 0 0 | imm4 || 0 | imm3 | Rd | imm8 |
+	@      +-----------+---+-------------+------++---+------+----+------+
+	@
+	@ In the LPAE case, we also need to patch in the high word of the
+	@ offset into the immediate field of the MOV instruction, or patch it
+	@ to a MVN instruction if the offset is negative. In this case, we
+	@ need to inspect the first halfword of the opcode, to check whether
+	@ it is MOVW or MOV/MVN, and to perform the MOV to MVN patching if
+	@ needed. The encoding of the immediate is rather complex for values
+	@ of i:imm3 != 0b0000, but fortunately, we never need more than 8 lower
+	@ order bits, which can be patched into imm8 directly (and i:imm3
+	@ cleared)
+	@
+	@      15       11 10  9        5         0  15  14  12 11 8 7    0
+	@     +-----------+---+---------------------++---+------+----+------+
+	@ MOV | 1 1 1 1 0 | i | 0 0 0 1 0 0 1 1 1 1 || 0 | imm3 | Rd | imm8 |
+	@ MVN | 1 1 1 1 0 | i | 0 0 0 1 1 0 1 1 1 1 || 0 | imm3 | Rd | imm8 |
+	@     +-----------+---+---------------------++---+------+----+------+
+	@
 	moveq	r0, #0x200000		@ set bit 21, mov to mvn instruction
-	lsls	r6, #24
-	beq	.Lnext
-	clz	r7, r6
-	lsr	r6, #24
-	lsl	r6, r7
-	bic	r6, #0x0080
-	lsrs	r7, #1
-	orrcs	r6, #0x0080
-	orr	r6, r6, r7, lsl #12
-	orr	r6, #0x4000
 	b	.Lnext
 .Lloop:	add	r7, r4
-	adds	r4, #4
-	ldrh	ip, [r7, #2]
-ARM_BE8(rev16	ip, ip)
-	tst	ip, #0x4000
-	and	ip, #0x8f00
-	orrne	ip, r6			@ mask in offset bits 31-24
-	orreq	ip, r0			@ mask in offset bits 7-0
-ARM_BE8(rev16	ip, ip)
-	strh	ip, [r7, #2]
-	bne	.Lnext
+	adds	r4, #4			@ clears Z flag
+#ifdef CONFIG_ARM_LPAE
 	ldrh	ip, [r7]
 ARM_BE8(rev16	ip, ip)
-	bic	ip, #0x20
-	orr	ip, ip, r0, lsr #16
+	tst	ip, #0x200		@ MOVW has bit 9 set, MVN has it clear
+	bne	0f			@ skip to MOVW handling (Z flag is clear)
+	bic	ip, #0x20		@ clear bit 5 (MVN -> MOV)
+	orr	ip, ip, r0, lsr #16	@ MOV -> MVN if offset < 0
 ARM_BE8(rev16	ip, ip)
 	strh	ip, [r7]
+	@ Z flag is set
+0:
+#endif
+	ldrh	ip, [r7, #2]
+ARM_BE8(rev16	ip, ip)
+	and	ip, #0xf00		@ clear everything except Rd field
+	orreq	ip, r0			@ Z flag set -> MOV/MVN -> patch in high bits
+	orrne	ip, r6			@ Z flag clear -> MOVW -> patch in low bits
+ARM_BE8(rev16	ip, ip)
+	strh	ip, [r7, #2]
 #else
 #ifdef CONFIG_CPU_ENDIAN_BE8
 @ in BE8, we load data in BE, but instructions still in LE
-#define PV_BIT22	0x00004000
+#define PV_BIT24	0x00000001
 #define PV_IMM8_MASK	0xff000000
-#define PV_ROT_MASK	0x000f0000
 #else
-#define PV_BIT22	0x00400000
+#define PV_BIT24	0x01000000
 #define PV_IMM8_MASK	0x000000ff
-#define PV_ROT_MASK	0xf00
 #endif
 
+	@
+	@ The ARM versions of the patchable sequences are
+	@
+	@ phys-to-virt:			sub	<VA>, <PA>, #offset<31:24>, lsl #24
+	@
+	@ virt-to-phys (non-LPAE):	add	<PA>, <VA>, #offset<31:24>, lsl #24
+	@
+	@ virt-to-phys (LPAE):		movw	<reg>, #offset<31:24>
+	@				adds	<PAlo>, <VA>, <reg>, lsl #24
+	@				mov	<PAhi>, #offset<39:32>
+	@				adc	<PAhi>, <PAhi>, #0
+	@
+	@ In the non-LPAE case, all patchable instructions are ADD or SUB
+	@ instructions, where we need to patch in the offset into the
+	@ immediate field of the opcode, which is emitted with the correct
+	@ rotation value. (The effective value of the immediate is imm12<7:0>
+	@ rotated right by [2 * imm12<11:8>] bits)
+	@
+	@      31   28 27      23 22  20 19  16 15  12 11    0
+	@      +------+-----------------+------+------+-------+
+	@  ADD | cond | 0 0 1 0 1 0 0 0 |  Rn  |  Rd  | imm12 |
+	@  SUB | cond | 0 0 1 0 0 1 0 0 |  Rn  |  Rd  | imm12 |
+	@  MOV | cond | 0 0 1 1 1 0 1 0 |  Rn  |  Rd  | imm12 |
+	@  MVN | cond | 0 0 1 1 1 1 1 0 |  Rn  |  Rd  | imm12 |
+	@      +------+-----------------+------+------+-------+
+	@
+	@ In the LPAE case, we use a MOVW instruction to carry the low offset
+	@ word, and patch in the high word of the offset into the immediate
+	@ field of the subsequent MOV instruction, or patch it to a MVN
+	@ instruction if the offset is negative. We can distinguish MOVW
+	@ instructions based on bits 23:22 of the opcode, and ADD/SUB can be
+	@ distinguished from MOV/MVN (all using the encodings above) using
+	@ bit 24.
+	@
+	@      31   28 27      23 22  20 19  16 15  12 11    0
+	@      +------+-----------------+------+------+-------+
+	@ MOVW | cond | 0 0 1 1 0 0 0 0 | imm4 |  Rd  | imm12 |
+	@      +------+-----------------+------+------+-------+
+	@
 	moveq	r0, #0x400000		@ set bit 22, mov to mvn instruction
 	b	.Lnext
 .Lloop:	ldr	ip, [r7, r4]
+#ifdef CONFIG_ARM_LPAE
+	tst	ip, #PV_BIT24		@ ADD/SUB have bit 24 clear
+	beq	1f
+ARM_BE8(rev	ip, ip)
+	tst	ip, #0xc00000		@ MOVW has bits 23:22 clear
+	bic	ip, ip, #0x400000	@ clear bit 22
+	bfc	ip, #0, #12		@ clear imm12 field of MOV[W] instruction
+	orreq	ip, ip, r6		@ MOVW -> mask in offset bits 31-24
+	orrne	ip, ip, r0		@ MOV  -> mask in offset bits 7-0 (or bit 22)
+ARM_BE8(rev	ip, ip)
+	b	2f
+1:
+#endif
 	bic	ip, ip, #PV_IMM8_MASK
-	tst	ip, #PV_ROT_MASK		@ check the rotation field
-	orrne	ip, ip, r6 ARM_BE8(, lsl #24)	@ mask in offset bits 31-24
-	biceq	ip, ip, #PV_BIT22		@ clear bit 22
-	orreq	ip, ip, r0 ARM_BE8(, ror #8)	@ mask in offset bits 7-0 (or bit 22)
+	orr	ip, ip, r6 ARM_BE8(, lsl #24)	@ mask in offset bits 31-24
+2:
 	str	ip, [r7, r4]
 	add	r4, r4, #4
 #endif