[Xen-devel,2/2] xen: arm: update arm32 assembly primitives to Linux v3.16-rc6

Message ID 2c06427f1180cf408a3e9750de3040dde0afe2ea.1406301772.git.ian.campbell@citrix.com
State Accepted
Commit 65e9f0660697e895ada7999a4e9822c716bc4ec7

Commit Message

Ian Campbell July 25, 2014, 3:22 p.m. UTC
bitops, cmpxchg, atomics: Import:
  c32ffce ARM: 7984/1: prefetch: add prefetchw invocations for barriered atomics
    Author: Will Deacon <will.deacon@arm.com>
    Signed-off-by: Will Deacon <will.deacon@arm.com>
    Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

atomics: In addition to the above import:
  db38ee8 ARM: 7983/1: atomics: implement a better __atomic_add_unless for v6+
    Author: Will Deacon <will.deacon@arm.com>
    Signed-off-by: Will Deacon <will.deacon@arm.com>
    Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

spinlocks: We have diverged from Linux, so no updates, but note this in the README.

mem* and str*: Import:
  d98b90e ARM: 7990/1: asm: rename logical shift macros push pull into lspush lspull
    Author: Victor Kamensky <victor.kamensky@linaro.org>
    Suggested-by: Will Deacon <will.deacon@arm.com>
    Signed-off-by: Victor Kamensky <victor.kamensky@linaro.org>
    Acked-by: Nicolas Pitre <nico@linaro.org>
    Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

  For some reason str* were listed under mem* in the README; fix that.

libgcc: No changes, update baseline

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
---
 xen/arch/arm/README.LinuxPrimitives    |   17 +++++++--------
 xen/arch/arm/arm32/lib/assembler.h     |    8 +++----
 xen/arch/arm/arm32/lib/bitops.h        |    5 +++++
 xen/arch/arm/arm32/lib/copy_template.S |   36 ++++++++++++++++----------------
 xen/arch/arm/arm32/lib/memmove.S       |   36 ++++++++++++++++----------------
 xen/include/asm-arm/arm32/atomic.h     |   32 ++++++++++++++++++++++++++++
 xen/include/asm-arm/arm32/cmpxchg.h    |    5 +++++
 7 files changed, 90 insertions(+), 49 deletions(-)

Comments

Julien Grall July 25, 2014, 3:42 p.m. UTC | #1
Hi Ian,

On 07/25/2014 04:22 PM, Ian Campbell wrote:
> bitops, cmpxchg, atomics: Import:
>   c32ffce ARM: 7984/1: prefetch: add prefetchw invocations for barriered atomics

Compared to Linux, we don't have specific prefetch* helpers; we
directly use the compiler builtin ones. Shouldn't we import the
ARM-specific helpers to gain performance?

Regards,

Ian Campbell July 25, 2014, 3:48 p.m. UTC | #2
On Fri, 2014-07-25 at 16:42 +0100, Julien Grall wrote:
> Hi Ian,
> 
> On 07/25/2014 04:22 PM, Ian Campbell wrote:
> > bitops, cmpxchg, atomics: Import:
> >   c32ffce ARM: 7984/1: prefetch: add prefetchw invocations for barriered atomics
> 
> Compared to Linux, we don't have specific prefetch* helpers; we
> directly use the compiler builtin ones. Shouldn't we import the
> ARM-specific helpers to gain performance?

My binaries are full of pld instructions where I think I would expect
them, so it seems like the compiler builtin ones are sufficient.

I suspect the Linux define is there to cope with older compilers or
something.

Ian.
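
To make the two options in this exchange concrete: the "compiler builtin
ones" are GCC's __builtin_prefetch(), which takes an optional read/write
hint and which the compiler can lower to pld/pldw on ARMv7. The sketch
below is illustrative only -- the wrapper names are made up and are not
the actual definitions in Xen's headers.

/* Illustrative sketch of builtin-based prefetch helpers (not Xen's). */
static inline void sketch_prefetch(const void *p)
{
	__builtin_prefetch(p);		/* read hint: typically lowered to pld */
}

static inline void sketch_prefetchw(const void *p)
{
	__builtin_prefetch(p, 1);	/* write hint: pldw where the target has it */
}
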
Julien Grall July 25, 2014, 3:48 p.m. UTC | #3
On 07/25/2014 04:48 PM, Ian Campbell wrote:
> On Fri, 2014-07-25 at 16:42 +0100, Julien Grall wrote:
>> Hi Ian,
>>
>> On 07/25/2014 04:22 PM, Ian Campbell wrote:
>>> bitops, cmpxchg, atomics: Import:
>>>   c32ffce ARM: 7984/1: prefetch: add prefetchw invocations for barriered atomics
>>
>> Compared to Linux, we don't have specific prefetch* helpers; we
>> directly use the compiler builtin ones. Shouldn't we import the
>> ARM-specific helpers to gain performance?
> 
> My binaries are full of pld instructions where I think I would expect
> them, so it seems like the compiler builtin ones are sufficient.
> 
> I suspect the Linux define is there to cope with older compilers or
> something.

If so:

Acked-by: Julien Grall <julien.grall@linaro.org>

Regards,
Ian Campbell July 25, 2014, 4:03 p.m. UTC | #4
On Fri, 2014-07-25 at 16:48 +0100, Julien Grall wrote:
> On 07/25/2014 04:48 PM, Ian Campbell wrote:
> > On Fri, 2014-07-25 at 16:42 +0100, Julien Grall wrote:
> >> Hi Ian,
> >>
> >> On 07/25/2014 04:22 PM, Ian Campbell wrote:
> >>> bitops, cmpxchg, atomics: Import:
> >>>   c32ffce ARM: 7984/1: prefetch: add prefetchw invocations for barriered atomics
> >>
> >> Compared to Linux, we don't have specific prefetch* helpers; we
> >> directly use the compiler builtin ones. Shouldn't we import the
> >> ARM-specific helpers to gain performance?
> > 
> > My binaries are full of pld instructions where I think I would expect
> > them, so it seems like the compiler builtin ones are sufficient.
> > 
> > I suspect the Linux define is there to cope with older compilers or
> > something.
> 
> If so:

The compiled output is very different if I use the arch specific
explicit variants. The explicit variant generates (lots) more pldw and
(somewhat) fewer pld. I've no idea what this means...

Note that the builtins presumably let gcc reason about whether preloads
are needed, whereas the explicit variants do not. I'm not sure how that
results in fewer pld with the explicit variant though! (unless it's
doing some sort of peephole optimisation and throwing them away?)

I've no idea what the right answer is.

How about we take the updates for now and revisit the question of
builtin vs explicit prefetches some other time?


> Acked-by: Julien Grall <julien.grall@linaro.org>
> 
> Regards,
>
Julien Grall July 25, 2014, 4:17 p.m. UTC | #5
On 07/25/2014 05:03 PM, Ian Campbell wrote:
> On Fri, 2014-07-25 at 16:48 +0100, Julien Grall wrote:
>> On 07/25/2014 04:48 PM, Ian Campbell wrote:
>>> On Fri, 2014-07-25 at 16:42 +0100, Julien Grall wrote:
>>>> Hi Ian,
>>>>
>>>> On 07/25/2014 04:22 PM, Ian Campbell wrote:
>>>>> bitops, cmpxchg, atomics: Import:
>>>>>   c32ffce ARM: 7984/1: prefetch: add prefetchw invocations for barriered atomics
>>>>
>>>> Compared to Linux, we don't have specific prefetch* helpers; we
>>>> directly use the compiler builtin ones. Shouldn't we import the
>>>> ARM-specific helpers to gain performance?
>>>
>>> My binaries are full of pld instructions where I think I would expect
>>> them, so it seems like the compiler builtin ones are sufficient.
>>>
>>> I suspect the Linux define is there to cope with older compilers or
>>> something.
>>
>> If so:
> 
> The compiled output is very different if I use the arch specific
> explicit variants. The explicit variant generates (lots) more pldw and
> (somewhat) fewer pld. I've no idea what this means...

It looks like pldw has been defined for ARMv7 with MP extensions.

AFAIU, pldw is used to signal that we will likely write to this address.

I guess we use the prefetch* helpers more often for writes to memory.

> 
> Note that the builtins presumably let gcc reason about whether preloads
> are needed, whereas the explicit variants do not. I'm not sure how that
> results in fewer pld with the explicit variant though! (unless it's
> doing some sort of peephole optimisation and throwing them away?)
> 
> I've no idea what the right answer is.
> 
> How about we take the updates for now and revisit the question of
> builtin vs explicit prefetches some other time?

I'm fine with it. You can keep the ack for this patch.

Regards,
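
For reference, the pattern the patch adds prefetchw() to has roughly the
shape sketched below: the write hint is issued just before an ldrex/strex
update loop, so the cache line can be fetched for writing before the
exclusive access begins. This is a simplified standalone sketch (it uses
the GCC builtin directly and omits the smp_mb() barriers that the real
atomic_add_return() has); it is not the Xen code itself.

static inline int sketch_add_return(int i, int *counter)
{
	int result, tmp;

	__builtin_prefetch(counter, 1);	/* hint: this line is about to be written */

	__asm__ __volatile__(
"1:	ldrex	%0, [%3]\n"		/* load-exclusive the old value */
"	add	%0, %0, %4\n"		/* compute the new value */
"	strex	%1, %0, [%3]\n"		/* try to store it; %1 == 0 on success */
"	teq	%1, #0\n"
"	bne	1b\n"			/* lost exclusivity -- retry */
	: "=&r" (result), "=&r" (tmp), "+Qo" (*counter)
	: "r" (counter), "Ir" (i)
	: "cc");

	return result;
}
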
Ian Campbell July 25, 2014, 4:23 p.m. UTC | #6
On Fri, 2014-07-25 at 17:17 +0100, Julien Grall wrote:
> On 07/25/2014 05:03 PM, Ian Campbell wrote:
> > On Fri, 2014-07-25 at 16:48 +0100, Julien Grall wrote:
> >> On 07/25/2014 04:48 PM, Ian Campbell wrote:
> >>> On Fri, 2014-07-25 at 16:42 +0100, Julien Grall wrote:
> >>>> Hi Ian,
> >>>>
> >>>> On 07/25/2014 04:22 PM, Ian Campbell wrote:
> >>>>> bitops, cmpxchg, atomics: Import:
> >>>>>   c32ffce ARM: 7984/1: prefetch: add prefetchw invocations for barriered atomics
> >>>>
> >>>> Compared to Linux, we don't have specific prefetch* helpers; we
> >>>> directly use the compiler builtin ones. Shouldn't we import the
> >>>> ARM-specific helpers to gain performance?
> >>>
> >>> My binaries are full of pld instructions where I think I would expect
> >>> them, so it seems like the compiler builtin ones are sufficient.
> >>>
> >>> I suspect the Linux define is there to cope with older compilers or
> >>> something.
> >>
> >> If so:
> > 
> > The compiled output is very different if I use the arch specific
> > explicit variants. The explicit variant generates (lots) more pldw and
> > (somewhat) fewer pld. I've no idea what this means...
> 
> It looks like pldw has been defined for ARMv7 with MP extensions.
>
> AFAIU, pldw is used to signal that we will likely write to this address.

Oh, I know *that*.

What I couldn't explain is why the builtins should generate 181 pld's
and 6 pldw's (total 187) while the explicit ones generate 127 pld's and
93 pldw's (total 220) for the exact same code base.

Perhaps, in gcc's opinion, we simply use prefetchw too often in our code,
so it elides some of them. Or perhaps the volatile in the explicit
version stops gcc from making other optimisations, so there are simply
more occasions where prefetching is needed.

The difference in the write prefetches is pretty stark though, 6 vs 93.

Ian.
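
For comparison with the builtin form sketched earlier in the thread, the
Linux-style explicit helper under discussion looks roughly like this (the
real definition lives in Linux's arch/arm/include/asm/processor.h and also
handles the SMP/UP alternatives, so treat this as a simplified
approximation rather than a drop-in copy). A volatile asm statement has to
be emitted wherever it appears, whereas a builtin stays visible to the
optimiser, which is one plausible explanation for the differing pld/pldw
counts quoted above.

/* Simplified approximation of an explicit ARMv7-MP write prefetch. */
static inline void sketch_explicit_prefetchw(const void *p)
{
	__asm__ __volatile__(
		".arch_extension mp\n"
		"	pldw	%a0"	/* preload for write */
		: : "p" (p));
}
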

Patch

diff --git a/xen/arch/arm/README.LinuxPrimitives b/xen/arch/arm/README.LinuxPrimitives
index 69eeb70..7e15b04 100644
--- a/xen/arch/arm/README.LinuxPrimitives
+++ b/xen/arch/arm/README.LinuxPrimitives
@@ -65,7 +65,7 @@  linux/arch/arm64/lib/copy_page.S        unused in Xen
 arm32
 =====================================================================
 
-bitops: last sync @ v3.14-rc7 (last commit: b7ec699)
+bitops: last sync @ v3.16-rc6 (last commit: c32ffce0f66e)
 
 linux/arch/arm/lib/bitops.h             xen/arch/arm/arm32/lib/bitops.h
 linux/arch/arm/lib/changebit.S          xen/arch/arm/arm32/lib/changebit.S
@@ -83,13 +83,13 @@  done
 
 ---------------------------------------------------------------------
 
-cmpxchg: last sync @ v3.14-rc7 (last commit: 775ebcc)
+cmpxchg: last sync @ v3.16-rc6 (last commit: c32ffce0f66e)
 
 linux/arch/arm/include/asm/cmpxchg.h    xen/include/asm-arm/arm32/cmpxchg.h
 
 ---------------------------------------------------------------------
 
-atomics: last sync @ v3.14-rc7 (last commit: aed3a4e)
+atomics: last sync @ v3.16-rc6 (last commit: 030d0178bdbd)
 
 linux/arch/arm/include/asm/atomic.h     xen/include/asm-arm/arm32/atomic.h
 
@@ -99,6 +99,8 @@  spinlocks: last sync: 15e7e5c1ebf5
 
 linux/arch/arm/include/asm/spinlock.h   xen/include/asm-arm/arm32/spinlock.h
 
+*** Linux has switched to ticket locks but we still use bitlocks.
+
 resync to v3.14-rc7:
 
   7c8746a ARM: 7955/1: spinlock: ensure we have a compiler barrier before sev
@@ -111,7 +113,7 @@  resync to v3.14-rc7:
 
 ---------------------------------------------------------------------
 
-mem*: last sync @ v3.14-rc7 (last commit: 418df63a)
+mem*: last sync @ v3.16-rc6 (last commit: d98b90ea22b0)
 
 linux/arch/arm/lib/copy_template.S      xen/arch/arm/arm32/lib/copy_template.S
 linux/arch/arm/lib/memchr.S             xen/arch/arm/arm32/lib/memchr.S
@@ -120,9 +122,6 @@  linux/arch/arm/lib/memmove.S            xen/arch/arm/arm32/lib/memmove.S
 linux/arch/arm/lib/memset.S             xen/arch/arm/arm32/lib/memset.S
 linux/arch/arm/lib/memzero.S            xen/arch/arm/arm32/lib/memzero.S
 
-linux/arch/arm/lib/strchr.S             xen/arch/arm/arm32/lib/strchr.S
-linux/arch/arm/lib/strrchr.S            xen/arch/arm/arm32/lib/strrchr.S
-
 for i in copy_template.S memchr.S memcpy.S memmove.S memset.S \
          memzero.S ; do
     diff -u linux/arch/arm/lib/$i xen/arch/arm/arm32/lib/$i
@@ -130,7 +129,7 @@  done
 
 ---------------------------------------------------------------------
 
-str*: last sync @ v3.13-rc7 (last commit: 93ed397)
+str*: last sync @ v3.16-rc6 (last commit: d98b90ea22b0)
 
 linux/arch/arm/lib/strchr.S             xen/arch/arm/arm32/lib/strchr.S
 linux/arch/arm/lib/strrchr.S            xen/arch/arm/arm32/lib/strrchr.S
@@ -145,7 +144,7 @@  clear_page == memset
 
 ---------------------------------------------------------------------
 
-libgcc: last sync @ v3.14-rc7 (last commit: 01885bc)
+libgcc: last sync @ v3.16-rc6 (last commit: 01885bc)
 
 linux/arch/arm/lib/lib1funcs.S          xen/arch/arm/arm32/lib/lib1funcs.S
 linux/arch/arm/lib/lshrdi3.S            xen/arch/arm/arm32/lib/lshrdi3.S
diff --git a/xen/arch/arm/arm32/lib/assembler.h b/xen/arch/arm/arm32/lib/assembler.h
index f8d4b3a..6de2638 100644
--- a/xen/arch/arm/arm32/lib/assembler.h
+++ b/xen/arch/arm/arm32/lib/assembler.h
@@ -36,8 +36,8 @@ 
  * Endian independent macros for shifting bytes within registers.
  */
 #ifndef __ARMEB__
-#define pull            lsr
-#define push            lsl
+#define lspull          lsr
+#define lspush          lsl
 #define get_byte_0      lsl #0
 #define get_byte_1	lsr #8
 #define get_byte_2	lsr #16
@@ -47,8 +47,8 @@ 
 #define put_byte_2	lsl #16
 #define put_byte_3	lsl #24
 #else
-#define pull            lsl
-#define push            lsr
+#define lspull          lsl
+#define lspush          lsr
 #define get_byte_0	lsr #24
 #define get_byte_1	lsr #16
 #define get_byte_2	lsr #8
diff --git a/xen/arch/arm/arm32/lib/bitops.h b/xen/arch/arm/arm32/lib/bitops.h
index 25784c3..a167c2d 100644
--- a/xen/arch/arm/arm32/lib/bitops.h
+++ b/xen/arch/arm/arm32/lib/bitops.h
@@ -37,6 +37,11 @@  UNWIND(	.fnstart	)
 	add	r1, r1, r0, lsl #2	@ Get word offset
 	mov	r3, r2, lsl r3		@ create mask
 	smp_dmb
+#if __LINUX_ARM_ARCH__ >= 7 && defined(CONFIG_SMP)
+	.arch_extension	mp
+	ALT_SMP(W(pldw)	[r1])
+	ALT_UP(W(nop))
+#endif
 1:	ldrex	r2, [r1]
 	ands	r0, r2, r3		@ save old value of bit
 	\instr	r2, r2, r3		@ toggle bit
diff --git a/xen/arch/arm/arm32/lib/copy_template.S b/xen/arch/arm/arm32/lib/copy_template.S
index 805e3f8..3bc8eb8 100644
--- a/xen/arch/arm/arm32/lib/copy_template.S
+++ b/xen/arch/arm/arm32/lib/copy_template.S
@@ -197,24 +197,24 @@ 
 
 12:	PLD(	pld	[r1, #124]		)
 13:		ldr4w	r1, r4, r5, r6, r7, abort=19f
-		mov	r3, lr, pull #\pull
+		mov	r3, lr, lspull #\pull
 		subs	r2, r2, #32
 		ldr4w	r1, r8, r9, ip, lr, abort=19f
-		orr	r3, r3, r4, push #\push
-		mov	r4, r4, pull #\pull
-		orr	r4, r4, r5, push #\push
-		mov	r5, r5, pull #\pull
-		orr	r5, r5, r6, push #\push
-		mov	r6, r6, pull #\pull
-		orr	r6, r6, r7, push #\push
-		mov	r7, r7, pull #\pull
-		orr	r7, r7, r8, push #\push
-		mov	r8, r8, pull #\pull
-		orr	r8, r8, r9, push #\push
-		mov	r9, r9, pull #\pull
-		orr	r9, r9, ip, push #\push
-		mov	ip, ip, pull #\pull
-		orr	ip, ip, lr, push #\push
+		orr	r3, r3, r4, lspush #\push
+		mov	r4, r4, lspull #\pull
+		orr	r4, r4, r5, lspush #\push
+		mov	r5, r5, lspull #\pull
+		orr	r5, r5, r6, lspush #\push
+		mov	r6, r6, lspull #\pull
+		orr	r6, r6, r7, lspush #\push
+		mov	r7, r7, lspull #\pull
+		orr	r7, r7, r8, lspush #\push
+		mov	r8, r8, lspull #\pull
+		orr	r8, r8, r9, lspush #\push
+		mov	r9, r9, lspull #\pull
+		orr	r9, r9, ip, lspush #\push
+		mov	ip, ip, lspull #\pull
+		orr	ip, ip, lr, lspush #\push
 		str8w	r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f
 		bge	12b
 	PLD(	cmn	r2, #96			)
@@ -225,10 +225,10 @@ 
 14:		ands	ip, r2, #28
 		beq	16f
 
-15:		mov	r3, lr, pull #\pull
+15:		mov	r3, lr, lspull #\pull
 		ldr1w	r1, lr, abort=21f
 		subs	ip, ip, #4
-		orr	r3, r3, lr, push #\push
+		orr	r3, r3, lr, lspush #\push
 		str1w	r0, r3, abort=21f
 		bgt	15b
 	CALGN(	cmp	r2, #0			)
diff --git a/xen/arch/arm/arm32/lib/memmove.S b/xen/arch/arm/arm32/lib/memmove.S
index 4e142b8..18634c3 100644
--- a/xen/arch/arm/arm32/lib/memmove.S
+++ b/xen/arch/arm/arm32/lib/memmove.S
@@ -148,24 +148,24 @@  ENTRY(memmove)
 
 12:	PLD(	pld	[r1, #-128]		)
 13:		ldmdb   r1!, {r7, r8, r9, ip}
-		mov     lr, r3, push #\push
+		mov     lr, r3, lspush #\push
 		subs    r2, r2, #32
 		ldmdb   r1!, {r3, r4, r5, r6}
-		orr     lr, lr, ip, pull #\pull
-		mov     ip, ip, push #\push
-		orr     ip, ip, r9, pull #\pull
-		mov     r9, r9, push #\push
-		orr     r9, r9, r8, pull #\pull
-		mov     r8, r8, push #\push
-		orr     r8, r8, r7, pull #\pull
-		mov     r7, r7, push #\push
-		orr     r7, r7, r6, pull #\pull
-		mov     r6, r6, push #\push
-		orr     r6, r6, r5, pull #\pull
-		mov     r5, r5, push #\push
-		orr     r5, r5, r4, pull #\pull
-		mov     r4, r4, push #\push
-		orr     r4, r4, r3, pull #\pull
+		orr     lr, lr, ip, lspull #\pull
+		mov     ip, ip, lspush #\push
+		orr     ip, ip, r9, lspull #\pull
+		mov     r9, r9, lspush #\push
+		orr     r9, r9, r8, lspull #\pull
+		mov     r8, r8, lspush #\push
+		orr     r8, r8, r7, lspull #\pull
+		mov     r7, r7, lspush #\push
+		orr     r7, r7, r6, lspull #\pull
+		mov     r6, r6, lspush #\push
+		orr     r6, r6, r5, lspull #\pull
+		mov     r5, r5, lspush #\push
+		orr     r5, r5, r4, lspull #\pull
+		mov     r4, r4, lspush #\push
+		orr     r4, r4, r3, lspull #\pull
 		stmdb   r0!, {r4 - r9, ip, lr}
 		bge	12b
 	PLD(	cmn	r2, #96			)
@@ -176,10 +176,10 @@  ENTRY(memmove)
 14:		ands	ip, r2, #28
 		beq	16f
 
-15:		mov     lr, r3, push #\push
+15:		mov     lr, r3, lspush #\push
 		ldr	r3, [r1, #-4]!
 		subs	ip, ip, #4
-		orr	lr, lr, r3, pull #\pull
+		orr	lr, lr, r3, lspull #\pull
 		str	lr, [r0, #-4]!
 		bgt	15b
 	CALGN(	cmp	r2, #0			)
diff --git a/xen/include/asm-arm/arm32/atomic.h b/xen/include/asm-arm/arm32/atomic.h
index 3d601d1..7ec712f 100644
--- a/xen/include/asm-arm/arm32/atomic.h
+++ b/xen/include/asm-arm/arm32/atomic.h
@@ -39,6 +39,7 @@  static inline int atomic_add_return(int i, atomic_t *v)
 	int result;
 
 	smp_mb();
+	prefetchw(&v->counter);
 
 	__asm__ __volatile__("@ atomic_add_return\n"
 "1:	ldrex	%0, [%3]\n"
@@ -78,6 +79,7 @@  static inline int atomic_sub_return(int i, atomic_t *v)
 	int result;
 
 	smp_mb();
+	prefetchw(&v->counter);
 
 	__asm__ __volatile__("@ atomic_sub_return\n"
 "1:	ldrex	%0, [%3]\n"
@@ -100,6 +102,7 @@  static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new)
 	unsigned long res;
 
 	smp_mb();
+	prefetchw(&ptr->counter);
 
 	do {
 		__asm__ __volatile__("@ atomic_cmpxchg\n"
@@ -117,6 +120,35 @@  static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new)
 	return oldval;
 }
 
+static inline int __atomic_add_unless(atomic_t *v, int a, int u)
+{
+	int oldval, newval;
+	unsigned long tmp;
+
+	smp_mb();
+	prefetchw(&v->counter);
+
+	__asm__ __volatile__ ("@ atomic_add_unless\n"
+"1:	ldrex	%0, [%4]\n"
+"	teq	%0, %5\n"
+"	beq	2f\n"
+"	add	%1, %0, %6\n"
+"	strex	%2, %1, [%4]\n"
+"	teq	%2, #0\n"
+"	bne	1b\n"
+"2:"
+	: "=&r" (oldval), "=&r" (newval), "=&r" (tmp), "+Qo" (v->counter)
+	: "r" (&v->counter), "r" (u), "r" (a)
+	: "cc");
+
+	if (oldval != u)
+		smp_mb();
+
+	return oldval;
+}
+
+#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
+
 #define atomic_inc(v)		atomic_add(1, v)
 #define atomic_dec(v)		atomic_sub(1, v)
 
diff --git a/xen/include/asm-arm/arm32/cmpxchg.h b/xen/include/asm-arm/arm32/cmpxchg.h
index 9a511f2..03e0bed 100644
--- a/xen/include/asm-arm/arm32/cmpxchg.h
+++ b/xen/include/asm-arm/arm32/cmpxchg.h
@@ -1,6 +1,8 @@ 
 #ifndef __ASM_ARM32_CMPXCHG_H
 #define __ASM_ARM32_CMPXCHG_H
 
+#include <xen/prefetch.h>
+
 extern void __bad_xchg(volatile void *, int);
 
 static inline unsigned long __xchg(unsigned long x, volatile void *ptr, int size)
@@ -9,6 +11,7 @@  static inline unsigned long __xchg(unsigned long x, volatile void *ptr, int size
 	unsigned int tmp;
 
 	smp_mb();
+	prefetchw((const void *)ptr);
 
 	switch (size) {
 	case 1:
@@ -56,6 +59,8 @@  static always_inline unsigned long __cmpxchg(
 {
 	unsigned long oldval, res;
 
+	prefetchw((const void *)ptr);
+
 	switch (size) {
 	case 1:
 		do {