diff mbox

arm64: barrier: implement wfe-based smp_cond_load_acquire

Message ID 1466169136-28672-1-git-send-email-will.deacon@arm.com
State Superseded
Headers show

Commit Message

Will Deacon June 17, 2016, 1:12 p.m. UTC
smp_cond_load_acquire is used to spin on a variable until some
expression involving that variable becomes true.

On arm64, we can build this using WFE and LDXR, since clearing of the
exclusive monitor as a result of the variable being changed by another
CPU generates an event, which will wake us up out of WFE.

This patch implements smp_cond_load_acquire using LDAXR and WFE, which
themselves are contained in an internal __cmpwait_acquire function.

Signed-off-by: Will Deacon <will.deacon@arm.com>

---

Based on Peter's locking/core branch.

 arch/arm64/include/asm/barrier.h | 13 ++++++++++
 arch/arm64/include/asm/cmpxchg.h | 52 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+)

-- 
2.1.4


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

Comments

Mark Rutland June 17, 2016, 2:27 p.m. UTC | #1
Hi Will,

On Fri, Jun 17, 2016 at 02:12:15PM +0100, Will Deacon wrote:
> smp_cond_load_acquire is used to spin on a variable until some

> expression involving that variable becomes true.

> 

> On arm64, we can build this using WFE and LDXR, since clearing of the

> exclusive monitor as a result of the variable being changed by another

> CPU generates an event, which will wake us up out of WFE.

> 

> This patch implements smp_cond_load_acquire using LDAXR and WFE, which

> themselves are contained in an internal __cmpwait_acquire function.

> 

> Signed-off-by: Will Deacon <will.deacon@arm.com>

> ---

> 

> Based on Peter's locking/core branch.

> 

>  arch/arm64/include/asm/barrier.h | 13 ++++++++++

>  arch/arm64/include/asm/cmpxchg.h | 52 ++++++++++++++++++++++++++++++++++++++++

>  2 files changed, 65 insertions(+)

> 

> diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h

> index dae5c49618db..4bb2b09f8ca9 100644

> --- a/arch/arm64/include/asm/barrier.h

> +++ b/arch/arm64/include/asm/barrier.h

> @@ -91,6 +91,19 @@ do {									\

>  	__u.__val;							\

>  })

>  

> +#define smp_cond_load_acquire(ptr, cond_expr)				\

> +({									\

> +	typeof(ptr) __PTR = (ptr);					\

> +	typeof(*ptr) VAL;						\


I was going to mention that the use of __PTR and VAL looked inconsistent
with the rest of the arm64 routines, but I see that's part of the API for
the use of cond_expr, per [1].

> +	for (;;) {							\

> +		VAL = smp_load_acquire(__PTR);				\

> +		if (cond_expr)						\

> +			break;						\

> +		__cmpwait_acquire(__PTR, VAL);				\

> +	}								\

> +	VAL;								\

> +})

> +

>  #include <asm-generic/barrier.h>

>  

>  #endif	/* __ASSEMBLY__ */

> diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h

> index 510c7b404454..84b83e521edc 100644

> --- a/arch/arm64/include/asm/cmpxchg.h

> +++ b/arch/arm64/include/asm/cmpxchg.h

> @@ -224,4 +224,56 @@ __CMPXCHG_GEN(_mb)

>  	__ret;								\

>  })

>  

> +#define __CMPWAIT_CASE(w, sz, name, acq, cl)				\

> +static inline void __cmpwait_case_##name(volatile void *ptr,		\

> +					 unsigned long val)		\

> +{									\

> +	unsigned long tmp;						\

> +									\

> +	asm volatile(							\

> +	"	ld" #acq "xr" #sz "\t%" #w "[tmp], %[v]\n"		\

> +	"	eor	%" #w "[tmp], %" #w "[tmp], %" #w "[val]\n"	\

> +	"	cbnz	%" #w "[tmp], 1f\n"				\

> +	"	wfe\n"							\

> +	"1:"								\

> +	: [tmp] "=&r" (tmp), [v] "+Q" (*(unsigned long *)ptr)		\

> +	: [val] "r" (val)						\

> +	: cl);								\

> +}

> +

> +__CMPWAIT_CASE(w, b, acq_1, a, "memory");

> +__CMPWAIT_CASE(w, h, acq_2, a, "memory");

> +__CMPWAIT_CASE(w,  , acq_4, a, "memory");

> +__CMPWAIT_CASE( ,  , acq_8, a, "memory");

> +

> +#undef __CMPWAIT_CASE


From my understanding of the intent, I believe that the asm is correct.

I'm guessing from the way this and __CMPWAIT_GEN are parameterised that
there is a plan for variants of this with different ordering semantics,
though I'm having difficulty envisioning them. Perhaps I lack
imagination. ;)

Is there any case for waiting with anything other than acquire
semantics? If not, we could fold the acq parameter and acq_ prefix from
the name parameter.

Do we expect this to be used outside of smp_cond_load_acquire? If not,
can't we rely on the prior smp_load_acquire for the acquire semantics
(and use plain LDXR* here)? Then the acq part can go from the cmpwait
cases entirely.

Is there any case where we wouldn't want the memory clobber?

Thanks,
Mark.

[1] https://git.kernel.org/cgit/linux/kernel/git/peterz/queue.git/commit/?h=locking/core&id=c1b003e1416e46640f6613952da2fe65c5522f4d

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Will Deacon June 17, 2016, 3:38 p.m. UTC | #2
On Fri, Jun 17, 2016 at 03:27:42PM +0100, Mark Rutland wrote:
> On Fri, Jun 17, 2016 at 02:12:15PM +0100, Will Deacon wrote:

> > diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h

> > index 510c7b404454..84b83e521edc 100644

> > --- a/arch/arm64/include/asm/cmpxchg.h

> > +++ b/arch/arm64/include/asm/cmpxchg.h

> > @@ -224,4 +224,56 @@ __CMPXCHG_GEN(_mb)

> >  	__ret;								\

> >  })

> >  

> > +#define __CMPWAIT_CASE(w, sz, name, acq, cl)				\

> > +static inline void __cmpwait_case_##name(volatile void *ptr,		\

> > +					 unsigned long val)		\

> > +{									\

> > +	unsigned long tmp;						\

> > +									\

> > +	asm volatile(							\

> > +	"	ld" #acq "xr" #sz "\t%" #w "[tmp], %[v]\n"		\

> > +	"	eor	%" #w "[tmp], %" #w "[tmp], %" #w "[val]\n"	\

> > +	"	cbnz	%" #w "[tmp], 1f\n"				\

> > +	"	wfe\n"							\

> > +	"1:"								\

> > +	: [tmp] "=&r" (tmp), [v] "+Q" (*(unsigned long *)ptr)		\

> > +	: [val] "r" (val)						\

> > +	: cl);								\

> > +}

> > +

> > +__CMPWAIT_CASE(w, b, acq_1, a, "memory");

> > +__CMPWAIT_CASE(w, h, acq_2, a, "memory");

> > +__CMPWAIT_CASE(w,  , acq_4, a, "memory");

> > +__CMPWAIT_CASE( ,  , acq_8, a, "memory");

> > +

> > +#undef __CMPWAIT_CASE

> 

> From my understanding of the intent, I believe that the asm is correct.


Cheers for having a look.

> I'm guessing from the way this and __CMPWAIT_GEN are parameterised that

> there is a plan for variants of this with different ordering semantics,

> though I'm having difficulty envisioning them. Perhaps I lack

> imagination. ;)


Originally I also had a cmpwait_relaxed implementation, since Peter had
a generic cmpwait utility. That might be resurrected some day, so I
figured leaving the code like this makes it easily extensible to other
memory-ordering semantics without adding much in the way of complexity.

> Is there any case for waiting with anything other than acquire

> semantics? If not, we could fold the acq parameter and acq_ prefix from

> the name parameter.

> 

> Do we expect this to be used outside of smp_cond_load_acquire? If not,

> can't we rely on the prior smp_load_acquire for the acquire semantics

> (and use plain LDXR* here)? Then the acq part can go from the cmpwait

> cases entirely.


Yeah, I think we can rely on the read-after-read ordering here and
use __cmpwait_relaxed to do the job for the inner loop.

> Is there any case where we wouldn't want the memory clobber?


I don't think you'd need it for cmpwait_relaxed, because the CPU could
reorder stuff anyway, so anything the compiler does is potentially futile.
So actually, I can respin this without the clobber. I'll simplify
the __CMPWAIT_CASE macro to drop the last two parameters as well.

Will

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Mark Rutland June 17, 2016, 3:42 p.m. UTC | #3
On Fri, Jun 17, 2016 at 04:38:03PM +0100, Will Deacon wrote:
> On Fri, Jun 17, 2016 at 03:27:42PM +0100, Mark Rutland wrote:

> > On Fri, Jun 17, 2016 at 02:12:15PM +0100, Will Deacon wrote:

> > > diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h

> > > index 510c7b404454..84b83e521edc 100644

> > > --- a/arch/arm64/include/asm/cmpxchg.h

> > > +++ b/arch/arm64/include/asm/cmpxchg.h

> > > @@ -224,4 +224,56 @@ __CMPXCHG_GEN(_mb)

> > >  	__ret;								\

> > >  })

> > >  

> > > +#define __CMPWAIT_CASE(w, sz, name, acq, cl)				\

> > > +static inline void __cmpwait_case_##name(volatile void *ptr,		\

> > > +					 unsigned long val)		\

> > > +{									\

> > > +	unsigned long tmp;						\

> > > +									\

> > > +	asm volatile(							\

> > > +	"	ld" #acq "xr" #sz "\t%" #w "[tmp], %[v]\n"		\

> > > +	"	eor	%" #w "[tmp], %" #w "[tmp], %" #w "[val]\n"	\

> > > +	"	cbnz	%" #w "[tmp], 1f\n"				\

> > > +	"	wfe\n"							\

> > > +	"1:"								\

> > > +	: [tmp] "=&r" (tmp), [v] "+Q" (*(unsigned long *)ptr)		\

> > > +	: [val] "r" (val)						\

> > > +	: cl);								\

> > > +}

> > > +

> > > +__CMPWAIT_CASE(w, b, acq_1, a, "memory");

> > > +__CMPWAIT_CASE(w, h, acq_2, a, "memory");

> > > +__CMPWAIT_CASE(w,  , acq_4, a, "memory");

> > > +__CMPWAIT_CASE( ,  , acq_8, a, "memory");

> > > +

> > > +#undef __CMPWAIT_CASE

> > 

> > From my understanding of the intent, I believe that the asm is correct.

> 

> Cheers for having a look.

> 

> > I'm guessing from the way this and __CMPWAIT_GEN are parameterised that

> > there is a plan for variants of this with different ordering semantics,

> > though I'm having difficulty envisioning them. Perhaps I lack

> > imagination. ;)

> 

> Originally I also had a cmpwait_relaxed implementation, since Peter had

> a generic cmpwait utility. That might be resurrected some day, so I

> figured leaving the code like this makes it easily extensible to other

> memory-ordering semantics without adding much in the way of complexity.


Ok.

> > Is there any case for waiting with anything other than acquire

> > semantics? If not, we could fold the acq parameter and acq_ prefix from

> > the name parameter.

> > 

> > Do we expect this to be used outside of smp_cond_load_acquire? If not,

> > can't we rely on the prior smp_load_acquire for the acquire semantics

> > (and use plain LDXR* here)? Then the acq part can go from the cmpwait

> > cases entirely.

> 

> Yeah, I think we can rely on the read-after-read ordering here and

> use __cmpwait_relaxed to do the job for the inner loop.


Great!
 
> > Is there any case where we wouldn't want the memory clobber?

> 

> I don't think you'd need it for cmpwait_relaxed, because the CPU could

> reorder stuff anyway, so anything the compiler does is potentially futile.

> So actually, I can respin this without the clobber. I'll simplify

> the __CMPWAIT_CASE macro to drop the last two parameters as well.


I assume that means you're only implementing __cmpwait_relaxed for now.

If so, that sounds good to me!

Thanks,
Mark.

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Will Deacon June 17, 2016, 4:04 p.m. UTC | #4
On Fri, Jun 17, 2016 at 05:42:42PM +0200, Peter Zijlstra wrote:
> On Fri, Jun 17, 2016 at 04:38:03PM +0100, Will Deacon wrote:

> > > Is there any case where we wouldn't want the memory clobber?

> > 

> > I don't think you'd need it for cmpwait_relaxed, because the CPU could

> > reorder stuff anyway, so anything the compiler does is potentially futile.

> > So actually, I can respin this without the clobber. I'll simplify

> > the __CMPWAIT_CASE macro to drop the last two parameters as well.

> 

> In my previous patches I promoted cmpwait() as a cpu_relax()

> 'replacement'. Should we not retain the compiler barrier semantics

> because of that?


To be honest, it's never been clear to me what purpose the memory clobber
serves in cpu_relax().

> That is, anywhere you use this, you basically _have_ to reread memory,

> because the very fact that you return from this basically indicates its

> been changed.


Sure, but that re-read is likely to be done with an acquire. I think this
boils down to whether or not cmpwait_relaxed has any use-cases outside of
an acquire-based outer-loop like that in smp_cond_load_acquire.

Will

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
diff mbox

Patch

diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
index dae5c49618db..4bb2b09f8ca9 100644
--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -91,6 +91,19 @@  do {									\
 	__u.__val;							\
 })
 
+#define smp_cond_load_acquire(ptr, cond_expr)				\
+({									\
+	typeof(ptr) __PTR = (ptr);					\
+	typeof(*ptr) VAL;						\
+	for (;;) {							\
+		VAL = smp_load_acquire(__PTR);				\
+		if (cond_expr)						\
+			break;						\
+		__cmpwait_acquire(__PTR, VAL);				\
+	}								\
+	VAL;								\
+})
+
 #include <asm-generic/barrier.h>
 
 #endif	/* __ASSEMBLY__ */
diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h
index 510c7b404454..84b83e521edc 100644
--- a/arch/arm64/include/asm/cmpxchg.h
+++ b/arch/arm64/include/asm/cmpxchg.h
@@ -224,4 +224,56 @@  __CMPXCHG_GEN(_mb)
 	__ret;								\
 })
 
+#define __CMPWAIT_CASE(w, sz, name, acq, cl)				\
+static inline void __cmpwait_case_##name(volatile void *ptr,		\
+					 unsigned long val)		\
+{									\
+	unsigned long tmp;						\
+									\
+	asm volatile(							\
+	"	ld" #acq "xr" #sz "\t%" #w "[tmp], %[v]\n"		\
+	"	eor	%" #w "[tmp], %" #w "[tmp], %" #w "[val]\n"	\
+	"	cbnz	%" #w "[tmp], 1f\n"				\
+	"	wfe\n"							\
+	"1:"								\
+	: [tmp] "=&r" (tmp), [v] "+Q" (*(unsigned long *)ptr)		\
+	: [val] "r" (val)						\
+	: cl);								\
+}
+
+__CMPWAIT_CASE(w, b, acq_1, a, "memory");
+__CMPWAIT_CASE(w, h, acq_2, a, "memory");
+__CMPWAIT_CASE(w,  , acq_4, a, "memory");
+__CMPWAIT_CASE( ,  , acq_8, a, "memory");
+
+#undef __CMPWAIT_CASE
+
+#define __CMPWAIT_GEN(sfx)						\
+static inline void __cmpwait##sfx(volatile void *ptr,			\
+				  unsigned long val,			\
+				  int size)				\
+{									\
+	switch (size) {							\
+	case 1:								\
+		return __cmpwait_case##sfx##_1(ptr, (u8)val);		\
+	case 2:								\
+		return __cmpwait_case##sfx##_2(ptr, (u16)val);		\
+	case 4:								\
+		return __cmpwait_case##sfx##_4(ptr, val);		\
+	case 8:								\
+		return __cmpwait_case##sfx##_8(ptr, val);		\
+	default:							\
+		BUILD_BUG();						\
+	}								\
+									\
+	unreachable();							\
+}
+
+__CMPWAIT_GEN(_acq)
+
+#undef __CMPWAIT_GEN
+
+#define __cmpwait_acquire(ptr, val) \
+	__cmpwait_acq((ptr), (unsigned long)(val), sizeof(*(ptr)))
+
 #endif	/* __ASM_CMPXCHG_H */