diff mbox series

[3/3] rseq/selftests: Add support for arm64

Message ID 1529949285-11013-4-git-send-email-will.deacon@arm.com
State Superseded
Headers show
Series Support rseq on arm64 | expand

Commit Message

Will Deacon June 25, 2018, 5:54 p.m. UTC
Hook up arm64 support to the rseq selftests.

Signed-off-by: Will Deacon <will.deacon@arm.com>

---
 tools/testing/selftests/rseq/param_test.c |  20 +
 tools/testing/selftests/rseq/rseq-arm64.h | 594 ++++++++++++++++++++++++++++++
 tools/testing/selftests/rseq/rseq.h       |   2 +
 3 files changed, 616 insertions(+)
 create mode 100644 tools/testing/selftests/rseq/rseq-arm64.h

-- 
2.1.4

Comments

Mathieu Desnoyers June 25, 2018, 6:10 p.m. UTC | #1
----- On Jun 25, 2018, at 1:54 PM, Will Deacon will.deacon@arm.com wrote:

[...]

> +#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip,		\

> +				post_commit_offset, abort_ip)			\

> +	"	.pushsection	__rseq_table, \"aw\"\n"				\

> +	"	.balign	32\n"							\

> +	__rseq_str(label) ":\n"							\

> +	"	.long	" __rseq_str(version) ", " __rseq_str(flags) "\n"	\

> +	"	.quad	" __rseq_str(start_ip) ", "				\

> +			  __rseq_str(post_commit_offset) ", "			\

> +			  __rseq_str(abort_ip) "\n"				\

> +	"	.popsection\n"

> +

> +#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip)	\

> +	__RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip,			\

> +				(post_commit_ip - start_ip), abort_ip)

> +

> +#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs)			\

> +	RSEQ_INJECT_ASM(1)							\

> +	"	adrp	" RSEQ_ASM_TMP_REG ", " __rseq_str(cs_label) "\n"	\

> +	"	add	" RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG		\

> +			", :lo12:" __rseq_str(cs_label) "\n"			\

> +	"	str	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(rseq_cs) "]\n"	\

> +	__rseq_str(label) ":\n"

> +

> +#define RSEQ_ASM_DEFINE_ABORT(label, abort_label)				\

> +	"	.pushsection	__rseq_failure, \"ax\"\n"			\

> +	"	.long 	"	__rseq_str(RSEQ_SIG) "\n"			\

> +	__rseq_str(label) ":\n"							\

> +	"	b	%l[" __rseq_str(abort_label) "]\n"			\

> +	"	.popsection\n"


Thanks Will for porting rseq to arm64 !

I notice you are using the instructions

  adrp
  add
  str

to implement RSEQ_ASM_STORE_RSEQ_CS(). Did you compare
performance-wise with an approach using a literal pool
near the instruction pointer like I did on arm32 ?

With that approach, this ends up being simply

  adr
  str

which provides significantly better performance on my test
platform over loading a pointer targeting a separate data
section.

Thanks,

Mathieu

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
Will Deacon June 26, 2018, 3:14 p.m. UTC | #2
Hi Mathieu,

On Mon, Jun 25, 2018 at 02:10:10PM -0400, Mathieu Desnoyers wrote:
> ----- On Jun 25, 2018, at 1:54 PM, Will Deacon will.deacon@arm.com wrote:

> > +#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip,		\

> > +				post_commit_offset, abort_ip)			\

> > +	"	.pushsection	__rseq_table, \"aw\"\n"				\

> > +	"	.balign	32\n"							\

> > +	__rseq_str(label) ":\n"							\

> > +	"	.long	" __rseq_str(version) ", " __rseq_str(flags) "\n"	\

> > +	"	.quad	" __rseq_str(start_ip) ", "				\

> > +			  __rseq_str(post_commit_offset) ", "			\

> > +			  __rseq_str(abort_ip) "\n"				\

> > +	"	.popsection\n"

> > +

> > +#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip)	\

> > +	__RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip,			\

> > +				(post_commit_ip - start_ip), abort_ip)

> > +

> > +#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs)			\

> > +	RSEQ_INJECT_ASM(1)							\

> > +	"	adrp	" RSEQ_ASM_TMP_REG ", " __rseq_str(cs_label) "\n"	\

> > +	"	add	" RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG		\

> > +			", :lo12:" __rseq_str(cs_label) "\n"			\

> > +	"	str	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(rseq_cs) "]\n"	\

> > +	__rseq_str(label) ":\n"

> > +

> > +#define RSEQ_ASM_DEFINE_ABORT(label, abort_label)				\

> > +	"	.pushsection	__rseq_failure, \"ax\"\n"			\

> > +	"	.long 	"	__rseq_str(RSEQ_SIG) "\n"			\

> > +	__rseq_str(label) ":\n"							\

> > +	"	b	%l[" __rseq_str(abort_label) "]\n"			\

> > +	"	.popsection\n"

> 

> Thanks Will for porting rseq to arm64 !


That's ok, it was good fun :)

I'm going to chat with our compiler guys to see if there's any room for
improving the flexibility in the critical section, since having a temporary
in the clobber list is pretty grotty.

> I notice you are using the instructions

> 

>   adrp

>   add

>   str

> 

> to implement RSEQ_ASM_STORE_RSEQ_CS(). Did you compare

> performance-wise with an approach using a literal pool

> near the instruction pointer like I did on arm32 ?


I didn't, no. Do you have a benchmark to hand so I can give this a go?
The two reasons I didn't go down this route are:

1. It introduces data which is mapped as executable. I don't have a
   specific security concern here, but the way things have gone so far
   this year, I've realised that I'm not bright enough to anticipate
   these things.

2. It introduces a branch over the table on the fast path, which is likely
   to have a relatively higher branch misprediction cost on more advanced
   CPUs.

I also find it grotty that we emit two tables so that debuggers can cope,
but that's just a cosmetic nit.

> With that approach, this ends up being simply

> 

>   adr

>   str

> 

> which provides significantly better performance on my test

> platform over loading a pointer targeting a separate data

> section.


My understanding is that your test platform is based on Cortex-A7, so I'd
be wary about concluding too much about general performance from that CPU
since it's a pretty straightforward in-order design.

Will
Mathieu Desnoyers June 26, 2018, 4:11 p.m. UTC | #3
----- On Jun 26, 2018, at 11:14 AM, Will Deacon will.deacon@arm.com wrote:

> Hi Mathieu,

> 

> On Mon, Jun 25, 2018 at 02:10:10PM -0400, Mathieu Desnoyers wrote:

>> ----- On Jun 25, 2018, at 1:54 PM, Will Deacon will.deacon@arm.com wrote:

>> > +#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip,		\

>> > +				post_commit_offset, abort_ip)			\

>> > +	"	.pushsection	__rseq_table, \"aw\"\n"				\

>> > +	"	.balign	32\n"							\

>> > +	__rseq_str(label) ":\n"							\

>> > +	"	.long	" __rseq_str(version) ", " __rseq_str(flags) "\n"	\

>> > +	"	.quad	" __rseq_str(start_ip) ", "				\

>> > +			  __rseq_str(post_commit_offset) ", "			\

>> > +			  __rseq_str(abort_ip) "\n"				\

>> > +	"	.popsection\n"

>> > +

>> > +#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip)	\

>> > +	__RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip,			\

>> > +				(post_commit_ip - start_ip), abort_ip)

>> > +

>> > +#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs)			\

>> > +	RSEQ_INJECT_ASM(1)							\

>> > +	"	adrp	" RSEQ_ASM_TMP_REG ", " __rseq_str(cs_label) "\n"	\

>> > +	"	add	" RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG		\

>> > +			", :lo12:" __rseq_str(cs_label) "\n"			\

>> > +	"	str	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(rseq_cs) "]\n"	\

>> > +	__rseq_str(label) ":\n"

>> > +

>> > +#define RSEQ_ASM_DEFINE_ABORT(label, abort_label)				\

>> > +	"	.pushsection	__rseq_failure, \"ax\"\n"			\

>> > +	"	.long 	"	__rseq_str(RSEQ_SIG) "\n"			\

>> > +	__rseq_str(label) ":\n"							\

>> > +	"	b	%l[" __rseq_str(abort_label) "]\n"			\

>> > +	"	.popsection\n"

>> 

>> Thanks Will for porting rseq to arm64 !

> 

> That's ok, it was good fun :)

> 

> I'm going to chat with our compiler guys to see if there's any room for

> improving the flexibility in the critical section, since having a temporary

> in the clobber list is pretty grotty.


Let me know how it goes!

> 

>> I notice you are using the instructions

>> 

>>   adrp

>>   add

>>   str

>> 

>> to implement RSEQ_ASM_STORE_RSEQ_CS(). Did you compare

>> performance-wise with an approach using a literal pool

>> near the instruction pointer like I did on arm32 ?

> 

> I didn't, no. Do you have a benchmark to hand so I can give this a go?


see tools/testing/selftests/rseq/param_test_benchmark --help

It's a stripped-down version of param_test, without all the code for
delay loops and testing checks.

Example use for counter increment with 4 threads, doing 5G counter
increments per thread:

time ./param_test_benchmark -T i -t 4 -r 5000000000

> The two reasons I didn't go down this route are:

> 

> 1. It introduces data which is mapped as executable. I don't have a

>   specific security concern here, but the way things have gone so far

>   this year, I've realised that I'm not bright enough to anticipate

>   these things.


So far I've been able to dig up that "pure code" or "execute only" code
is explicitly requested by compiler flags (-mno-pc-relative-literal-loads
on aarch64, -mpure-code on arm32 when the moon cycle is aligned). It's a
shame that it's not more standard, or that there does not appear to be any
preprocessor define available to test this within code.

I'm all for allowing end users to choose whether they want to use literal
pools in code or not, but I think it should be configurable at compile
time, and we should make it similar on arm32 and arm64. Given that compilers
don't emit a preprocessor define, perhaps we need to introduce our own
RSEQ_NO_PC_RELATIVE_LITERAL_LOADS (or perhaps a shorter name ?) define to
select behavior at compile-time.

> 2. It introduces a branch over the table on the fast path, which is likely

>   to have a relatively higher branch misprediction cost on more advanced

>   CPUs.


Hrm, wait a second... I see that your comparison of the cpu number requires:

+#define RSEQ_ASM_OP_CMPEQ32(var, expect, label)                                        \
+        "        ldr        " RSEQ_ASM_TMP_REG32 ", %[" __rseq_str(var) "]\n"        \
+        "        sub        " RSEQ_ASM_TMP_REG32 ", " RSEQ_ASM_TMP_REG32                \
+                        ", %w[" __rseq_str(expect) "]\n"                        \
+        "        cbnz        " RSEQ_ASM_TMP_REG32 ", " __rseq_str(label) "\n"

because the abort code is emitted in a separate section:

+#define RSEQ_ASM_DEFINE_ABORT(label, abort_label)                                \
+        "        .pushsection        __rseq_failure, \"ax\"\n"                        \
+        "        .long         "        __rseq_str(RSEQ_SIG) "\n"                        \
+        __rseq_str(label) ":\n"                                                        \
+        "        b        %l[" __rseq_str(abort_label) "]\n"                        \
+        "        .popsection\n"

Like I did on x86. But the cbnz instruction requires the branch target to be
within +/- 1MB from the instruction (http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0024a/ch06s04.html),
which clearly is not guaranteed when you place the abort label in a separate
section.

Also, using cbnz to jump to a label that is outside of the assembly
(e.g. %l[error1]) does not ensure that the branch target is within
1MB of the code.

I've had assembler issues on arm32 due to those kinds of constraints
when integrating rseq headers into larger code-bases.

So, one way to fix the fast-path so cpu number comparison can branch
to a close location is to put the abort code near the fast-path, and
you end up having to unconditionally jump over the abort code from
the fast-path on success. So once you bite the bullet and jump over
abort, you just have to ensure you place the struct rseq_cs data
near the abort code, so you end up jumping over both at the same time.

> 

> I also find it grotty that we emit two tables so that debuggers can cope,

> but that's just a cosmetic nit.

> 

>> With that approach, this ends up being simply

>> 

>>   adr

>>   str

>> 

>> which provides significantly better performance on my test

>> platform over loading a pointer targeting a separate data

>> section.

> 

> My understanding is that your test platform is based on Cortex-A7, so I'd

> be wary about concluding too much about general performance from that CPU

> since its a pretty straightforward in-order design.


I did benchmarks on our Wandboard (Cortex A9) as well as the Cubietruck. I
could only use perf to do a detailed breakdown of the fast-path overhead on
the Cubie because I could not get it to work on our Wandboard, but overall
speed was better on Wandboard as well (as I recall) with the literal pool.

Thanks,

Mathieu


-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
Will Deacon June 28, 2018, 4:47 p.m. UTC | #4
Hi Mathieu,

On Tue, Jun 26, 2018 at 12:11:52PM -0400, Mathieu Desnoyers wrote:
> ----- On Jun 26, 2018, at 11:14 AM, Will Deacon will.deacon@arm.com wrote:

> > On Mon, Jun 25, 2018 at 02:10:10PM -0400, Mathieu Desnoyers wrote:

> >> I notice you are using the instructions

> >> 

> >>   adrp

> >>   add

> >>   str

> >> 

> >> to implement RSEQ_ASM_STORE_RSEQ_CS(). Did you compare

> >> performance-wise with an approach using a literal pool

> >> near the instruction pointer like I did on arm32 ?

> > 

> > I didn't, no. Do you have a benchmark to hand so I can give this a go?

> 

> see tools/testing/selftests/rseq/param_test_benchmark --help

> 

> It's a stripped-down version of param_test, without all the code for

> delay loops and testing checks.

> 

> Example use for counter increment with 4 threads, doing 5G counter

> increments per thread:

> 

> time ./param_test_benchmark -T i -t 4 -r 5000000000


Thanks. I ran that on a few arm64 systems I have access to, with three
configurations of the selftest:

1. As I posted
2. With the abort signature and branch in-lined, so as to avoid the CBNZ
   address limitations in large codebases
3. With both the abort handler and the table inlined (i.e. the same thing
   as 32-bit).

There isn't a reliably measurable difference between (1) and (2), but I take
a hit of between 12% and 27% when going from (2) to (3).

So I'll post a v2 based on (2).

Will
Mathieu Desnoyers June 28, 2018, 8:50 p.m. UTC | #5
----- On Jun 28, 2018, at 12:47 PM, Will Deacon will.deacon@arm.com wrote:

> Hi Mathieu,

> 

> On Tue, Jun 26, 2018 at 12:11:52PM -0400, Mathieu Desnoyers wrote:

>> ----- On Jun 26, 2018, at 11:14 AM, Will Deacon will.deacon@arm.com wrote:

>> > On Mon, Jun 25, 2018 at 02:10:10PM -0400, Mathieu Desnoyers wrote:

>> >> I notice you are using the instructions

>> >> 

>> >>   adrp

>> >>   add

>> >>   str

>> >> 

>> >> to implement RSEQ_ASM_STORE_RSEQ_CS(). Did you compare

>> >> performance-wise with an approach using a literal pool

>> >> near the instruction pointer like I did on arm32 ?

>> > 

>> > I didn't, no. Do you have a benchmark to hand so I can give this a go?

>> 

>> see tools/testing/selftests/rseq/param_test_benchmark --help

>> 

>> It's a stripped-down version of param_test, without all the code for

>> delay loops and testing checks.

>> 

>> Example use for counter increment with 4 threads, doing 5G counter

>> increments per thread:

>> 

>> time ./param_test_benchmark -T i -t 4 -r 5000000000

> 

> Thanks. I ran that on a few arm64 systems I have access to, with three

> configurations of the selftest:

> 

> 1. As I posted

> 2. With the abort signature and branch in-lined, so as to avoid the CBNZ

>   address limitations in large codebases

> 3. With both the abort handler and the table inlined (i.e. the same thing

>   as 32-bit).

> 

> There isn't a reliably measurable difference between (1) and (2), but I take

> between 12% and 27% hit between (2) and (3).


Those results puzzle me. Do you have the actual code snippets of each
implementation nearby ?

Thanks,

Mathieu

> 

> So I'll post a v2 based on (2).

> 

> Will


-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
Will Deacon July 2, 2018, 4:49 p.m. UTC | #6
On Thu, Jun 28, 2018 at 04:50:40PM -0400, Mathieu Desnoyers wrote:
> ----- On Jun 28, 2018, at 12:47 PM, Will Deacon will.deacon@arm.com wrote:

> > On Tue, Jun 26, 2018 at 12:11:52PM -0400, Mathieu Desnoyers wrote:

> >> ----- On Jun 26, 2018, at 11:14 AM, Will Deacon will.deacon@arm.com wrote:

> >> > On Mon, Jun 25, 2018 at 02:10:10PM -0400, Mathieu Desnoyers wrote:

> >> >> I notice you are using the instructions

> >> >> 

> >> >>   adrp

> >> >>   add

> >> >>   str

> >> >> 

> >> >> to implement RSEQ_ASM_STORE_RSEQ_CS(). Did you compare

> >> >> performance-wise with an approach using a literal pool

> >> >> near the instruction pointer like I did on arm32 ?

> >> > 

> >> > I didn't, no. Do you have a benchmark to hand so I can give this a go?

> >> 

> >> see tools/testing/selftests/rseq/param_test_benchmark --help

> >> 

> >> It's a stripped-down version of param_test, without all the code for

> >> delay loops and testing checks.

> >> 

> >> Example use for counter increment with 4 threads, doing 5G counter

> >> increments per thread:

> >> 

> >> time ./param_test_benchmark -T i -t 4 -r 5000000000

> > 

> > Thanks. I ran that on a few arm64 systems I have access to, with three

> > configurations of the selftest:

> > 

> > 1. As I posted

> > 2. With the abort signature and branch in-lined, so as to avoid the CBNZ

> >   address limitations in large codebases

> > 3. With both the abort handler and the table inlined (i.e. the same thing

> >   as 32-bit).

> > 

> > There isn't a reliably measurable difference between (1) and (2), but I take

> > between 12% and 27% hit between (2) and (3).

> 

> Those results puzzle me. Do you have the actual code snippets of each

> implementation nearby ?


Sure, I've included the diffs for (2) and (3) below. They both apply on top
of my branch at:

git://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git rseq

Will

--->8

diff --git a/tools/testing/selftests/rseq/rseq-arm64.h b/tools/testing/selftests/rseq/rseq-arm64.h
index 599788f74137..954f34671ca6 100644
--- a/tools/testing/selftests/rseq/rseq-arm64.h
+++ b/tools/testing/selftests/rseq/rseq-arm64.h
@@ -104,11 +104,11 @@ do {										\
 	__rseq_str(label) ":\n"
 
 #define RSEQ_ASM_DEFINE_ABORT(label, abort_label)				\
-	"	.pushsection	__rseq_failure, \"ax\"\n"			\
-	"	.long 	"	__rseq_str(RSEQ_SIG) "\n"			\
+	"	b	222f\n"							\
+	"	.inst 	"	__rseq_str(RSEQ_SIG) "\n"			\
 	__rseq_str(label) ":\n"							\
 	"	b	%l[" __rseq_str(abort_label) "]\n"			\
-	"	.popsection\n"
+	"222:\n"
 
 #define RSEQ_ASM_OP_STORE(value, var)						\
 	"	str	%[" __rseq_str(value) "], %[" __rseq_str(var) "]\n"

--->8

diff --git a/tools/testing/selftests/rseq/rseq-arm64.h b/tools/testing/selftests/rseq/rseq-arm64.h
index 599788f74137..2554aa17acf3 100644
--- a/tools/testing/selftests/rseq/rseq-arm64.h
+++ b/tools/testing/selftests/rseq/rseq-arm64.h
@@ -80,35 +80,37 @@ do {										\
 #define RSEQ_ASM_TMP_REG	"x15"
 #define RSEQ_ASM_TMP_REG_2	"x14"
 
-#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip,		\
+#define __RSEQ_ASM_DEFINE_TABLE(version, flags, start_ip,			\
 				post_commit_offset, abort_ip)			\
-	"	.pushsection	__rseq_table, \"aw\"\n"				\
-	"	.balign	32\n"							\
-	__rseq_str(label) ":\n"							\
 	"	.long	" __rseq_str(version) ", " __rseq_str(flags) "\n"	\
 	"	.quad	" __rseq_str(start_ip) ", "				\
 			  __rseq_str(post_commit_offset) ", "			\
-			  __rseq_str(abort_ip) "\n"				\
-	"	.popsection\n"
+			  __rseq_str(abort_ip) "\n"
 
-#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip)	\
-	__RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip,			\
-				(post_commit_ip - start_ip), abort_ip)
+#define RSEQ_ASM_DEFINE_TABLE(start_ip, post_commit_ip, abort_ip)		\
+	"	.pushsection	__rseq_table, \"aw\"\n"				\
+	"	.balign	32\n"							\
+	__RSEQ_ASM_DEFINE_TABLE(0x0, 0x0, start_ip,				\
+				(post_commit_ip - start_ip), abort_ip)		\
+	"	.popsection\n"
 
-#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs)			\
+#define RSEQ_ASM_STORE_RSEQ_CS(label, table_label, rseq_cs)			\
 	RSEQ_INJECT_ASM(1)							\
-	"	adrp	" RSEQ_ASM_TMP_REG ", " __rseq_str(cs_label) "\n"	\
-	"	add	" RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG		\
-			", :lo12:" __rseq_str(cs_label) "\n"			\
+	"	adr	" RSEQ_ASM_TMP_REG ", " __rseq_str(table_label) "\n"	\
 	"	str	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(rseq_cs) "]\n"	\
 	__rseq_str(label) ":\n"
 
-#define RSEQ_ASM_DEFINE_ABORT(label, abort_label)				\
-	"	.pushsection	__rseq_failure, \"ax\"\n"			\
-	"	.long 	"	__rseq_str(RSEQ_SIG) "\n"			\
+#define RSEQ_ASM_DEFINE_ABORT(table_label, start_ip, post_commit_ip, label,	\
+			      abort_label)					\
+	"	b	222f\n"							\
+	"	.balign 32\n"							\
+	__rseq_str(table_label) ":\n"						\
+	__RSEQ_ASM_DEFINE_TABLE(0x0, 0x0, start_ip,				\
+				(post_commit_ip - start_ip), label ## f)	\
+	"	.inst 	"	__rseq_str(RSEQ_SIG) "\n"			\
 	__rseq_str(label) ":\n"							\
 	"	b	%l[" __rseq_str(abort_label) "]\n"			\
-	"	.popsection\n"
+	"222:\n"
 
 #define RSEQ_ASM_OP_STORE(value, var)						\
 	"	str	%[" __rseq_str(value) "], %[" __rseq_str(var) "]\n"
@@ -181,8 +183,8 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
 	RSEQ_INJECT_C(9)
 
 	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
 		RSEQ_INJECT_ASM(3)
 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
@@ -191,9 +193,9 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
 #endif
-		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
 		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
 		: /* gcc asm goto does not allow outputs */
 		: [cpu_id]		"r" (cpu),
 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
@@ -230,8 +232,8 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
 	RSEQ_INJECT_C(9)
 
 	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
 		RSEQ_INJECT_ASM(3)
 		RSEQ_ASM_OP_CMPNE(v, expectnot, %l[cmpfail])
@@ -243,9 +245,9 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
 		RSEQ_ASM_OP_R_LOAD(v)
 		RSEQ_ASM_OP_R_STORE(load)
 		RSEQ_ASM_OP_R_LOAD_OFF(voffp)
-		RSEQ_ASM_OP_R_FINAL_STORE(v, 3)
+		RSEQ_ASM_OP_R_FINAL_STORE(v, 2)
 		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
 		: /* gcc asm goto does not allow outputs */
 		: [cpu_id]		"r" (cpu),
 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
@@ -281,8 +283,8 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
 	RSEQ_INJECT_C(9)
 
 	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
 		RSEQ_INJECT_ASM(3)
 #ifdef RSEQ_COMPARE_TWICE
@@ -290,9 +292,9 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
 #endif
 		RSEQ_ASM_OP_R_LOAD(v)
 		RSEQ_ASM_OP_R_ADD(count)
-		RSEQ_ASM_OP_R_FINAL_STORE(v, 3)
+		RSEQ_ASM_OP_R_FINAL_STORE(v, 2)
 		RSEQ_INJECT_ASM(4)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
 		: /* gcc asm goto does not allow outputs */
 		: [cpu_id]		"r" (cpu),
 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
@@ -324,8 +326,8 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
 	RSEQ_INJECT_C(9)
 
 	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
 		RSEQ_INJECT_ASM(3)
 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
@@ -336,9 +338,9 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
 #endif
 		RSEQ_ASM_OP_STORE(newv2, v2)
 		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
 		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
 		: /* gcc asm goto does not allow outputs */
 		: [cpu_id]		"r" (cpu),
 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
@@ -378,8 +380,8 @@ int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
 	RSEQ_INJECT_C(9)
 
 	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
 		RSEQ_INJECT_ASM(3)
 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
@@ -390,9 +392,9 @@ int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
 #endif
 		RSEQ_ASM_OP_STORE(newv2, v2)
 		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 3)
+		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 2)
 		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
 		: /* gcc asm goto does not allow outputs */
 		: [cpu_id]		"r" (cpu),
 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
@@ -432,8 +434,8 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
 	RSEQ_INJECT_C(9)
 
 	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
 		RSEQ_INJECT_ASM(3)
 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
@@ -445,9 +447,9 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
 		RSEQ_ASM_OP_CMPEQ(v2, expect2, %l[error3])
 #endif
-		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
 		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
 		: /* gcc asm goto does not allow outputs */
 		: [cpu_id]		"r" (cpu),
 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
@@ -489,8 +491,8 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
 	RSEQ_INJECT_C(9)
 
 	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
 		RSEQ_INJECT_ASM(3)
 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
@@ -501,9 +503,9 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
 #endif
 		RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len)
 		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
 		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
 		: /* gcc asm goto does not allow outputs */
 		: [cpu_id]		"r" (cpu),
 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
@@ -544,8 +546,8 @@ int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
 	RSEQ_INJECT_C(9)
 
 	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
 		RSEQ_INJECT_ASM(3)
 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
@@ -556,9 +558,9 @@ int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
 #endif
 		RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len)
 		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 3)
+		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 2)
 		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
 		: /* gcc asm goto does not allow outputs */
 		: [cpu_id]		"r" (cpu),
 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
Mathieu Desnoyers July 2, 2018, 5:47 p.m. UTC | #7
----- On Jul 2, 2018, at 12:49 PM, Will Deacon will.deacon@arm.com wrote:

> On Thu, Jun 28, 2018 at 04:50:40PM -0400, Mathieu Desnoyers wrote:

>> ----- On Jun 28, 2018, at 12:47 PM, Will Deacon will.deacon@arm.com wrote:

>> > On Tue, Jun 26, 2018 at 12:11:52PM -0400, Mathieu Desnoyers wrote:

>> >> ----- On Jun 26, 2018, at 11:14 AM, Will Deacon will.deacon@arm.com wrote:

>> >> > On Mon, Jun 25, 2018 at 02:10:10PM -0400, Mathieu Desnoyers wrote:

>> >> >> I notice you are using the instructions

>> >> >> 

>> >> >>   adrp

>> >> >>   add

>> >> >>   str

>> >> >> 

>> >> >> to implement RSEQ_ASM_STORE_RSEQ_CS(). Did you compare

>> >> >> performance-wise with an approach using a literal pool

>> >> >> near the instruction pointer like I did on arm32 ?

>> >> > 

>> >> > I didn't, no. Do you have a benchmark to hand so I can give this a go?

>> >> 

>> >> see tools/testing/selftests/rseq/param_test_benchmark --help

>> >> 

>> >> It's a stripped-down version of param_test, without all the code for

>> >> delay loops and testing checks.

>> >> 

>> >> Example use for counter increment with 4 threads, doing 5G counter

>> >> increments per thread:

>> >> 

>> >> time ./param_test_benchmark -T i -t 4 -r 5000000000

>> > 

>> > Thanks. I ran that on a few arm64 systems I have access to, with three

>> > configurations of the selftest:

>> > 

>> > 1. As I posted

>> > 2. With the abort signature and branch in-lined, so as to avoid the CBNZ

>> >   address limitations in large codebases

>> > 3. With both the abort handler and the table inlined (i.e. the same thing

>> >   as 32-bit).

>> > 

>> > There isn't a reliably measurable difference between (1) and (2), but I take

>> > between 12% and 27% hit between (2) and (3).

>> 

>> Those results puzzle me. Do you have the actual code snippets of each

>> implementation nearby ?

> 

> Sure, I've included the diffs for (2) and (3) below. They both apply on top

> of my branch at:

> 

> git://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git rseq

> 

> Will


I figured out that ADRP+ADD are optimized on Cortex A57 to have a 1 cycle
latency. This would explain why they are doing comparatively well compared
to ADR.

And I guess having more compact code wins here.

So I'm OK with your patchset with the modification for (2), which ensures
the abort label is not too far away on large code-bases.

Thanks!

Mathieu

> 

> --->8

> 

> diff --git a/tools/testing/selftests/rseq/rseq-arm64.h

> b/tools/testing/selftests/rseq/rseq-arm64.h

> index 599788f74137..954f34671ca6 100644

> --- a/tools/testing/selftests/rseq/rseq-arm64.h

> +++ b/tools/testing/selftests/rseq/rseq-arm64.h

> @@ -104,11 +104,11 @@ do {										\

> 	__rseq_str(label) ":\n"

> 

> #define RSEQ_ASM_DEFINE_ABORT(label, abort_label)				\

> -	"	.pushsection	__rseq_failure, \"ax\"\n"			\

> -	"	.long 	"	__rseq_str(RSEQ_SIG) "\n"			\

> +	"	b	222f\n"							\

> +	"	.inst 	"	__rseq_str(RSEQ_SIG) "\n"			\

> 	__rseq_str(label) ":\n"							\

> 	"	b	%l[" __rseq_str(abort_label) "]\n"			\

> -	"	.popsection\n"

> +	"222:\n"

> 

> #define RSEQ_ASM_OP_STORE(value, var)						\

> 	"	str	%[" __rseq_str(value) "], %[" __rseq_str(var) "]\n"

> 

> --->8

> 

> diff --git a/tools/testing/selftests/rseq/rseq-arm64.h

> b/tools/testing/selftests/rseq/rseq-arm64.h

> index 599788f74137..2554aa17acf3 100644

> --- a/tools/testing/selftests/rseq/rseq-arm64.h

> +++ b/tools/testing/selftests/rseq/rseq-arm64.h

> @@ -80,35 +80,37 @@ do {										\

> #define RSEQ_ASM_TMP_REG	"x15"

> #define RSEQ_ASM_TMP_REG_2	"x14"

> 

> -#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip,		\

> +#define __RSEQ_ASM_DEFINE_TABLE(version, flags, start_ip,			\

> 				post_commit_offset, abort_ip)			\

> -	"	.pushsection	__rseq_table, \"aw\"\n"				\

> -	"	.balign	32\n"							\

> -	__rseq_str(label) ":\n"							\

> 	"	.long	" __rseq_str(version) ", " __rseq_str(flags) "\n"	\

> 	"	.quad	" __rseq_str(start_ip) ", "				\

> 			  __rseq_str(post_commit_offset) ", "			\

> -			  __rseq_str(abort_ip) "\n"				\

> -	"	.popsection\n"

> +			  __rseq_str(abort_ip) "\n"

> 

> -#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip)	\

> -	__RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip,			\

> -				(post_commit_ip - start_ip), abort_ip)

> +#define RSEQ_ASM_DEFINE_TABLE(start_ip, post_commit_ip, abort_ip)		\

> +	"	.pushsection	__rseq_table, \"aw\"\n"				\

> +	"	.balign	32\n"							\

> +	__RSEQ_ASM_DEFINE_TABLE(0x0, 0x0, start_ip,				\

> +				(post_commit_ip - start_ip), abort_ip)		\

> +	"	.popsection\n"

> 

> -#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs)			\

> +#define RSEQ_ASM_STORE_RSEQ_CS(label, table_label, rseq_cs)			\

> 	RSEQ_INJECT_ASM(1)							\

> -	"	adrp	" RSEQ_ASM_TMP_REG ", " __rseq_str(cs_label) "\n"	\

> -	"	add	" RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG		\

> -			", :lo12:" __rseq_str(cs_label) "\n"			\

> +	"	adr	" RSEQ_ASM_TMP_REG ", " __rseq_str(table_label) "\n"	\

> 	"	str	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(rseq_cs) "]\n"	\

> 	__rseq_str(label) ":\n"

> 

> -#define RSEQ_ASM_DEFINE_ABORT(label, abort_label)				\

> -	"	.pushsection	__rseq_failure, \"ax\"\n"			\

> -	"	.long 	"	__rseq_str(RSEQ_SIG) "\n"			\

> +#define RSEQ_ASM_DEFINE_ABORT(table_label, start_ip, post_commit_ip, label,	\

> +			      abort_label)					\

> +	"	b	222f\n"							\

> +	"	.balign 32\n"							\

> +	__rseq_str(table_label) ":\n"						\

> +	__RSEQ_ASM_DEFINE_TABLE(0x0, 0x0, start_ip,				\

> +				(post_commit_ip - start_ip), label ## f)	\

> +	"	.inst 	"	__rseq_str(RSEQ_SIG) "\n"			\

> 	__rseq_str(label) ":\n"							\

> 	"	b	%l[" __rseq_str(abort_label) "]\n"			\

> -	"	.popsection\n"

> +	"222:\n"

> 

> #define RSEQ_ASM_OP_STORE(value, var)						\

> 	"	str	%[" __rseq_str(value) "], %[" __rseq_str(var) "]\n"

> @@ -181,8 +183,8 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect,

> intptr_t newv, int cpu)

> 	RSEQ_INJECT_C(9)

> 

> 	__asm__ __volatile__ goto (

> -		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)

> -		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)

> +		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)

> +		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)

> 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)

> 		RSEQ_INJECT_ASM(3)

> 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])

> @@ -191,9 +193,9 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect,

> intptr_t newv, int cpu)

> 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])

> 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])

> #endif

> -		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)

> +		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)

> 		RSEQ_INJECT_ASM(5)

> -		RSEQ_ASM_DEFINE_ABORT(4, abort)

> +		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)

> 		: /* gcc asm goto does not allow outputs */

> 		: [cpu_id]		"r" (cpu),

> 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),

> @@ -230,8 +232,8 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t

> expectnot,

> 	RSEQ_INJECT_C(9)

> 

> 	__asm__ __volatile__ goto (

> -		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)

> -		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)

> +		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)

> +		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)

> 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)

> 		RSEQ_INJECT_ASM(3)

> 		RSEQ_ASM_OP_CMPNE(v, expectnot, %l[cmpfail])

> @@ -243,9 +245,9 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t

> expectnot,

> 		RSEQ_ASM_OP_R_LOAD(v)

> 		RSEQ_ASM_OP_R_STORE(load)

> 		RSEQ_ASM_OP_R_LOAD_OFF(voffp)

> -		RSEQ_ASM_OP_R_FINAL_STORE(v, 3)

> +		RSEQ_ASM_OP_R_FINAL_STORE(v, 2)

> 		RSEQ_INJECT_ASM(5)

> -		RSEQ_ASM_DEFINE_ABORT(4, abort)

> +		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)

> 		: /* gcc asm goto does not allow outputs */

> 		: [cpu_id]		"r" (cpu),

> 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),

> @@ -281,8 +283,8 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)

> 	RSEQ_INJECT_C(9)

> 

> 	__asm__ __volatile__ goto (

> -		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)

> -		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)

> +		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)

> +		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)

> 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)

> 		RSEQ_INJECT_ASM(3)

> #ifdef RSEQ_COMPARE_TWICE

> @@ -290,9 +292,9 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)

> #endif

> 		RSEQ_ASM_OP_R_LOAD(v)

> 		RSEQ_ASM_OP_R_ADD(count)

> -		RSEQ_ASM_OP_R_FINAL_STORE(v, 3)

> +		RSEQ_ASM_OP_R_FINAL_STORE(v, 2)

> 		RSEQ_INJECT_ASM(4)

> -		RSEQ_ASM_DEFINE_ABORT(4, abort)

> +		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)

> 		: /* gcc asm goto does not allow outputs */

> 		: [cpu_id]		"r" (cpu),

> 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),

> @@ -324,8 +326,8 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t

> expect,

> 	RSEQ_INJECT_C(9)

> 

> 	__asm__ __volatile__ goto (

> -		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)

> -		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)

> +		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)

> +		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)

> 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)

> 		RSEQ_INJECT_ASM(3)

> 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])

> @@ -336,9 +338,9 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t

> expect,

> #endif

> 		RSEQ_ASM_OP_STORE(newv2, v2)

> 		RSEQ_INJECT_ASM(5)

> -		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)

> +		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)

> 		RSEQ_INJECT_ASM(6)

> -		RSEQ_ASM_DEFINE_ABORT(4, abort)

> +		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)

> 		: /* gcc asm goto does not allow outputs */

> 		: [cpu_id]		"r" (cpu),

> 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),

> @@ -378,8 +380,8 @@ int rseq_cmpeqv_trystorev_storev_release(intptr_t *v,

> intptr_t expect,

> 	RSEQ_INJECT_C(9)

> 

> 	__asm__ __volatile__ goto (

> -		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)

> -		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)

> +		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)

> +		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)

> 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)

> 		RSEQ_INJECT_ASM(3)

> 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])

> @@ -390,9 +392,9 @@ int rseq_cmpeqv_trystorev_storev_release(intptr_t *v,

> intptr_t expect,

> #endif

> 		RSEQ_ASM_OP_STORE(newv2, v2)

> 		RSEQ_INJECT_ASM(5)

> -		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 3)

> +		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 2)

> 		RSEQ_INJECT_ASM(6)

> -		RSEQ_ASM_DEFINE_ABORT(4, abort)

> +		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)

> 		: /* gcc asm goto does not allow outputs */

> 		: [cpu_id]		"r" (cpu),

> 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),

> @@ -432,8 +434,8 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,

> 	RSEQ_INJECT_C(9)

> 

> 	__asm__ __volatile__ goto (

> -		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)

> -		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)

> +		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)

> +		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)

> 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)

> 		RSEQ_INJECT_ASM(3)

> 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])

> @@ -445,9 +447,9 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,

> 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])

> 		RSEQ_ASM_OP_CMPEQ(v2, expect2, %l[error3])

> #endif

> -		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)

> +		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)

> 		RSEQ_INJECT_ASM(6)

> -		RSEQ_ASM_DEFINE_ABORT(4, abort)

> +		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)

> 		: /* gcc asm goto does not allow outputs */

> 		: [cpu_id]		"r" (cpu),

> 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),

> @@ -489,8 +491,8 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t

> expect,

> 	RSEQ_INJECT_C(9)

> 

> 	__asm__ __volatile__ goto (

> -		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)

> -		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)

> +		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)

> +		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)

> 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)

> 		RSEQ_INJECT_ASM(3)

> 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])

> @@ -501,9 +503,9 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t

> expect,

> #endif

> 		RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len)

> 		RSEQ_INJECT_ASM(5)

> -		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)

> +		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)

> 		RSEQ_INJECT_ASM(6)

> -		RSEQ_ASM_DEFINE_ABORT(4, abort)

> +		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)

> 		: /* gcc asm goto does not allow outputs */

> 		: [cpu_id]		"r" (cpu),

> 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),

> @@ -544,8 +546,8 @@ int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v,

> intptr_t expect,

> 	RSEQ_INJECT_C(9)

> 

> 	__asm__ __volatile__ goto (

> -		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)

> -		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)

> +		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)

> +		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)

> 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)

> 		RSEQ_INJECT_ASM(3)

> 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])

> @@ -556,9 +558,9 @@ int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v,

> intptr_t expect,

> #endif

> 		RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len)

> 		RSEQ_INJECT_ASM(5)

> -		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 3)

> +		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 2)

> 		RSEQ_INJECT_ASM(6)

> -		RSEQ_ASM_DEFINE_ABORT(4, abort)

> +		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)

> 		: /* gcc asm goto does not allow outputs */

> 		: [cpu_id]		"r" (cpu),

>  		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),


-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
diff mbox series

Patch

diff --git a/tools/testing/selftests/rseq/param_test.c b/tools/testing/selftests/rseq/param_test.c
index 615252331813..fa144c556371 100644
--- a/tools/testing/selftests/rseq/param_test.c
+++ b/tools/testing/selftests/rseq/param_test.c
@@ -114,6 +114,26 @@  unsigned int yield_mod_cnt, nr_abort;
 	"bne 222b\n\t" \
 	"333:\n\t"
 
+#elif defined(__AARCH64EL__)
+/* Extra asm inputs: loop_cnt[1]..loop_cnt[6] as memory operands, one per injection point. */
+#define RSEQ_INJECT_INPUT \
+	, [loop_cnt_1] "Qo" (loop_cnt[1]) \
+	, [loop_cnt_2] "Qo" (loop_cnt[2]) \
+	, [loop_cnt_3] "Qo" (loop_cnt[3]) \
+	, [loop_cnt_4] "Qo" (loop_cnt[4]) \
+	, [loop_cnt_5] "Qo" (loop_cnt[5]) \
+	, [loop_cnt_6] "Qo" (loop_cnt[6])
+/* The delay loop counts in the 32-bit scratch register. */
+#define INJECT_ASM_REG	RSEQ_ASM_TMP_REG32
+/* Injection point n: load loop_cnt[n]; skip if zero, otherwise decrement it down to zero. */
+#define RSEQ_INJECT_ASM(n) \
+	"	ldr	" INJECT_ASM_REG ", %[loop_cnt_" #n "]\n"	\
+	"	cbz	" INJECT_ASM_REG ", 333f\n"			\
+	"222:\n"							\
+	"	sub	" INJECT_ASM_REG ", " INJECT_ASM_REG ", #1\n"	\
+	"	cbnz	" INJECT_ASM_REG ", 222b\n"			\
+	"333:\n"
+
 #elif __PPC__
 
 #define RSEQ_INJECT_INPUT \
diff --git a/tools/testing/selftests/rseq/rseq-arm64.h b/tools/testing/selftests/rseq/rseq-arm64.h
new file mode 100644
index 000000000000..599788f74137
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq-arm64.h
@@ -0,0 +1,594 @@ 
+/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
+/*
+ * rseq-arm64.h
+ *
+ * (C) Copyright 2016-2018 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ * (C) Copyright 2018 - Will Deacon <will.deacon@arm.com>
+ */
+
+#define RSEQ_SIG	0xd428bc00	/* BRK #0x45E0 */
+
+#define rseq_smp_mb()	__asm__ __volatile__ ("dmb ish" ::: "memory")
+#define rseq_smp_rmb()	__asm__ __volatile__ ("dmb ishld" ::: "memory")
+#define rseq_smp_wmb()	__asm__ __volatile__ ("dmb ishst" ::: "memory")
+/* Load *(p) with acquire semantics (ldarb/ldarh/ldar) and evaluate to the loaded value; sizeof(*(p)) must be 1, 2, 4 or 8. */
+#define rseq_smp_load_acquire(p)						\
+__extension__ ({								\
+	union { __typeof(*(p)) __val; char __c[sizeof(*(p))]; } ____p1;	\
+	switch (sizeof(*(p))) {							\
+	case 1:									\
+		asm volatile ("ldarb %w0, %1"					\
+			: "=r" (*(__u8 *)____p1.__c)				\
+			: "Q" (*(p)) : "memory");				\
+		break;								\
+	case 2:									\
+		asm volatile ("ldarh %w0, %1"					\
+			: "=r" (*(__u16 *)____p1.__c)				\
+			: "Q" (*(p)) : "memory");				\
+		break;								\
+	case 4:									\
+		asm volatile ("ldar %w0, %1"					\
+			: "=r" (*(__u32 *)____p1.__c)				\
+			: "Q" (*(p)) : "memory");				\
+		break;								\
+	case 8:									\
+		asm volatile ("ldar %0, %1"					\
+			: "=r" (*(__u64 *)____p1.__c)				\
+			: "Q" (*(p)) : "memory");				\
+		break;								\
+	}									\
+	____p1.__val;	/* result lives in the local union; *(p) is only read */	\
+})
+
+#define rseq_smp_acquire__after_ctrl_dep()	rseq_smp_rmb()
+/* Store v to *(p) with release semantics (stlrb/stlrh/stlr); sizeof(*(p)) must be 1, 2, 4 or 8. p and v may be arbitrary expressions. */
+#define rseq_smp_store_release(p, v)						\
+do {										\
+	switch (sizeof(*(p))) {							\
+	case 1:									\
+		asm volatile ("stlrb %w1, %0"					\
+				: "=Q" (*(p))					\
+				: "r" ((__u8)(v))				\
+				: "memory");					\
+		break;								\
+	case 2:									\
+		asm volatile ("stlrh %w1, %0"					\
+				: "=Q" (*(p))					\
+				: "r" ((__u16)(v))				\
+				: "memory");					\
+		break;								\
+	case 4:									\
+		asm volatile ("stlr %w1, %0"					\
+				: "=Q" (*(p))					\
+				: "r" ((__u32)(v))				\
+				: "memory");					\
+		break;								\
+	case 8:									\
+		asm volatile ("stlr %1, %0"					\
+				: "=Q" (*(p))					\
+				: "r" ((__u64)(v))				\
+				: "memory");					\
+		break;								\
+	}									\
+} while (0)
+
+#ifdef RSEQ_SKIP_FASTPATH
+#include "rseq-skip.h"
+#else /* !RSEQ_SKIP_FASTPATH */
+/* Scratch registers named directly in the asm templates below; w15 is the 32-bit form used for 32-bit and byte accesses. */
+#define RSEQ_ASM_TMP_REG32	"w15"
+#define RSEQ_ASM_TMP_REG	"x15"
+#define RSEQ_ASM_TMP_REG_2	"x14"
+/* Emit at "label", 32-byte aligned in section __rseq_table: .long version, flags; .quad start_ip, post_commit_offset, abort_ip. */
+#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip,		\
+				post_commit_offset, abort_ip)			\
+	"	.pushsection	__rseq_table, \"aw\"\n"				\
+	"	.balign	32\n"							\
+	__rseq_str(label) ":\n"							\
+	"	.long	" __rseq_str(version) ", " __rseq_str(flags) "\n"	\
+	"	.quad	" __rseq_str(start_ip) ", "				\
+			  __rseq_str(post_commit_offset) ", "			\
+			  __rseq_str(abort_ip) "\n"				\
+	"	.popsection\n"
+/* Table entry with version 0x0 and flags 0x0; the offset field is computed as (post_commit_ip - start_ip). */
+#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip)	\
+	__RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip,			\
+				(post_commit_ip - start_ip), abort_ip)
+/* After injection point 1: form the address of cs_label (adrp + add :lo12:), store it to operand rseq_cs, then define "label". */
+#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs)			\
+	RSEQ_INJECT_ASM(1)							\
+	"	adrp	" RSEQ_ASM_TMP_REG ", " __rseq_str(cs_label) "\n"	\
+	"	add	" RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG		\
+			", :lo12:" __rseq_str(cs_label) "\n"			\
+	"	str	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(rseq_cs) "]\n"	\
+	__rseq_str(label) ":\n"
+/* In section __rseq_failure: emit the RSEQ_SIG word, then "label:" holding a branch to the C label abort_label. */
+#define RSEQ_ASM_DEFINE_ABORT(label, abort_label)				\
+	"	.pushsection	__rseq_failure, \"ax\"\n"			\
+	"	.long 	"	__rseq_str(RSEQ_SIG) "\n"			\
+	__rseq_str(label) ":\n"							\
+	"	b	%l[" __rseq_str(abort_label) "]\n"			\
+	"	.popsection\n"
+/* Plain store (str) of operand "value" to memory operand "var". */
+#define RSEQ_ASM_OP_STORE(value, var)						\
+	"	str	%[" __rseq_str(value) "], %[" __rseq_str(var) "]\n"
+/* Same operands as RSEQ_ASM_OP_STORE, but emitted as stlr (store-release). */
+#define RSEQ_ASM_OP_STORE_RELEASE(value, var)					\
+	"	stlr	%[" __rseq_str(value) "], %[" __rseq_str(var) "]\n"
+/* Store, immediately followed by the definition of post_commit_label. */
+#define RSEQ_ASM_OP_FINAL_STORE(value, var, post_commit_label)			\
+	RSEQ_ASM_OP_STORE(value, var)						\
+	__rseq_str(post_commit_label) ":\n"
+/* Store-release, immediately followed by the definition of post_commit_label. */
+#define RSEQ_ASM_OP_FINAL_STORE_RELEASE(value, var, post_commit_label)		\
+	RSEQ_ASM_OP_STORE_RELEASE(value, var)					\
+	__rseq_str(post_commit_label) ":\n"
+/* Load var (64-bit), subtract expect, and branch to "label" when the difference is non-zero, i.e. var != expect. */
+#define RSEQ_ASM_OP_CMPEQ(var, expect, label)					\
+	"	ldr	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(var) "]\n"		\
+	"	sub	" RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG		\
+			", %[" __rseq_str(expect) "]\n"				\
+	"	cbnz	" RSEQ_ASM_TMP_REG ", " __rseq_str(label) "\n"
+/* 32-bit form of RSEQ_ASM_OP_CMPEQ: uses the w scratch register and the %w form of expect. */
+#define RSEQ_ASM_OP_CMPEQ32(var, expect, label)					\
+	"	ldr	" RSEQ_ASM_TMP_REG32 ", %[" __rseq_str(var) "]\n"	\
+	"	sub	" RSEQ_ASM_TMP_REG32 ", " RSEQ_ASM_TMP_REG32		\
+			", %w[" __rseq_str(expect) "]\n"			\
+	"	cbnz	" RSEQ_ASM_TMP_REG32 ", " __rseq_str(label) "\n"
+/* Load var (64-bit), subtract expect, and branch to "label" when the difference is zero, i.e. var == expect. */
+#define RSEQ_ASM_OP_CMPNE(var, expect, label)					\
+	"	ldr	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(var) "]\n"		\
+	"	sub	" RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG		\
+			", %[" __rseq_str(expect) "]\n"				\
+	"	cbz	" RSEQ_ASM_TMP_REG ", " __rseq_str(label) "\n"
+/* After injection point 2: 32-bit compare that branches to "label" when current_cpu_id differs from cpu_id. */
+#define RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, label)			\
+	RSEQ_INJECT_ASM(2)							\
+	RSEQ_ASM_OP_CMPEQ32(current_cpu_id, cpu_id, label)
+/* Load var into the 64-bit scratch register. */
+#define RSEQ_ASM_OP_R_LOAD(var)							\
+	"	ldr	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(var) "]\n"
+/* Store the 64-bit scratch register to var. */
+#define RSEQ_ASM_OP_R_STORE(var)						\
+	"	str	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(var) "]\n"
+/* Replace the scratch register with the 64-bit value loaded from [scratch + offset]. */
+#define RSEQ_ASM_OP_R_LOAD_OFF(offset)						\
+	"	ldr	" RSEQ_ASM_TMP_REG ", [" RSEQ_ASM_TMP_REG		\
+			", %[" __rseq_str(offset) "]]\n"
+/* Add operand "count" to the scratch register. */
+#define RSEQ_ASM_OP_R_ADD(count)						\
+	"	add	" RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG		\
+			", %[" __rseq_str(count) "]\n"
+/* Store the scratch register to var, immediately followed by the definition of post_commit_label. */
+#define RSEQ_ASM_OP_R_FINAL_STORE(var, post_commit_label)			\
+	"	str	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(var) "]\n"		\
+	__rseq_str(post_commit_label) ":\n"
+/* Byte copy of len bytes from src to dst, walking from index len-1 down to 0; skipped when len is 0. Uses both scratch registers. */
+#define RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len)					\
+	"	cbz	%[" __rseq_str(len) "], 333f\n"				\
+	"	mov	" RSEQ_ASM_TMP_REG_2 ", %[" __rseq_str(len) "]\n"	\
+	"222:	sub	" RSEQ_ASM_TMP_REG_2 ", " RSEQ_ASM_TMP_REG_2 ", #1\n"	\
+	"	ldrb	" RSEQ_ASM_TMP_REG32 ", [%[" __rseq_str(src) "]"	\
+			", " RSEQ_ASM_TMP_REG_2 "]\n"				\
+	"	strb	" RSEQ_ASM_TMP_REG32 ", [%[" __rseq_str(dst) "]"	\
+			", " RSEQ_ASM_TMP_REG_2 "]\n"				\
+	"	cbnz	" RSEQ_ASM_TMP_REG_2 ", 222b\n"				\
+	"333:\n"
+/* If the CPU-id check passes and *v == expect, store newv to *v. Returns 0 after the store, 1 if *v != expect, -1 via the abort label (CPU-id mismatch; presumably also a kernel-triggered abort - confirm). */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+	/* Local labels: 1 = table entry, 2 = start of sequence, 3 = post-commit, 4 = abort stub. */
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+		RSEQ_INJECT_ASM(5)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
+		  [rseq_cs]		"m" (__rseq_abi.rseq_cs),
+		  [v]			"Qo" (*v),
+		  [expect]		"r" (expect),
+		  [newv]		"r" (newv)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	/* Fell through the asm: the final store was executed. */
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+#endif
+}
+/* If the CPU-id check passes and *v != expectnot: copy *v to *load, then store to *v the word read from (old *v + voffp). Returns 0 after the store, 1 if *v == expectnot, -1 via the abort label. */
+static inline __attribute__((always_inline))
+int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
+			       off_t voffp, intptr_t *load, int cpu)
+{
+	RSEQ_INJECT_C(9)
+	/* Local labels: 1 = table entry, 2 = start of sequence, 3 = post-commit, 4 = abort stub. */
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		RSEQ_ASM_OP_CMPNE(v, expectnot, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		RSEQ_ASM_OP_CMPNE(v, expectnot, %l[error2])
+#endif
+		RSEQ_ASM_OP_R_LOAD(v)
+		RSEQ_ASM_OP_R_STORE(load)
+		RSEQ_ASM_OP_R_LOAD_OFF(voffp)
+		RSEQ_ASM_OP_R_FINAL_STORE(v, 3)
+		RSEQ_INJECT_ASM(5)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
+		  [rseq_cs]		"m" (__rseq_abi.rseq_cs),
+		  [v]			"Qo" (*v),
+		  [expectnot]		"r" (expectnot),
+		  [load]		"Qo" (*load),
+		  [voffp]		"r" (voffp)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+#endif
+}
+/* If the CPU-id check passes, add count to *v (load, add, final store). Returns 0 after the store, -1 via the abort label. */
+static inline __attribute__((always_inline))
+int rseq_addv(intptr_t *v, intptr_t count, int cpu)
+{
+	RSEQ_INJECT_C(9)
+	/* Local labels: 1 = table entry, 2 = start of sequence, 3 = post-commit, 4 = abort stub. */
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+#endif
+		RSEQ_ASM_OP_R_LOAD(v)
+		RSEQ_ASM_OP_R_ADD(count)
+		RSEQ_ASM_OP_R_FINAL_STORE(v, 3)
+		RSEQ_INJECT_ASM(4)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
+		  [rseq_cs]		"m" (__rseq_abi.rseq_cs),
+		  [v]			"Qo" (*v),
+		  [count]		"r" (count)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG
+		: abort
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1
+#endif
+	);
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+#endif
+}
+/* If the CPU-id check passes and *v == expect: store newv2 to *v2, then newv to *v as the final store. Returns 0 after the final store, 1 if *v != expect, -1 via the abort label. */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
+				 intptr_t *v2, intptr_t newv2,
+				 intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+	/* Local labels: 1 = table entry, 2 = start of sequence, 3 = post-commit, 4 = abort stub. */
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+		RSEQ_ASM_OP_STORE(newv2, v2)
+		RSEQ_INJECT_ASM(5)
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+		RSEQ_INJECT_ASM(6)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
+		  [rseq_cs]		"m" (__rseq_abi.rseq_cs),
+		  [expect]		"r" (expect),
+		  [v]			"Qo" (*v),
+		  [newv]		"r" (newv),
+		  [v2]			"Qo" (*v2),
+		  [newv2]		"r" (newv2)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	/* Fell through the asm: both stores were executed. */
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+#endif
+}
+/* Same sequence as rseq_cmpeqv_trystorev_storev, except the final store of newv to *v is emitted as a store-release (stlr). Returns 0 / 1 (*v != expect) / -1 (abort label). */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
+					 intptr_t *v2, intptr_t newv2,
+					 intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+	/* Local labels: 1 = table entry, 2 = start of sequence, 3 = post-commit, 4 = abort stub. */
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+		RSEQ_ASM_OP_STORE(newv2, v2)
+		RSEQ_INJECT_ASM(5)
+		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 3)
+		RSEQ_INJECT_ASM(6)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
+		  [rseq_cs]		"m" (__rseq_abi.rseq_cs),
+		  [expect]		"r" (expect),
+		  [v]			"Qo" (*v),
+		  [newv]		"r" (newv),
+		  [v2]			"Qo" (*v2),
+		  [newv2]		"r" (newv2)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	/* Fell through the asm: both stores were executed. */
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+#endif
+}
+/* If the CPU-id check passes and both *v == expect and *v2 == expect2, store newv to *v. Returns 0 after the store, 1 if either comparison fails, -1 via the abort label. */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
+			      intptr_t *v2, intptr_t expect2,
+			      intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+	/* Local labels: 1 = table entry, 2 = start of sequence, 3 = post-commit, 4 = abort stub. */
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+		RSEQ_ASM_OP_CMPEQ(v2, expect2, %l[cmpfail])
+		RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+		RSEQ_ASM_OP_CMPEQ(v2, expect2, %l[error3])
+#endif
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+		RSEQ_INJECT_ASM(6)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
+		  [rseq_cs]		"m" (__rseq_abi.rseq_cs),
+		  [v]			"Qo" (*v),
+		  [expect]		"r" (expect),
+		  [v2]			"Qo" (*v2),
+		  [expect2]		"r" (expect2),
+		  [newv]		"r" (newv)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2, error3
+#endif
+	);
+	/* Fell through the asm: the final store was executed. */
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+error3:
+	rseq_bug("2nd expected value comparison failed");
+#endif
+}
+/* If the CPU-id check passes and *v == expect: copy len bytes from src to dst, then store newv to *v as the final store. Returns 0 after the final store, 1 if *v != expect, -1 via the abort label. */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
+				 void *dst, void *src, size_t len,
+				 intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+	/* Local labels: 1 = table entry, 2 = start of sequence, 3 = post-commit, 4 = abort stub. */
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+		RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len)
+		RSEQ_INJECT_ASM(5)
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+		RSEQ_INJECT_ASM(6)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
+		  [rseq_cs]		"m" (__rseq_abi.rseq_cs),
+		  [expect]		"r" (expect),
+		  [v]			"Qo" (*v),
+		  [newv]		"r" (newv),
+		  [dst]			"r" (dst),
+		  [src]			"r" (src),
+		  [len]			"r" (len)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG, RSEQ_ASM_TMP_REG_2
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	/* Fell through the asm: the copy and the final store were executed. */
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+#endif
+}
+/* Same sequence as rseq_cmpeqv_trymemcpy_storev, except the final store of newv to *v is emitted as a store-release (stlr). Returns 0 / 1 (*v != expect) / -1 (abort label). */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
+					 void *dst, void *src, size_t len,
+					 intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+	/* Local labels: 1 = table entry, 2 = start of sequence, 3 = post-commit, 4 = abort stub. */
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+		RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len)
+		RSEQ_INJECT_ASM(5)
+		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 3)
+		RSEQ_INJECT_ASM(6)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
+		  [rseq_cs]		"m" (__rseq_abi.rseq_cs),
+		  [expect]		"r" (expect),
+		  [v]			"Qo" (*v),
+		  [newv]		"r" (newv),
+		  [dst]			"r" (dst),
+		  [src]			"r" (src),
+		  [len]			"r" (len)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG, RSEQ_ASM_TMP_REG_2
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	/* Fell through the asm: the copy and the final store were executed. */
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+#endif /* !RSEQ_SKIP_FASTPATH */
diff --git a/tools/testing/selftests/rseq/rseq.h b/tools/testing/selftests/rseq/rseq.h
index a4684112676c..b5d94087fe31 100644
--- a/tools/testing/selftests/rseq/rseq.h
+++ b/tools/testing/selftests/rseq/rseq.h
@@ -71,6 +71,8 @@  extern __thread volatile struct rseq __rseq_abi;
 #include <rseq-x86.h>
 #elif defined(__ARMEL__)
 #include <rseq-arm.h>
+#elif defined (__AARCH64EL__)
+#include <rseq-arm64.h>
 #elif defined(__PPC__)
 #include <rseq-ppc.h>
 #elif defined(__mips__)