diff mbox

[for-2.8] target-arm/translate-a64: fix gen_load_exclusive

Message ID 20161202173454.19179-1-alex.bennee@linaro.org
State Superseded
Headers show

Commit Message

Alex Bennée Dec. 2, 2016, 5:34 p.m. UTC
While testing rth's latest TCG patches with risu I found ldaxp was
broken. Investigating further, I found it was broken by 1dd089d0 when
the cmpxchg atomic work was merged. As part of that change the code
attempted to be clever by doing a single 64-bit load and then shuffling
the data around to set the two 32-bit registers.

As I couldn't quite follow the endian magic I've simply partially
reverted gen_load_exclusive to the original code. This doesn't affect
the cmpxchg functionality, as that is all done in gen_store_exclusive,
which is untouched.

I've also restored the comment that was removed (with a slight tweak
to mention cmpxchg).

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

---
 target-arm/translate-a64.c | 42 +++++++++++++++++++-----------------------
 1 file changed, 19 insertions(+), 23 deletions(-)

-- 
2.10.2

Comments

Alex Bennée Dec. 5, 2016, 11:09 a.m. UTC | #1
Alex Bennée <alex.bennee@linaro.org> writes:

> While testing rth's latest TCG patches with risu I found ldaxp was
> broken. Investigating further, I found it was broken by 1dd089d0 when
> the cmpxchg atomic work was merged.

CC'ing Paolo/Richard

Do you guys have any final TCG fixes planned for 2.8 that can take this
fix for the ldaxp regression?

> As part of that change the code
> attempted to be clever by doing a single 64-bit load and then shuffling
> the data around to set the two 32-bit registers.
>
> As I couldn't quite follow the endian magic I've simply partially
> reverted gen_load_exclusive to the original code. This
> doesn't affect the cmpxchg functionality, as that is all done in
> gen_store_exclusive, which is untouched.
>
> I've also restored the comment that was removed (with a slight tweak
> to mention cmpxchg).
>
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> ---
>  target-arm/translate-a64.c | 42 +++++++++++++++++++-----------------------
>  1 file changed, 19 insertions(+), 23 deletions(-)
>
> diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
> index de48747..6dc27a6 100644
> --- a/target-arm/translate-a64.c
> +++ b/target-arm/translate-a64.c
> @@ -1839,41 +1839,37 @@ static void disas_b_exc_sys(DisasContext *s, uint32_t insn)
>      }
>  }
>
> +/*
> + * Load/Store exclusive instructions are implemented by remembering
> + * the value/address loaded, and seeing if these are the same
> + * when the store is performed. This is not actually the architecturally
> + * mandated semantics, but it works for typical guest code sequences
> + * and avoids having to monitor regular stores.
> + *
> + * The store exclusive uses the atomic cmpxchg primitives to avoid
> + * races in multi-threaded linux-user and when MTTCG softmmu is
> + * enabled.
> + */
>  static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
>                                 TCGv_i64 addr, int size, bool is_pair)
>  {
>      TCGv_i64 tmp = tcg_temp_new_i64();
> -    TCGMemOp be = s->be_data;
> +    TCGMemOp memop = s->be_data + size;
>
>      g_assert(size <= 3);
> +    tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s), memop);
> +
>      if (is_pair) {
> +        TCGv_i64 addr2 = tcg_temp_new_i64();
>          TCGv_i64 hitmp = tcg_temp_new_i64();
>
> -        if (size == 3) {
> -            TCGv_i64 addr2 = tcg_temp_new_i64();
> -
> -            tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s),
> -                                MO_64 | MO_ALIGN_16 | be);
> -            tcg_gen_addi_i64(addr2, addr, 8);
> -            tcg_gen_qemu_ld_i64(hitmp, addr2, get_mem_index(s),
> -                                MO_64 | MO_ALIGN | be);
> -            tcg_temp_free_i64(addr2);
> -        } else {
> -            g_assert(size == 2);
> -            tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s),
> -                                MO_64 | MO_ALIGN | be);
> -            if (be == MO_LE) {
> -                tcg_gen_extr32_i64(tmp, hitmp, tmp);
> -            } else {
> -                tcg_gen_extr32_i64(hitmp, tmp, tmp);
> -            }
> -        }
> -
> +        g_assert(size >= 2);
> +        tcg_gen_addi_i64(addr2, addr, 1 << size);
> +        tcg_gen_qemu_ld_i64(hitmp, addr2, get_mem_index(s), memop);
> +        tcg_temp_free_i64(addr2);
>          tcg_gen_mov_i64(cpu_exclusive_high, hitmp);
>          tcg_gen_mov_i64(cpu_reg(s, rt2), hitmp);
>          tcg_temp_free_i64(hitmp);
> -    } else {
> -        tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s), size | MO_ALIGN | be);
>      }
>
>      tcg_gen_mov_i64(cpu_exclusive_val, tmp);



--
Alex Bennée
Richard Henderson Dec. 5, 2016, 3:42 p.m. UTC | #2
On 12/05/2016 03:09 AM, Alex Bennée wrote:
>
> Alex Bennée <alex.bennee@linaro.org> writes:
>
>> While testing rth's latest TCG patches with risu I found ldaxp was
>> broken. Investigating further, I found it was broken by 1dd089d0 when
>> the cmpxchg atomic work was merged.
>
> CC'ing Paolo/Richard
>
> Do you guys have any final TCG fixes planned for 2.8 that can take this
> fix for the ldaxp regression?

I don't have any pending patches for 2.8, no.

>> As part of that change the code
>> attempted to be clever by doing a single 64-bit load and then shuffling
>> the data around to set the two 32-bit registers.

It's not trying to be "clever"; it's trying to be correct, giving an atomic
64-bit load.

The fact that we didn't attempt an atomic 128-bit load here for the 64-bit pair
is a bug.  We have support for that in helper_atomic_ldo_le_mmu; I'm not sure
why I didn't use that here.
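
For reference, here is a minimal C model of the endian handling being discussed:
the single 64-bit load reads both halves of the pair at once (which is where the
atomicity comes from), and the two halves are then routed to rt/rt2 according to
the guest's data endianness, which is what the tcg_gen_extr32_i64() calls in the
reverted hunk were doing. The function and argument names below are made up for
illustration; this is a sketch, not QEMU code.

#include <stdint.h>

/* Illustrative only (hypothetical helper, not QEMU's API): split one
 * 64-bit value, loaded in guest byte order, into the two 32-bit
 * registers of an LDXP/LDAXP pair. */
static void pair_from_u64(uint64_t loaded, int guest_is_little_endian,
                          uint32_t *rt, uint32_t *rt2)
{
    if (guest_is_little_endian) {
        /* MO_LE: the word at the lower address ends up in the low half */
        *rt  = (uint32_t)loaded;
        *rt2 = (uint32_t)(loaded >> 32);
    } else {
        /* MO_BE: the word at the lower address ends up in the high half */
        *rt  = (uint32_t)(loaded >> 32);
        *rt2 = (uint32_t)loaded;
    }
}

Whether or not that split was correct, the patch sidesteps the question for 2.8
by going back to two separate loads of 1 << size bytes each, as in the
pre-1dd089d0 code.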

>> As I couldn't quite follow the endian magic I've simply partially
>> reverted gen_load_exclusive to the original code. This
>> doesn't affect the cmpxchg functionality, as that is all done in
>> gen_store_exclusive, which is untouched.

We'll have to fix it properly eventually.  But perhaps 2.9 is soon enough,
since that's when mttcg will go in.


r~
Peter Maydell Dec. 5, 2016, 3:55 p.m. UTC | #3
On 5 December 2016 at 15:42, Richard Henderson <rth@twiddle.net> wrote:
> We'll have to fix it properly eventually.  But perhaps 2.9 is soon enough,
> since that's when mttcg will go in.

Could you provide a reviewed-by (or an explicit nak) for Alex's
patch, please? I didn't look at this area of the code when it
went in, so I'm not really in a position to review it even if it
goes via my tree.

thanks
-- PMM
Alex Bennée Dec. 5, 2016, 5:04 p.m. UTC | #4
Richard Henderson <rth@twiddle.net> writes:

> On 12/05/2016 03:09 AM, Alex Bennée wrote:
>>
>> Alex Bennée <alex.bennee@linaro.org> writes:
>>
>>> While testing rth's latest TCG patches with risu I found ldaxp was
>>> broken. Investigating further, I found it was broken by 1dd089d0 when
>>> the cmpxchg atomic work was merged.
>>
>> CC'ing Paolo/Richard
>>
>> Do you guys have any final TCG fixes planned for 2.8 that can take this
>> fix for the ldaxp regression?
>
> I don't have any pending patches for 2.8, no.
>
>>> As part of that change the code
>>> attempted to be clever by doing a single 64-bit load and then shuffling
>>> the data around to set the two 32-bit registers.
>
> It's not trying to be "clever"; it's trying to be correct, giving an atomic
> 64-bit load.

Ahh, right, I see. What happens if the backend is 32-bit? Will it issue two
loads anyway?

>
> The fact that we didn't attempt an atomic 128-bit load here for the 64-bit pair
> is a bug.  We have support for that in helper_atomic_ldo_le_mmu; I'm not sure
> why I didn't use that here.
>
>>> As I couldn't quite follow the endian magic I've simply partially
>>> reverted gen_load_exclusive to the original code. This
>>> doesn't affect the cmpxchg functionality, as that is all done in
>>> gen_store_exclusive, which is untouched.
>
> We'll have to fix it properly eventually.  But perhaps 2.9 is soon enough,
> since that's when mttcg will go in.

I guess the easiest way would be to punt these paired loads to a helper?

However, regardless of the atomicity, the actual values loaded into
the registers are wrong, so that behaviour is a regression vs 2.7.

I don't know how often these load-exclusive paired operations are used
in real code, but I think we should at least fix it so the values are
loaded properly for 2.8 and do the proper atomic fix for 2.9.

>
>
> r~



--
Alex Bennée
Richard Henderson Dec. 5, 2016, 5:15 p.m. UTC | #5
On 12/05/2016 09:04 AM, Alex Bennée wrote:
>> It's not trying to be "clever"; it's trying to be correct, giving an atomic
>> 64-bit load.
>
> Ahh, right, I see. What happens if the backend is 32-bit? Will it issue two
> loads anyway?

Yes.  I did bring this up when the atomics patch set went in.

In principle there's no reason we can't handle this like the 128-bit path, with
a special helper for a 32-bit host using __atomic_load_64 if available, which
it would be for i686 (via fpu) and armv7 (via ldrexd), or
gen_helper_exit_atomic if not.

But it's nasty and slow, and work that's unnecessary if we simply decide that
32-bit hosts cannot run mttcg.
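
As a rough, C-level illustration of the kind of primitive such a helper could
sit on (a sketch under the assumption that the GCC/Clang __atomic builtins are
available; the function name is made up and none of the TCG/softmmu plumbing is
shown):

#include <stdint.h>

/* Sketch only, not QEMU code: an atomic 64-bit load expressed with the
 * __atomic builtins. On a 32-bit host the compiler or libatomic has to
 * make this genuinely atomic, e.g. via an FPU move on i686 or an ldrexd
 * pair on ARMv7, as noted above; where no such sequence exists, the
 * gen_helper_exit_atomic fallback mentioned above would apply instead. */
static inline uint64_t atomic_load_u64(const uint64_t *p)
{
    return __atomic_load_n(p, __ATOMIC_SEQ_CST);
}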

> I don't know how often these load-exclusive paired operations are used
> in real code, but I think we should at least fix it so the values are
> loaded properly for 2.8 and do the proper atomic fix for 2.9.

Yep.


r~
Richard Henderson Dec. 5, 2016, 5:22 p.m. UTC | #6
On 12/05/2016 03:09 AM, Alex Bennée wrote:
>
> Alex Bennée <alex.bennee@linaro.org> writes:
>
>> While testing rth's latest TCG patches with risu I found ldaxp was
>> broken. Investigating further, I found it was broken by 1dd089d0 when
>> the cmpxchg atomic work was merged.
>
> CC'ing Paolo/Richard
>
> Do you guys have any final TCG fixes planned for 2.8 that can take this
> fix for the ldaxp regression?
>
>> As part of that change the code
>> attempted to be clever by doing a single 64-bit load and then shuffling
>> the data around to set the two 32-bit registers.
>>
>> As I couldn't quite follow the endian magic I've simply partially
>> reverted gen_load_exclusive to the original code. This
>> doesn't affect the cmpxchg functionality, as that is all done in
>> gen_store_exclusive, which is untouched.
>>
>> I've also restored the comment that was removed (with a slight tweak
>> to mention cmpxchg).
>>
>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>> ---
>>  target-arm/translate-a64.c | 42 +++++++++++++++++++-----------------------
>>  1 file changed, 19 insertions(+), 23 deletions(-)
>>
>> diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
>> index de48747..6dc27a6 100644
>> --- a/target-arm/translate-a64.c
>> +++ b/target-arm/translate-a64.c
>> @@ -1839,41 +1839,37 @@ static void disas_b_exc_sys(DisasContext *s, uint32_t insn)
>>      }
>>  }
>>
>> +/*
>> + * Load/Store exclusive instructions are implemented by remembering
>> + * the value/address loaded, and seeing if these are the same
>> + * when the store is performed. This is not actually the architecturally
>> + * mandated semantics, but it works for typical guest code sequences
>> + * and avoids having to monitor regular stores.
>> + *
>> + * The store exclusive uses the atomic cmpxchg primitives to avoid
>> + * races in multi-threaded linux-user and when MTTCG softmmu is
>> + * enabled.
>> + */
>>  static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
>>                                 TCGv_i64 addr, int size, bool is_pair)
>>  {
>>      TCGv_i64 tmp = tcg_temp_new_i64();
>> -    TCGMemOp be = s->be_data;
>> +    TCGMemOp memop = s->be_data + size;
>>
>>      g_assert(size <= 3);
>> +    tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s), memop);
>> +
>>      if (is_pair) {
>> +        TCGv_i64 addr2 = tcg_temp_new_i64();
>>          TCGv_i64 hitmp = tcg_temp_new_i64();
>>
>> -        if (size == 3) {
>> -            TCGv_i64 addr2 = tcg_temp_new_i64();
>> -
>> -            tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s),
>> -                                MO_64 | MO_ALIGN_16 | be);
>> -            tcg_gen_addi_i64(addr2, addr, 8);
>> -            tcg_gen_qemu_ld_i64(hitmp, addr2, get_mem_index(s),
>> -                                MO_64 | MO_ALIGN | be);
>> -            tcg_temp_free_i64(addr2);
>> -        } else {
>> -            g_assert(size == 2);
>> -            tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s),
>> -                                MO_64 | MO_ALIGN | be);
>> -            if (be == MO_LE) {
>> -                tcg_gen_extr32_i64(tmp, hitmp, tmp);
>> -            } else {
>> -                tcg_gen_extr32_i64(hitmp, tmp, tmp);
>> -            }
>> -        }
>> -
>> +        g_assert(size >= 2);
>> +        tcg_gen_addi_i64(addr2, addr, 1 << size);
>> +        tcg_gen_qemu_ld_i64(hitmp, addr2, get_mem_index(s), memop);
>> +        tcg_temp_free_i64(addr2);
>>          tcg_gen_mov_i64(cpu_exclusive_high, hitmp);
>>          tcg_gen_mov_i64(cpu_reg(s, rt2), hitmp);
>>          tcg_temp_free_i64(hitmp);
>> -    } else {
>> -        tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s), size | MO_ALIGN | be);
>>      }
>>
>>      tcg_gen_mov_i64(cpu_exclusive_val, tmp);

Acked-by: Richard Henderson <rth@twiddle.net>



r~
Peter Maydell Dec. 5, 2016, 6 p.m. UTC | #7
On 5 December 2016 at 17:22, Richard Henderson <rth@twiddle.net> wrote:
> Acked-by: Richard Henderson <rth@twiddle.net>


Thanks. I've put this into a one-patch pull request which I've just
sent out and which will hopefully make -rc3.

thanks
-- PMM
diff mbox

Patch

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index de48747..6dc27a6 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -1839,41 +1839,37 @@  static void disas_b_exc_sys(DisasContext *s, uint32_t insn)
     }
 }
 
+/*
+ * Load/Store exclusive instructions are implemented by remembering
+ * the value/address loaded, and seeing if these are the same
+ * when the store is performed. This is not actually the architecturally
+ * mandated semantics, but it works for typical guest code sequences
+ * and avoids having to monitor regular stores.
+ *
+ * The store exclusive uses the atomic cmpxchg primitives to avoid
+ * races in multi-threaded linux-user and when MTTCG softmmu is
+ * enabled.
+ */
 static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
                                TCGv_i64 addr, int size, bool is_pair)
 {
     TCGv_i64 tmp = tcg_temp_new_i64();
-    TCGMemOp be = s->be_data;
+    TCGMemOp memop = s->be_data + size;
 
     g_assert(size <= 3);
+    tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s), memop);
+
     if (is_pair) {
+        TCGv_i64 addr2 = tcg_temp_new_i64();
         TCGv_i64 hitmp = tcg_temp_new_i64();
 
-        if (size == 3) {
-            TCGv_i64 addr2 = tcg_temp_new_i64();
-
-            tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s),
-                                MO_64 | MO_ALIGN_16 | be);
-            tcg_gen_addi_i64(addr2, addr, 8);
-            tcg_gen_qemu_ld_i64(hitmp, addr2, get_mem_index(s),
-                                MO_64 | MO_ALIGN | be);
-            tcg_temp_free_i64(addr2);
-        } else {
-            g_assert(size == 2);
-            tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s),
-                                MO_64 | MO_ALIGN | be);
-            if (be == MO_LE) {
-                tcg_gen_extr32_i64(tmp, hitmp, tmp);
-            } else {
-                tcg_gen_extr32_i64(hitmp, tmp, tmp);
-            }
-        }
-
+        g_assert(size >= 2);
+        tcg_gen_addi_i64(addr2, addr, 1 << size);
+        tcg_gen_qemu_ld_i64(hitmp, addr2, get_mem_index(s), memop);
+        tcg_temp_free_i64(addr2);
         tcg_gen_mov_i64(cpu_exclusive_high, hitmp);
         tcg_gen_mov_i64(cpu_reg(s, rt2), hitmp);
         tcg_temp_free_i64(hitmp);
-    } else {
-        tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s), size | MO_ALIGN | be);
     }
 
     tcg_gen_mov_i64(cpu_exclusive_val, tmp);
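
For readers skimming the diff, here is a small illustrative C model of the
behaviour the pair path restores: two loads of 1 << size bytes each, from addr
and addr + (1 << size), with the loaded values also latched into the
exclusive-monitor state that gen_store_exclusive later checks via cmpxchg. The
names below are hypothetical, guest endianness and alignment checks are
ignored, and this is a sketch rather than QEMU code.

#include <stdint.h>
#include <string.h>

/* Stand-ins for the cpu_exclusive_* TCG globals; illustrative only. */
typedef struct {
    uint64_t exclusive_val;
    uint64_t exclusive_high;
} ExclusiveState;

/* Model of the restored pair case (size == 2 for LDXP Wt, size == 3 for
 * LDXP Xt). Host byte order is used here; the real code applies the
 * guest endianness from s->be_data. */
static void model_load_exclusive_pair(ExclusiveState *st,
                                      const uint8_t *guest_mem,
                                      uint64_t addr, int size,
                                      uint64_t *rt, uint64_t *rt2)
{
    uint64_t lo = 0, hi = 0;
    size_t bytes = (size_t)1 << size;

    memcpy(&lo, guest_mem + addr, bytes);          /* first element  */
    memcpy(&hi, guest_mem + addr + bytes, bytes);  /* second element */

    *rt = lo;
    *rt2 = hi;
    st->exclusive_val = lo;    /* compared by the later store-exclusive */
    st->exclusive_high = hi;
}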