diff mbox series

[PULL,16/16] tcg/i386: Use MOVDQA for TCG_TYPE_V128 load/store

Message ID 20190522222821.23850-17-richard.henderson@linaro.org
State Accepted
Commit 11e2bfef799024be4a08fcf6797fe0b22fb16b58
Headers show
Series tcg queued patches | expand

Commit Message

Richard Henderson May 22, 2019, 10:28 p.m. UTC
This instruction raises #GP, aka SIGSEGV, if the effective address
is not aligned to 16-bytes.

We have assertions in tcg-op-gvec.c that the offset from ENV is
aligned, for vector types <= V128.  But the offset itself does not
validate that the final pointer is aligned -- one must also remember
to use the QEMU_ALIGNED() attribute on the vector member within ENV.

PowerPC Altivec has vector load/store instructions that silently
discard the low 4 bits of the address, making alignment mistakes
difficult to discover.  Aid that by making the most popular host
visibly signal the error.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

---
 tcg/i386/tcg-target.inc.c | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

-- 
2.17.1

Comments

David Hildenbrand May 28, 2019, 5:28 p.m. UTC | #1
On 23.05.19 00:28, Richard Henderson wrote:
> This instruction raises #GP, aka SIGSEGV, if the effective address

> is not aligned to 16-bytes.

> 

> We have assertions in tcg-op-gvec.c that the offset from ENV is

> aligned, for vector types <= V128.  But the offset itself does not

> validate that the final pointer is aligned -- one must also remember

> to use the QEMU_ALIGNED() attribute on the vector member within ENV.

> 

> PowerPC Altivec has vector load/store instructions that silently

> discard the low 4 bits of the address, making alignment mistakes

> difficult to discover.  Aid that by making the most popular host

> visibly signal the error.

> 

> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

> ---

>  tcg/i386/tcg-target.inc.c | 24 ++++++++++++++++++++++--

>  1 file changed, 22 insertions(+), 2 deletions(-)

> 

> diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c

> index 6ec5e60448..c0443da4af 100644

> --- a/tcg/i386/tcg-target.inc.c

> +++ b/tcg/i386/tcg-target.inc.c

> @@ -1082,14 +1082,24 @@ static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,

>          }

>          /* FALLTHRU */

>      case TCG_TYPE_V64:

> +        /* There is no instruction that can validate 8-byte alignment.  */

>          tcg_debug_assert(ret >= 16);

>          tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);

>          break;

>      case TCG_TYPE_V128:

> +        /*

> +         * The gvec infrastructure is asserts that v128 vector loads

> +         * and stores use a 16-byte aligned offset.  Validate that the

> +         * final pointer is aligned by using an insn that will SIGSEGV.

> +         */

>          tcg_debug_assert(ret >= 16);

> -        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx, ret, 0, arg1, arg2);

> +        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);

>          break;

>      case TCG_TYPE_V256:

> +        /*

> +         * The gvec infrastructure only requires 16-byte alignment,

> +         * so here we must use an unaligned load.

> +         */

>          tcg_debug_assert(ret >= 16);

>          tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,

>                                   ret, 0, arg1, arg2);

> @@ -1117,14 +1127,24 @@ static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,

>          }

>          /* FALLTHRU */

>      case TCG_TYPE_V64:

> +        /* There is no instruction that can validate 8-byte alignment.  */

>          tcg_debug_assert(arg >= 16);

>          tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);

>          break;

>      case TCG_TYPE_V128:

> +        /*

> +         * The gvec infrastructure is asserts that v128 vector loads

> +         * and stores use a 16-byte aligned offset.  Validate that the

> +         * final pointer is aligned by using an insn that will SIGSEGV.

> +         */

>          tcg_debug_assert(arg >= 16);

> -        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx, arg, 0, arg1, arg2);

> +        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);

>          break;

>      case TCG_TYPE_V256:

> +        /*

> +         * The gvec infrastructure only requires 16-byte alignment,

> +         * so here we must use an unaligned store.

> +         */

>          tcg_debug_assert(arg >= 16);

>          tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,

>                                   arg, 0, arg1, arg2);

> 


This is the problematic patch. Haven't looked into the details yet, so I
can't tell what's wrong. Maybe really an alignemnt issue?

-- 

Thanks,

David / dhildenb
David Hildenbrand May 28, 2019, 6:33 p.m. UTC | #2
On 28.05.19 19:28, David Hildenbrand wrote:
> On 23.05.19 00:28, Richard Henderson wrote:

>> This instruction raises #GP, aka SIGSEGV, if the effective address

>> is not aligned to 16-bytes.

>>

>> We have assertions in tcg-op-gvec.c that the offset from ENV is

>> aligned, for vector types <= V128.  But the offset itself does not

>> validate that the final pointer is aligned -- one must also remember

>> to use the QEMU_ALIGNED() attribute on the vector member within ENV.

>>

>> PowerPC Altivec has vector load/store instructions that silently

>> discard the low 4 bits of the address, making alignment mistakes

>> difficult to discover.  Aid that by making the most popular host

>> visibly signal the error.

>>

>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

>> ---

>>  tcg/i386/tcg-target.inc.c | 24 ++++++++++++++++++++++--

>>  1 file changed, 22 insertions(+), 2 deletions(-)

>>

>> diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c

>> index 6ec5e60448..c0443da4af 100644

>> --- a/tcg/i386/tcg-target.inc.c

>> +++ b/tcg/i386/tcg-target.inc.c

>> @@ -1082,14 +1082,24 @@ static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,

>>          }

>>          /* FALLTHRU */

>>      case TCG_TYPE_V64:

>> +        /* There is no instruction that can validate 8-byte alignment.  */

>>          tcg_debug_assert(ret >= 16);

>>          tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);

>>          break;

>>      case TCG_TYPE_V128:

>> +        /*

>> +         * The gvec infrastructure is asserts that v128 vector loads

>> +         * and stores use a 16-byte aligned offset.  Validate that the

>> +         * final pointer is aligned by using an insn that will SIGSEGV.

>> +         */

>>          tcg_debug_assert(ret >= 16);

>> -        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx, ret, 0, arg1, arg2);

>> +        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);

>>          break;

>>      case TCG_TYPE_V256:

>> +        /*

>> +         * The gvec infrastructure only requires 16-byte alignment,

>> +         * so here we must use an unaligned load.

>> +         */

>>          tcg_debug_assert(ret >= 16);

>>          tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,

>>                                   ret, 0, arg1, arg2);

>> @@ -1117,14 +1127,24 @@ static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,

>>          }

>>          /* FALLTHRU */

>>      case TCG_TYPE_V64:

>> +        /* There is no instruction that can validate 8-byte alignment.  */

>>          tcg_debug_assert(arg >= 16);

>>          tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);

>>          break;

>>      case TCG_TYPE_V128:

>> +        /*

>> +         * The gvec infrastructure is asserts that v128 vector loads

>> +         * and stores use a 16-byte aligned offset.  Validate that the

>> +         * final pointer is aligned by using an insn that will SIGSEGV.

>> +         */

>>          tcg_debug_assert(arg >= 16);

>> -        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx, arg, 0, arg1, arg2);

>> +        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);

>>          break;

>>      case TCG_TYPE_V256:

>> +        /*

>> +         * The gvec infrastructure only requires 16-byte alignment,

>> +         * so here we must use an unaligned store.

>> +         */

>>          tcg_debug_assert(arg >= 16);

>>          tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,

>>                                   arg, 0, arg1, arg2);

>>

> 

> This is the problematic patch. Haven't looked into the details yet, so I

> can't tell what's wrong. Maybe really an alignemnt issue?

> 


Okay, looks like "vregs" in "struct CPUS390XState" is always aligned to
8, but not to 16 bytes.

And that in return is the case, because "CPUS390XState env" is not
aligned to 16 bytes in "struct S390CPU"


!!!!!!!! CPU: 0x55a5e3046ef0
!!!!!!!! ENV: 0x55a5e304f1a8
!!!!!!!! VREGS: 0x55a5e304f228
!!!!!!!! CPU: 0x55a5e3070bb0
!!!!!!!! ENV: 0x55a5e3078e68
!!!!!!!! VREGS: 0x55a5e3078ee8
!!!!!!!! CPU: 0x55a5e3098310
!!!!!!!! ENV: 0x55a5e30a05c8
!!!!!!!! VREGS: 0x55a5e30a0648
!!!!!!!! CPU: 0x55a5e30c0730
!!!!!!!! ENV: 0x55a5e30c89e8
!!!!!!!! VREGS: 0x55a5e30c8a68
!!!!!!!! CPU: 0x55a5e30e7c90
!!!!!!!! ENV: 0x55a5e30eff48
!!!!!!!! VREGS: 0x55a5e30effc8
!!!!!!!! CPU: 0x55a5e310eea0
!!!!!!!! ENV: 0x55a5e3117158
!!!!!!!! VREGS: 0x55a5e31171d8
!!!!!!!! CPU: 0x55a5e31361e0
!!!!!!!! ENV: 0x55a5e313e498
!!!!!!!! VREGS: 0x55a5e313e518
!!!!!!!! CPU: 0x55a5e315d520
!!!!!!!! ENV: 0x55a5e31657d8
!!!!!!!! VREGS: 0x55a5e3165858

vregs is defined as:

CPU_DoubleU vregs[32][2];

We either have to switch to a type that has a natural alignment of 16
bytes, or enforce alignment of "CPUS390XState env" to 16 bytes.

What do you suggest?

-- 

Thanks,

David / dhildenb
David Hildenbrand May 28, 2019, 6:46 p.m. UTC | #3
On 28.05.19 20:33, David Hildenbrand wrote:
> On 28.05.19 19:28, David Hildenbrand wrote:

>> On 23.05.19 00:28, Richard Henderson wrote:

>>> This instruction raises #GP, aka SIGSEGV, if the effective address

>>> is not aligned to 16-bytes.

>>>

>>> We have assertions in tcg-op-gvec.c that the offset from ENV is

>>> aligned, for vector types <= V128.  But the offset itself does not

>>> validate that the final pointer is aligned -- one must also remember

>>> to use the QEMU_ALIGNED() attribute on the vector member within ENV.

>>>

>>> PowerPC Altivec has vector load/store instructions that silently

>>> discard the low 4 bits of the address, making alignment mistakes

>>> difficult to discover.  Aid that by making the most popular host

>>> visibly signal the error.

>>>

>>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

>>> ---

>>>  tcg/i386/tcg-target.inc.c | 24 ++++++++++++++++++++++--

>>>  1 file changed, 22 insertions(+), 2 deletions(-)

>>>

>>> diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c

>>> index 6ec5e60448..c0443da4af 100644

>>> --- a/tcg/i386/tcg-target.inc.c

>>> +++ b/tcg/i386/tcg-target.inc.c

>>> @@ -1082,14 +1082,24 @@ static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,

>>>          }

>>>          /* FALLTHRU */

>>>      case TCG_TYPE_V64:

>>> +        /* There is no instruction that can validate 8-byte alignment.  */

>>>          tcg_debug_assert(ret >= 16);

>>>          tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);

>>>          break;

>>>      case TCG_TYPE_V128:

>>> +        /*

>>> +         * The gvec infrastructure is asserts that v128 vector loads

>>> +         * and stores use a 16-byte aligned offset.  Validate that the

>>> +         * final pointer is aligned by using an insn that will SIGSEGV.

>>> +         */

>>>          tcg_debug_assert(ret >= 16);

>>> -        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx, ret, 0, arg1, arg2);

>>> +        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);

>>>          break;

>>>      case TCG_TYPE_V256:

>>> +        /*

>>> +         * The gvec infrastructure only requires 16-byte alignment,

>>> +         * so here we must use an unaligned load.

>>> +         */

>>>          tcg_debug_assert(ret >= 16);

>>>          tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,

>>>                                   ret, 0, arg1, arg2);

>>> @@ -1117,14 +1127,24 @@ static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,

>>>          }

>>>          /* FALLTHRU */

>>>      case TCG_TYPE_V64:

>>> +        /* There is no instruction that can validate 8-byte alignment.  */

>>>          tcg_debug_assert(arg >= 16);

>>>          tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);

>>>          break;

>>>      case TCG_TYPE_V128:

>>> +        /*

>>> +         * The gvec infrastructure is asserts that v128 vector loads

>>> +         * and stores use a 16-byte aligned offset.  Validate that the

>>> +         * final pointer is aligned by using an insn that will SIGSEGV.

>>> +         */

>>>          tcg_debug_assert(arg >= 16);

>>> -        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx, arg, 0, arg1, arg2);

>>> +        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);

>>>          break;

>>>      case TCG_TYPE_V256:

>>> +        /*

>>> +         * The gvec infrastructure only requires 16-byte alignment,

>>> +         * so here we must use an unaligned store.

>>> +         */

>>>          tcg_debug_assert(arg >= 16);

>>>          tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,

>>>                                   arg, 0, arg1, arg2);

>>>

>>

>> This is the problematic patch. Haven't looked into the details yet, so I

>> can't tell what's wrong. Maybe really an alignemnt issue?

>>

> 

> Okay, looks like "vregs" in "struct CPUS390XState" is always aligned to

> 8, but not to 16 bytes.

> 

> And that in return is the case, because "CPUS390XState env" is not

> aligned to 16 bytes in "struct S390CPU"

> 

> 

> !!!!!!!! CPU: 0x55a5e3046ef0

> !!!!!!!! ENV: 0x55a5e304f1a8

> !!!!!!!! VREGS: 0x55a5e304f228

> !!!!!!!! CPU: 0x55a5e3070bb0

> !!!!!!!! ENV: 0x55a5e3078e68

> !!!!!!!! VREGS: 0x55a5e3078ee8

> !!!!!!!! CPU: 0x55a5e3098310

> !!!!!!!! ENV: 0x55a5e30a05c8

> !!!!!!!! VREGS: 0x55a5e30a0648

> !!!!!!!! CPU: 0x55a5e30c0730

> !!!!!!!! ENV: 0x55a5e30c89e8

> !!!!!!!! VREGS: 0x55a5e30c8a68

> !!!!!!!! CPU: 0x55a5e30e7c90

> !!!!!!!! ENV: 0x55a5e30eff48

> !!!!!!!! VREGS: 0x55a5e30effc8

> !!!!!!!! CPU: 0x55a5e310eea0

> !!!!!!!! ENV: 0x55a5e3117158

> !!!!!!!! VREGS: 0x55a5e31171d8

> !!!!!!!! CPU: 0x55a5e31361e0

> !!!!!!!! ENV: 0x55a5e313e498

> !!!!!!!! VREGS: 0x55a5e313e518

> !!!!!!!! CPU: 0x55a5e315d520

> !!!!!!!! ENV: 0x55a5e31657d8

> !!!!!!!! VREGS: 0x55a5e3165858

> 

> vregs is defined as:

> 

> CPU_DoubleU vregs[32][2];

> 

> We either have to switch to a type that has a natural alignment of 16

> bytes, or enforce alignment of "CPUS390XState env" to 16 bytes.

> 

> What do you suggest?


FWIW, this seems to be the easiest way:

diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h
index f0d9a6a36d..d363ae0fb3 100644
--- a/target/s390x/cpu.h
+++ b/target/s390x/cpu.h
@@ -66,7 +66,7 @@ struct CPUS390XState {
      * The floating point registers are part of the vector registers.
      * vregs[0][0] -> vregs[15][0] are 16 floating point registers
      */
-    CPU_DoubleU vregs[32][2];  /* vector registers */
+    CPU_DoubleU vregs[32][2] QEMU_ALIGNED(16);  /* vector registers */
     uint32_t aregs[16];    /* access registers */
     uint8_t riccb[64];     /* runtime instrumentation control */
     uint64_t gscb[4];      /* guarded storage control */


Makes it work for me again.

-- 

Thanks,

David / dhildenb
Richard Henderson May 28, 2019, 9:34 p.m. UTC | #4
On 5/28/19 1:46 PM, David Hildenbrand wrote:
> FWIW, this seems to be the easiest way:

> 

> diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h

> index f0d9a6a36d..d363ae0fb3 100644

> --- a/target/s390x/cpu.h

> +++ b/target/s390x/cpu.h

> @@ -66,7 +66,7 @@ struct CPUS390XState {

>       * The floating point registers are part of the vector registers.

>       * vregs[0][0] -> vregs[15][0] are 16 floating point registers

>       */

> -    CPU_DoubleU vregs[32][2];  /* vector registers */

> +    CPU_DoubleU vregs[32][2] QEMU_ALIGNED(16);  /* vector registers */

>      uint32_t aregs[16];    /* access registers */

>      uint8_t riccb[64];     /* runtime instrumentation control */

>      uint64_t gscb[4];      /* guarded storage control */

> 

> 

> Makes it work for me again.


That's the right fix, and exactly the bug that I was hoping to find with
11e2bfef7990 ("tcg/i386: Use MOVDQA for TCG_TYPE_V128 load/store").


r~
diff mbox series

Patch

diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 6ec5e60448..c0443da4af 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -1082,14 +1082,24 @@  static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
         }
         /* FALLTHRU */
     case TCG_TYPE_V64:
+        /* There is no instruction that can validate 8-byte alignment.  */
         tcg_debug_assert(ret >= 16);
         tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
         break;
     case TCG_TYPE_V128:
+        /*
+         * The gvec infrastructure is asserts that v128 vector loads
+         * and stores use a 16-byte aligned offset.  Validate that the
+         * final pointer is aligned by using an insn that will SIGSEGV.
+         */
         tcg_debug_assert(ret >= 16);
-        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx, ret, 0, arg1, arg2);
+        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
         break;
     case TCG_TYPE_V256:
+        /*
+         * The gvec infrastructure only requires 16-byte alignment,
+         * so here we must use an unaligned load.
+         */
         tcg_debug_assert(ret >= 16);
         tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
                                  ret, 0, arg1, arg2);
@@ -1117,14 +1127,24 @@  static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
         }
         /* FALLTHRU */
     case TCG_TYPE_V64:
+        /* There is no instruction that can validate 8-byte alignment.  */
         tcg_debug_assert(arg >= 16);
         tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
         break;
     case TCG_TYPE_V128:
+        /*
+         * The gvec infrastructure is asserts that v128 vector loads
+         * and stores use a 16-byte aligned offset.  Validate that the
+         * final pointer is aligned by using an insn that will SIGSEGV.
+         */
         tcg_debug_assert(arg >= 16);
-        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx, arg, 0, arg1, arg2);
+        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
         break;
     case TCG_TYPE_V256:
+        /*
+         * The gvec infrastructure only requires 16-byte alignment,
+         * so here we must use an unaligned store.
+         */
         tcg_debug_assert(arg >= 16);
         tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
                                  arg, 0, arg1, arg2);