diff mbox series

tcg: Use memset for large vector byte replication

Message ID 20201215174824.76017-1-richard.henderson@linaro.org
State Superseded
Headers show
Series tcg: Use memset for large vector byte replication | expand

Commit Message

Richard Henderson Dec. 15, 2020, 5:48 p.m. UTC
In f47db80cc07, we handled odd-sized tail clearing for
the case of hosts that have vector operations, but did
not handle the case of hosts that do not have vector ops.

This was ok until e2e7168a214b, which changed the encoding
of simd_desc such that the odd sizes are impossible.

Add memset as a tcg helper, and use that for all out-of-line
byte stores to vectors.  This includes, but is not limited to,
the tail clearing operation in question.

Cc: qemu-stable@nongnu.org
Buglink: https://bugs.launchpad.net/bugs/1907817
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

---
 accel/tcg/tcg-runtime.h     | 11 +++++++++++
 include/exec/helper-proto.h |  4 ++++
 tcg/tcg-op-gvec.c           | 32 ++++++++++++++++++++++++++++++++
 3 files changed, 47 insertions(+)

-- 
2.25.1

Comments

Philippe Mathieu-Daudé Dec. 15, 2020, 11:38 p.m. UTC | #1
On 12/15/20 6:48 PM, Richard Henderson wrote:
> In f47db80cc07, we handled odd-sized tail clearing for

> the case of hosts that have vector operations, but did

> not handle the case of hosts that do not have vector ops.

> 

> This was ok until e2e7168a214b, which changed the encoding

> of simd_desc such that the odd sizes are impossible.

> 

> Add memset as a tcg helper, and use that for all out-of-line

> byte stores to vectors.  This includes, but is not limited to,

> the tail clearing operation in question.

> 

> Cc: qemu-stable@nongnu.org

> Buglink: https://bugs.launchpad.net/bugs/1907817

> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

> ---

>  accel/tcg/tcg-runtime.h     | 11 +++++++++++

>  include/exec/helper-proto.h |  4 ++++

>  tcg/tcg-op-gvec.c           | 32 ++++++++++++++++++++++++++++++++

>  3 files changed, 47 insertions(+)

> 

> diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h

> index 4eda24e63a..2e36d6eb0c 100644

> --- a/accel/tcg/tcg-runtime.h

> +++ b/accel/tcg/tcg-runtime.h

> @@ -28,6 +28,17 @@ DEF_HELPER_FLAGS_1(lookup_tb_ptr, TCG_CALL_NO_WG_SE, ptr, env)

>  

>  DEF_HELPER_FLAGS_1(exit_atomic, TCG_CALL_NO_WG, noreturn, env)

>  

> +#ifndef IN_HELPER_PROTO

> +/*

> + * Pass calls to memset directly to libc, without a thunk in qemu.

> + * Do not re-declare memset, especially since we fudge the type here;

> + * we assume sizeof(void *) == sizeof(size_t), which is true for

> + * all supported hosts.

> + */

> +#define helper_memset memset

> +DEF_HELPER_FLAGS_3(memset, TCG_CALL_NO_RWG, ptr, ptr, int, ptr)


Nice :)

> +#endif /* IN_HELPER_PROTO */

> +

>  #ifdef CONFIG_SOFTMMU

>  

>  DEF_HELPER_FLAGS_5(atomic_cmpxchgb, TCG_CALL_NO_WG,

> diff --git a/include/exec/helper-proto.h b/include/exec/helper-proto.h

> index a0a8d9aa46..659f9298e8 100644

> --- a/include/exec/helper-proto.h

> +++ b/include/exec/helper-proto.h

> @@ -35,11 +35,15 @@ dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \

>                              dh_ctype(t4), dh_ctype(t5), dh_ctype(t6), \

>                              dh_ctype(t7));

>  

> +#define IN_HELPER_PROTO

> +

>  #include "helper.h"

>  #include "trace/generated-helpers.h"

>  #include "tcg-runtime.h"

>  #include "plugin-helpers.h"

>  

> +#undef IN_HELPER_PROTO

> +

>  #undef DEF_HELPER_FLAGS_0

>  #undef DEF_HELPER_FLAGS_1

>  #undef DEF_HELPER_FLAGS_2

> diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c

> index ddbe06b71a..6c42d76f3a 100644

> --- a/tcg/tcg-op-gvec.c

> +++ b/tcg/tcg-op-gvec.c

> @@ -547,6 +547,9 @@ static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,

>          in_c = dup_const(vece, in_c);

>          if (in_c == 0) {

>              oprsz = maxsz;

> +            vece = MO_8;

> +        } else if (in_c == dup_const(MO_8, in_c)) {

> +            vece = MO_8;


Nice optimization.

>          }

>      }

>  

> @@ -628,6 +631,35 @@ static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,

>      /* Otherwise implement out of line.  */

>      t_ptr = tcg_temp_new_ptr();

>      tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);

> +

> +    /*

> +     * This may be expand_clr for the tail of an operation, e.g.

> +     * oprsz == 8 && maxsz == 64.  The size of the clear is misaligned

> +     * wrt simd_desc and will assert.  Simply pass all replicated byte

> +     * stores through to memset.

> +     */

> +    if (oprsz == maxsz && vece == MO_8) {

> +        TCGv_ptr t_size = tcg_const_ptr(oprsz);

> +        TCGv_i32 t_val;

> +

> +        if (in_32) {

> +            t_val = in_32;

> +        } else if (in_64) {

> +            t_val = tcg_temp_new_i32();

> +            tcg_gen_extrl_i64_i32(t_val, in_64);

> +        } else {

> +            t_val = tcg_const_i32(in_c);

> +        }

> +        gen_helper_memset(t_ptr, t_ptr, t_val, t_size);

> +

> +        tcg_temp_free_ptr(t_ptr);

> +        tcg_temp_free_ptr(t_size);

> +        if (!in_32) {

> +            tcg_temp_free_i32(t_val);

> +        }

> +        return;


LGTM but I'd rather have another R-b:
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>


BTW (nitpicking) I'd rewrite the epilogue as:

        if (!in_32) {
            tcg_temp_free_i32(t_val);
        }
        tcg_temp_free_ptr(t_size);

        tcg_temp_free_ptr(t_ptr);
        return;

t_val first, because the !in_32 allocs are few lines earlier,
t_ptr last because allocated in prologue, so keep close to
'return'. Matter of taste ;)

> +    }

> +

>      t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));

>  

>      if (vece == MO_64) {

>
Richard Henderson Dec. 16, 2020, 2:28 p.m. UTC | #2
On 12/15/20 5:38 PM, Philippe Mathieu-Daudé wrote:
> BTW (nitpicking) I'd rewrite the epilogue as:

> 

>         if (!in_32) {

>             tcg_temp_free_i32(t_val);

>         }

>         tcg_temp_free_ptr(t_size);

> 

>         tcg_temp_free_ptr(t_ptr);

>         return;

> 

> t_val first, because the !in_32 allocs are few lines earlier,

> t_ptr last because allocated in prologue, so keep close to

> 'return'. Matter of taste ;)


Yah sure, you bettcha.  Queued with this.


r~
diff mbox series

Patch

diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
index 4eda24e63a..2e36d6eb0c 100644
--- a/accel/tcg/tcg-runtime.h
+++ b/accel/tcg/tcg-runtime.h
@@ -28,6 +28,17 @@  DEF_HELPER_FLAGS_1(lookup_tb_ptr, TCG_CALL_NO_WG_SE, ptr, env)
 
 DEF_HELPER_FLAGS_1(exit_atomic, TCG_CALL_NO_WG, noreturn, env)
 
+#ifndef IN_HELPER_PROTO
+/*
+ * Pass calls to memset directly to libc, without a thunk in qemu.
+ * Do not re-declare memset, especially since we fudge the type here;
+ * we assume sizeof(void *) == sizeof(size_t), which is true for
+ * all supported hosts.
+ */
+#define helper_memset memset
+DEF_HELPER_FLAGS_3(memset, TCG_CALL_NO_RWG, ptr, ptr, int, ptr)
+#endif /* IN_HELPER_PROTO */
+
 #ifdef CONFIG_SOFTMMU
 
 DEF_HELPER_FLAGS_5(atomic_cmpxchgb, TCG_CALL_NO_WG,
diff --git a/include/exec/helper-proto.h b/include/exec/helper-proto.h
index a0a8d9aa46..659f9298e8 100644
--- a/include/exec/helper-proto.h
+++ b/include/exec/helper-proto.h
@@ -35,11 +35,15 @@  dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
                             dh_ctype(t4), dh_ctype(t5), dh_ctype(t6), \
                             dh_ctype(t7));
 
+#define IN_HELPER_PROTO
+
 #include "helper.h"
 #include "trace/generated-helpers.h"
 #include "tcg-runtime.h"
 #include "plugin-helpers.h"
 
+#undef IN_HELPER_PROTO
+
 #undef DEF_HELPER_FLAGS_0
 #undef DEF_HELPER_FLAGS_1
 #undef DEF_HELPER_FLAGS_2
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index ddbe06b71a..6c42d76f3a 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -547,6 +547,9 @@  static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
         in_c = dup_const(vece, in_c);
         if (in_c == 0) {
             oprsz = maxsz;
+            vece = MO_8;
+        } else if (in_c == dup_const(MO_8, in_c)) {
+            vece = MO_8;
         }
     }
 
@@ -628,6 +631,35 @@  static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
     /* Otherwise implement out of line.  */
     t_ptr = tcg_temp_new_ptr();
     tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
+
+    /*
+     * This may be expand_clr for the tail of an operation, e.g.
+     * oprsz == 8 && maxsz == 64.  The size of the clear is misaligned
+     * wrt simd_desc and will assert.  Simply pass all replicated byte
+     * stores through to memset.
+     */
+    if (oprsz == maxsz && vece == MO_8) {
+        TCGv_ptr t_size = tcg_const_ptr(oprsz);
+        TCGv_i32 t_val;
+
+        if (in_32) {
+            t_val = in_32;
+        } else if (in_64) {
+            t_val = tcg_temp_new_i32();
+            tcg_gen_extrl_i64_i32(t_val, in_64);
+        } else {
+            t_val = tcg_const_i32(in_c);
+        }
+        gen_helper_memset(t_ptr, t_ptr, t_val, t_size);
+
+        tcg_temp_free_ptr(t_ptr);
+        tcg_temp_free_ptr(t_size);
+        if (!in_32) {
+            tcg_temp_free_i32(t_val);
+        }
+        return;
+    }
+
     t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));
 
     if (vece == MO_64) {