diff mbox series

[v3,04/11] target/s390x: vxeh2: Update for changes to vector shifts

Message ID 20220308015358.188499-5-richard.henderson@linaro.org
State New
Series s390x/tcg: Implement Vector-Enhancements Facility 2

Commit Message

Richard Henderson March 8, 2022, 1:53 a.m. UTC
From: David Miller <dmiller423@gmail.com>

Prior to vector enhancements 2, the shift count was supposed to be equal
for each byte lest the result be unpredictable, which allowed us to assume
that the shift count was the same, and optimize accordingly.

With vector enhancements 2, the shift count is allowed to be different
for each byte, and we must cope with that.
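
For illustration, the shift-left case now works per byte: each result byte is
its source byte shifted left by its own count, with the vacated low bits
filled from the high bits of the byte to its right (zeroes for the rightmost
byte). A plain-C reference sketch of that behaviour (the function name is
illustrative only; this is not the helper added below):

#include <stdint.h>

static void vsl_ve2_model(uint8_t dst[16], const uint8_t src[16],
                          const uint8_t cnt[16])
{
    for (int i = 0; i < 16; i++) {
        unsigned sh = cnt[i] & 7;                 /* per-byte shift count */
        unsigned lo = (i < 15) ? src[i + 1] : 0;  /* bits shifted in from the right */

        dst[i] = (uint8_t)((src[i] << sh) | (lo >> (8 - sh)));
    }
}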

Signed-off-by: David Miller <dmiller423@gmail.com>
Message-Id: <20220307020327.3003-4-dmiller423@gmail.com>
[rth: Split out of larger patch; simplify shift/merge code.]
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/s390x/helper.h               |  3 ++
 target/s390x/tcg/vec_int_helper.c   | 58 ++++++++++++++++++++++
 target/s390x/tcg/translate_vx.c.inc | 77 ++++++++++++-----------------
 target/s390x/tcg/insn-data.def      | 12 ++---
 4 files changed, 99 insertions(+), 51 deletions(-)

Comments

David Hildenbrand March 21, 2022, 11:15 a.m. UTC | #1
On 08.03.22 02:53, Richard Henderson wrote:
> From: David Miller <dmiller423@gmail.com>
> 
> Prior to vector enhancements 2, the shift count was supposed to be equal
> for each byte lest the result be unpredictable, which allowed us to assume
> that the shift count was the same, and optimize accordingly.
> 
> With vector enhancements 2, the shift count is allowed to be different
> for each byte, and we must cope with that.
> 
> Signed-off-by: David Miller <dmiller423@gmail.com>
> Message-Id: <20220307020327.3003-4-dmiller423@gmail.com>
> [rth: Split out of larger patch; simplify shift/merge code.]
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  target/s390x/helper.h               |  3 ++
>  target/s390x/tcg/vec_int_helper.c   | 58 ++++++++++++++++++++++
>  target/s390x/tcg/translate_vx.c.inc | 77 ++++++++++++-----------------
>  target/s390x/tcg/insn-data.def      | 12 ++---
>  4 files changed, 99 insertions(+), 51 deletions(-)
> 
> diff --git a/target/s390x/helper.h b/target/s390x/helper.h
> index 7412130883..bf33d86f74 100644
> --- a/target/s390x/helper.h
> +++ b/target/s390x/helper.h
> @@ -203,8 +203,11 @@ DEF_HELPER_FLAGS_3(gvec_vpopct16, TCG_CALL_NO_RWG, void, ptr, cptr, i32)
>  DEF_HELPER_FLAGS_4(gvec_verim8, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
>  DEF_HELPER_FLAGS_4(gvec_verim16, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
>  DEF_HELPER_FLAGS_4(gvec_vsl, TCG_CALL_NO_RWG, void, ptr, cptr, i64, i32)
> +DEF_HELPER_FLAGS_4(gvec_vsl_ve2, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
>  DEF_HELPER_FLAGS_4(gvec_vsra, TCG_CALL_NO_RWG, void, ptr, cptr, i64, i32)
> +DEF_HELPER_FLAGS_4(gvec_vsra_ve2, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
>  DEF_HELPER_FLAGS_4(gvec_vsrl, TCG_CALL_NO_RWG, void, ptr, cptr, i64, i32)
> +DEF_HELPER_FLAGS_4(gvec_vsrl_ve2, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
>  DEF_HELPER_FLAGS_4(gvec_vscbi8, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
>  DEF_HELPER_FLAGS_4(gvec_vscbi16, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
>  DEF_HELPER_4(gvec_vtm, void, ptr, cptr, env, i32)
> diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c
> index 5561b3ed90..a881d5d267 100644
> --- a/target/s390x/tcg/vec_int_helper.c
> +++ b/target/s390x/tcg/vec_int_helper.c
> @@ -540,18 +540,76 @@ void HELPER(gvec_vsl)(void *v1, const void *v2, uint64_t count,
>      s390_vec_shl(v1, v2, count);
>  }
>  
> +void HELPER(gvec_vsl_ve2)(void *v1, const void *v2, const void *v3,
> +                          uint32_t desc)
> +{
> +    S390Vector tmp;
> +    uint32_t sh, e0, e1 = 0;

int i;

> +
> +    for (int i = 15; i >= 0; --i, e1 = e0 << 24) {

I'd only do "e1 = e0" here and fold the shift into the rol32 argument ...

> +        e0 = s390_vec_read_element8(v2, i);
> +        sh = s390_vec_read_element8(v3, i) & 7;
> +
> +        s390_vec_write_element8(&tmp, i, rol32(e0 | e1, sh));

... here

s390_vec_write_element8(&tmp, i, rol32(e0 | e1 << 24, sh));
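
Putting the two together, the loop would then read roughly (sketch):

for (int i = 15; i >= 0; --i, e1 = e0) {
    e0 = s390_vec_read_element8(v2, i);
    sh = s390_vec_read_element8(v3, i) & 7;

    s390_vec_write_element8(&tmp, i, rol32(e0 | e1 << 24, sh));
}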

> +    }
> +
> +    *(S390Vector *)v1 = tmp;
> +}
> +
>  void HELPER(gvec_vsra)(void *v1, const void *v2, uint64_t count,
>                         uint32_t desc)
>  {
>      s390_vec_sar(v1, v2, count);
>  }
>  
> +void HELPER(gvec_vsra_ve2)(void *v1, const void *v2, const void *v3,
> +                           uint32_t desc)
> +{
> +    S390Vector tmp;
> +    uint32_t sh, e0, e1;
> +    int i = 0;
> +
> +    e0 = s390_vec_read_element8(v2, 0);
> +    e1 = -(e0 >> 7) << 8;
> +
> +    for (;;) {
> +        sh = s390_vec_read_element8(v3, i) & 7;
> +
> +        s390_vec_write_element8(&tmp, i, (e0 | e1) >> sh);
> +
> +        if (++i >= 16) {
> +            break;
> +        }
> +
> +        e1 = e0 << 8;
> +        e0 = s390_vec_read_element8(v2, i);
> +    }

Can't we use the following, which resembles the other helpers, or am I
missing something?

S390Vector tmp;
uint32_t sh, e0, e1 = 0;

/* Only byte 0 is special: it provides the sign extension. */
e0 = (int32_t)(int8_t)s390_vec_read_element8(v2, 0);
sh = s390_vec_read_element8(v3, 0) & 7;
s390_vec_write_element8(&tmp, 0, e0 >> sh);

e1 = e0;
for (int i = 1; i < 16; ++i, e1 = e0) {
    e0 = s390_vec_read_element8(v2, i);
    sh = s390_vec_read_element8(v3, i) & 7;
    s390_vec_write_element8(&tmp, i, (e0 | e1 << 8) >> sh);
}

*(S390Vector *)v1 = tmp;


> +
> +    *(S390Vector *)v1 = tmp;
> +}
> +
>  void HELPER(gvec_vsrl)(void *v1, const void *v2, uint64_t count,
>                         uint32_t desc)
>  {
>      s390_vec_shr(v1, v2, count);
>  }
>  
> +void HELPER(gvec_vsrl_ve2)(void *v1, const void *v2, const void *v3,
> +                           uint32_t desc)
> +{
> +    S390Vector tmp;
> +    uint32_t sh, e0, e1 = 0;
> +
> +    for (int i = 0; i < 16; ++i, e1 = e0 << 8) {

Ditto, I'd do the shift below ...

> +        e0 = s390_vec_read_element8(v2, i);
> +        sh = s390_vec_read_element8(v3, i) & 7;
> +
> +        s390_vec_write_element8(&tmp, i, (e0 | e1) >> sh);

s390_vec_write_element8(&tmp, i, (e0 | e1 << 8) >> sh);

> +    }
> +
> +    *(S390Vector *)v1 = tmp;
> +}
> +
>  #define DEF_VSCBI(BITS)                                                        \
>  void HELPER(gvec_vscbi##BITS)(void *v1, const void *v2, const void *v3,        \
>                                uint32_t desc)                                   \
> diff --git a/target/s390x/tcg/translate_vx.c.inc b/target/s390x/tcg/translate_vx.c.inc
> index d514e8b218..967f6213d8 100644
> --- a/target/s390x/tcg/translate_vx.c.inc
> +++ b/target/s390x/tcg/translate_vx.c.inc
> @@ -2018,21 +2018,42 @@ static DisasJumpType op_ves(DisasContext *s, DisasOps *o)
>      return DISAS_NEXT;
>  }
>  
> +static DisasJumpType gen_vsh_bit_byte(DisasContext *s, DisasOps *o,
> +                                      gen_helper_gvec_2i *gen,
> +                                      gen_helper_gvec_3 *gen_ve2)
> +{
> +    bool byte = s->insn->data;

Nit: I'd have called this "by_byte".

Patch

diff --git a/target/s390x/helper.h b/target/s390x/helper.h
index 7412130883..bf33d86f74 100644
--- a/target/s390x/helper.h
+++ b/target/s390x/helper.h
@@ -203,8 +203,11 @@  DEF_HELPER_FLAGS_3(gvec_vpopct16, TCG_CALL_NO_RWG, void, ptr, cptr, i32)
 DEF_HELPER_FLAGS_4(gvec_verim8, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
 DEF_HELPER_FLAGS_4(gvec_verim16, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
 DEF_HELPER_FLAGS_4(gvec_vsl, TCG_CALL_NO_RWG, void, ptr, cptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_vsl_ve2, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
 DEF_HELPER_FLAGS_4(gvec_vsra, TCG_CALL_NO_RWG, void, ptr, cptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_vsra_ve2, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
 DEF_HELPER_FLAGS_4(gvec_vsrl, TCG_CALL_NO_RWG, void, ptr, cptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_vsrl_ve2, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
 DEF_HELPER_FLAGS_4(gvec_vscbi8, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
 DEF_HELPER_FLAGS_4(gvec_vscbi16, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
 DEF_HELPER_4(gvec_vtm, void, ptr, cptr, env, i32)
diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c
index 5561b3ed90..a881d5d267 100644
--- a/target/s390x/tcg/vec_int_helper.c
+++ b/target/s390x/tcg/vec_int_helper.c
@@ -540,18 +540,76 @@  void HELPER(gvec_vsl)(void *v1, const void *v2, uint64_t count,
     s390_vec_shl(v1, v2, count);
 }
 
+void HELPER(gvec_vsl_ve2)(void *v1, const void *v2, const void *v3,
+                          uint32_t desc)
+{
+    S390Vector tmp;
+    uint32_t sh, e0, e1 = 0;
+
+    for (int i = 15; i >= 0; --i, e1 = e0 << 24) {
+        e0 = s390_vec_read_element8(v2, i);
+        sh = s390_vec_read_element8(v3, i) & 7;
+
+        s390_vec_write_element8(&tmp, i, rol32(e0 | e1, sh));
+    }
+
+    *(S390Vector *)v1 = tmp;
+}
+
 void HELPER(gvec_vsra)(void *v1, const void *v2, uint64_t count,
                        uint32_t desc)
 {
     s390_vec_sar(v1, v2, count);
 }
 
+void HELPER(gvec_vsra_ve2)(void *v1, const void *v2, const void *v3,
+                           uint32_t desc)
+{
+    S390Vector tmp;
+    uint32_t sh, e0, e1;
+    int i = 0;
+
+    e0 = s390_vec_read_element8(v2, 0);
+    e1 = -(e0 >> 7) << 8;
+
+    for (;;) {
+        sh = s390_vec_read_element8(v3, i) & 7;
+
+        s390_vec_write_element8(&tmp, i, (e0 | e1) >> sh);
+
+        if (++i >= 16) {
+            break;
+        }
+
+        e1 = e0 << 8;
+        e0 = s390_vec_read_element8(v2, i);
+    }
+
+    *(S390Vector *)v1 = tmp;
+}
+
 void HELPER(gvec_vsrl)(void *v1, const void *v2, uint64_t count,
                        uint32_t desc)
 {
     s390_vec_shr(v1, v2, count);
 }
 
+void HELPER(gvec_vsrl_ve2)(void *v1, const void *v2, const void *v3,
+                           uint32_t desc)
+{
+    S390Vector tmp;
+    uint32_t sh, e0, e1 = 0;
+
+    for (int i = 0; i < 16; ++i, e1 = e0 << 8) {
+        e0 = s390_vec_read_element8(v2, i);
+        sh = s390_vec_read_element8(v3, i) & 7;
+
+        s390_vec_write_element8(&tmp, i, (e0 | e1) >> sh);
+    }
+
+    *(S390Vector *)v1 = tmp;
+}
+
 #define DEF_VSCBI(BITS)                                                        \
 void HELPER(gvec_vscbi##BITS)(void *v1, const void *v2, const void *v3,        \
                               uint32_t desc)                                   \
diff --git a/target/s390x/tcg/translate_vx.c.inc b/target/s390x/tcg/translate_vx.c.inc
index d514e8b218..967f6213d8 100644
--- a/target/s390x/tcg/translate_vx.c.inc
+++ b/target/s390x/tcg/translate_vx.c.inc
@@ -2018,21 +2018,42 @@  static DisasJumpType op_ves(DisasContext *s, DisasOps *o)
     return DISAS_NEXT;
 }
 
+static DisasJumpType gen_vsh_bit_byte(DisasContext *s, DisasOps *o,
+                                      gen_helper_gvec_2i *gen,
+                                      gen_helper_gvec_3 *gen_ve2)
+{
+    bool byte = s->insn->data;
+
+    if (!byte && s390_has_feat(S390_FEAT_VECTOR_ENH2)) {
+        gen_gvec_3_ool(get_field(s, v1), get_field(s, v2),
+                       get_field(s, v3), 0, gen_ve2);
+    } else {
+        TCGv_i64 shift = tcg_temp_new_i64();
+
+        read_vec_element_i64(shift, get_field(s, v3), 7, ES_8);
+        tcg_gen_andi_i64(shift, shift, byte ? 0x78 : 7);
+        gen_gvec_2i_ool(get_field(s, v1), get_field(s, v2), shift, 0, gen);
+        tcg_temp_free_i64(shift);
+    }
+    return DISAS_NEXT;
+}
+
 static DisasJumpType op_vsl(DisasContext *s, DisasOps *o)
 {
-    TCGv_i64 shift = tcg_temp_new_i64();
+    return gen_vsh_bit_byte(s, o, gen_helper_gvec_vsl,
+                            gen_helper_gvec_vsl_ve2);
+}
 
-    read_vec_element_i64(shift, get_field(s, v3), 7, ES_8);
-    if (s->fields.op2 == 0x74) {
-        tcg_gen_andi_i64(shift, shift, 0x7);
-    } else {
-        tcg_gen_andi_i64(shift, shift, 0x78);
-    }
+static DisasJumpType op_vsra(DisasContext *s, DisasOps *o)
+{
+    return gen_vsh_bit_byte(s, o, gen_helper_gvec_vsra,
+                            gen_helper_gvec_vsra_ve2);
+}
 
-    gen_gvec_2i_ool(get_field(s, v1), get_field(s, v2),
-                    shift, 0, gen_helper_gvec_vsl);
-    tcg_temp_free_i64(shift);
-    return DISAS_NEXT;
+static DisasJumpType op_vsrl(DisasContext *s, DisasOps *o)
+{
+    return gen_vsh_bit_byte(s, o, gen_helper_gvec_vsrl,
+                            gen_helper_gvec_vsrl_ve2);
 }
 
 static DisasJumpType op_vsldb(DisasContext *s, DisasOps *o)
@@ -2064,40 +2085,6 @@  static DisasJumpType op_vsldb(DisasContext *s, DisasOps *o)
     return DISAS_NEXT;
 }
 
-static DisasJumpType op_vsra(DisasContext *s, DisasOps *o)
-{
-    TCGv_i64 shift = tcg_temp_new_i64();
-
-    read_vec_element_i64(shift, get_field(s, v3), 7, ES_8);
-    if (s->fields.op2 == 0x7e) {
-        tcg_gen_andi_i64(shift, shift, 0x7);
-    } else {
-        tcg_gen_andi_i64(shift, shift, 0x78);
-    }
-
-    gen_gvec_2i_ool(get_field(s, v1), get_field(s, v2),
-                    shift, 0, gen_helper_gvec_vsra);
-    tcg_temp_free_i64(shift);
-    return DISAS_NEXT;
-}
-
-static DisasJumpType op_vsrl(DisasContext *s, DisasOps *o)
-{
-    TCGv_i64 shift = tcg_temp_new_i64();
-
-    read_vec_element_i64(shift, get_field(s, v3), 7, ES_8);
-    if (s->fields.op2 == 0x7c) {
-        tcg_gen_andi_i64(shift, shift, 0x7);
-    } else {
-        tcg_gen_andi_i64(shift, shift, 0x78);
-    }
-
-    gen_gvec_2i_ool(get_field(s, v1), get_field(s, v2),
-                    shift, 0, gen_helper_gvec_vsrl);
-    tcg_temp_free_i64(shift);
-    return DISAS_NEXT;
-}
-
 static DisasJumpType op_vs(DisasContext *s, DisasOps *o)
 {
     const uint8_t es = get_field(s, m4);
diff --git a/target/s390x/tcg/insn-data.def b/target/s390x/tcg/insn-data.def
index 46add91a0e..f487a64abf 100644
--- a/target/s390x/tcg/insn-data.def
+++ b/target/s390x/tcg/insn-data.def
@@ -1204,19 +1204,19 @@ 
     F(0xe778, VESRLV,  VRR_c, V,   0, 0, 0, 0, vesv, 0, IF_VEC)
     F(0xe738, VESRL,   VRS_a, V,   la2, 0, 0, 0, ves, 0, IF_VEC)
 /* VECTOR SHIFT LEFT */
-    F(0xe774, VSL,     VRR_c, V,   0, 0, 0, 0, vsl, 0, IF_VEC)
+    E(0xe774, VSL,     VRR_c, V,   0, 0, 0, 0, vsl, 0, 0, IF_VEC)
 /* VECTOR SHIFT LEFT BY BYTE */
-    F(0xe775, VSLB,    VRR_c, V,   0, 0, 0, 0, vsl, 0, IF_VEC)
+    E(0xe775, VSLB,    VRR_c, V,   0, 0, 0, 0, vsl, 0, 1, IF_VEC)
 /* VECTOR SHIFT LEFT DOUBLE BY BYTE */
     F(0xe777, VSLDB,   VRI_d, V,   0, 0, 0, 0, vsldb, 0, IF_VEC)
 /* VECTOR SHIFT RIGHT ARITHMETIC */
-    F(0xe77e, VSRA,    VRR_c, V,   0, 0, 0, 0, vsra, 0, IF_VEC)
+    E(0xe77e, VSRA,    VRR_c, V,   0, 0, 0, 0, vsra, 0, 0, IF_VEC)
 /* VECTOR SHIFT RIGHT ARITHMETIC BY BYTE */
-    F(0xe77f, VSRAB,   VRR_c, V,   0, 0, 0, 0, vsra, 0, IF_VEC)
+    E(0xe77f, VSRAB,   VRR_c, V,   0, 0, 0, 0, vsra, 0, 1, IF_VEC)
 /* VECTOR SHIFT RIGHT LOGICAL */
-    F(0xe77c, VSRL,    VRR_c, V,   0, 0, 0, 0, vsrl, 0, IF_VEC)
+    E(0xe77c, VSRL,    VRR_c, V,   0, 0, 0, 0, vsrl, 0, 0, IF_VEC)
 /* VECTOR SHIFT RIGHT LOGICAL BY BYTE */
-    F(0xe77d, VSRLB,   VRR_c, V,   0, 0, 0, 0, vsrl, 0, IF_VEC)
+    E(0xe77d, VSRLB,   VRR_c, V,   0, 0, 0, 0, vsrl, 0, 1, IF_VEC)
 /* VECTOR SUBTRACT */
     F(0xe7f7, VS,      VRR_c, V,   0, 0, 0, 0, vs, 0, IF_VEC)
 /* VECTOR SUBTRACT COMPUTE BORROW INDICATION */