diff mbox series

[v3,07/11] target/s390x: vxeh2: vector {load, store} byte reversed elements

Message ID 20220308015358.188499-8-richard.henderson@linaro.org
State New
Headers show
Series s390x/tcg: Implement Vector-Enhancements Facility 2 | expand

Commit Message

Richard Henderson March 8, 2022, 1:53 a.m. UTC
From: David Miller <dmiller423@gmail.com>

Signed-off-by: David Miller <dmiller423@gmail.com>
Message-Id: <20220307020327.3003-6-dmiller423@gmail.com>
[rth: Split out elements (plural) from element (scalar)
      Use tcg little-endian memory ops, plus hswap and wswap.]
Signed-off-by: Richard Henderson <richard.henderson@linar.org>
---
 target/s390x/tcg/translate_vx.c.inc | 101 ++++++++++++++++++++++++++++
 target/s390x/tcg/insn-data.def      |   4 ++
 2 files changed, 105 insertions(+)

Comments

David Hildenbrand March 21, 2022, 11:45 a.m. UTC | #1
On 08.03.22 02:53, Richard Henderson wrote:
> From: David Miller <dmiller423@gmail.com>
> 
> Signed-off-by: David Miller <dmiller423@gmail.com>
> Message-Id: <20220307020327.3003-6-dmiller423@gmail.com>
> [rth: Split out elements (plural) from element (scalar)
>       Use tcg little-endian memory ops, plus hswap and wswap.]
> Signed-off-by: Richard Henderson <richard.henderson@linar.org>
> ---
>  target/s390x/tcg/translate_vx.c.inc | 101 ++++++++++++++++++++++++++++
>  target/s390x/tcg/insn-data.def      |   4 ++
>  2 files changed, 105 insertions(+)
> 
> diff --git a/target/s390x/tcg/translate_vx.c.inc b/target/s390x/tcg/translate_vx.c.inc
> index ac807122a3..9a82401d71 100644
> --- a/target/s390x/tcg/translate_vx.c.inc
> +++ b/target/s390x/tcg/translate_vx.c.inc
> @@ -457,6 +457,56 @@ static DisasJumpType op_vlrep(DisasContext *s, DisasOps *o)
>      return DISAS_NEXT;
>  }
>  
> +static DisasJumpType op_vlbr(DisasContext *s, DisasOps *o)
> +{
> +    const uint8_t es = get_field(s, m3);
> +    TCGv_i64 t0, t1, tt;
> +
> +    if (es < ES_16 || es > ES_128) {
> +        gen_program_exception(s, PGM_SPECIFICATION);
> +        return DISAS_NORETURN;
> +    }
> +
> +    t0 = tcg_temp_new_i64();
> +    t1 = tcg_temp_new_i64();
> +
> +    /* Begin with byte reversed doublewords... */
> +    tcg_gen_qemu_ld_i64(t0, o->addr1, get_mem_index(s), MO_LEUQ);
> +    gen_addi_and_wrap_i64(s, o->addr1, o->addr1, 8);
> +    tcg_gen_qemu_ld_i64(t1, o->addr1, get_mem_index(s), MO_LEUQ);
> +

Would it make sense to just special-case ES_128, by loading them into
the proper t0/t1 right away?

if (es == ES_128) {
    tcg_gen_qemu_ld_i64(t1, o->addr1, get_mem_index(s), MO_LEUQ);
    gen_addi_and_wrap_i64(s, o->addr1, o->addr1, 8);
    tcg_gen_qemu_ld_i64(t0, o->addr1, get_mem_index(s), MO_LEUQ);
    goto write;
}

/* Begin with byte reversed doublewords... */
tcg_gen_qemu_ld_i64(t0, o->addr1, get_mem_index(s), MO_LEUQ);
gen_addi_and_wrap_i64(s, o->addr1, o->addr1, 8);
tcg_gen_qemu_ld_i64(t1, o->addr1, get_mem_index(s), MO_LEUQ);

/*
 * For 16 and 32-bit elements, the doubleword bswap also reversed
 * the order of the elements.  Perform a larger order swap to put
 * them back into place.
 */
switch (es) {
...
}

write:
write_vec_element_i64(t0, get_field(s, v1), 0, ES_64);
write_vec_element_i64(t1, get_field(s, v1), 1, ES_64);

> +    /*
> +     * For 16 and 32-bit elements, the doubleword bswap also reversed
> +     * the order of the elements.  Perform a larger order swap to put
> +     * them back into place.  For the 128-bit "element", finish the
> +     * bswap by swapping the doublewords.
> +     */
> +    switch (es) {
> +    case ES_16:
> +        tcg_gen_hswap_i64(t0, t0);
> +        tcg_gen_hswap_i64(t1, t1);
> +        break;
> +    case ES_32:
> +        tcg_gen_wswap_i64(t0, t0);
> +        tcg_gen_wswap_i64(t1, t1);
> +        break;
> +    case ES_64:
> +        break;
> +    case ES_128:
> +        tt = t0, t0 = t1, t1 = tt;
> +        break;
> +    default:
> +        g_assert_not_reached();
> +    }
> +
> +    write_vec_element_i64(t0, get_field(s, v1), 0, ES_64);
> +    write_vec_element_i64(t1, get_field(s, v1), 1, ES_64);
> +
> +    tcg_temp_free(t0);
> +    tcg_temp_free(t1);
> +    return DISAS_NEXT;
> +}
> +
>  static DisasJumpType op_vle(DisasContext *s, DisasOps *o)
>  {
>      const uint8_t es = s->insn->data;
> @@ -998,6 +1048,57 @@ static DisasJumpType op_vst(DisasContext *s, DisasOps *o)
>      return DISAS_NEXT;
>  }
>  
> +static DisasJumpType op_vstbr(DisasContext *s, DisasOps *o)
> +{
> +    const uint8_t es = get_field(s, m3);
> +    TCGv_i64 t0, t1, tt;
> +
> +    if (es < ES_16 || es > ES_128) {
> +        gen_program_exception(s, PGM_SPECIFICATION);
> +        return DISAS_NORETURN;
> +    }
> +
> +    /* Probe write access before actually modifying memory */
> +    gen_helper_probe_write_access(cpu_env, o->addr1, tcg_constant_i64(16));
> +
> +    t0 = tcg_temp_new_i64();
> +    t1 = tcg_temp_new_i64();
> +    read_vec_element_i64(t0, get_field(s, v1), 0, ES_64);
> +    read_vec_element_i64(t1, get_field(s, v1), 1, ES_64);


Dito, eventually just special case on MO_128 directly.

> +
> +    /*
> +     * For 16 and 32-bit elements, the doubleword bswap below will
> +     * reverse the order of the elements.  Perform a larger order
> +     * swap to put them back into place.  For the 128-bit "element",
> +     * finish the bswap by swapping the doublewords.
> +     */
> +    switch (es) {
> +    case MO_16:
> +        tcg_gen_hswap_i64(t0, t0);
> +        tcg_gen_hswap_i64(t1, t1);
> +        break;
> +    case MO_32:
> +        tcg_gen_wswap_i64(t0, t0);
> +        tcg_gen_wswap_i64(t1, t1);
> +        break;
> +    case MO_64:
> +        break;
> +    case MO_128:
> +        tt = t0, t0 = t1, t1 = tt;
> +        break;
> +    default:
> +        g_assert_not_reached();
> +    }
> +
> +    tcg_gen_qemu_st_i64(t0, o->addr1, get_mem_index(s), MO_LEUQ);
> +    gen_addi_and_wrap_i64(s, o->addr1, o->addr1, 8);
> +    tcg_gen_qemu_st_i64(t1, o->addr1, get_mem_index(s), MO_LEUQ);
> +
> +    tcg_temp_free(t0);
> +    tcg_temp_free(t1);
> +    return DISAS_NEXT;
> +}
> +
>  static DisasJumpType op_vste(DisasContext *s, DisasOps *o)
>  {
>      const uint8_t es = s->insn->data;
> diff --git a/target/s390x/tcg/insn-data.def b/target/s390x/tcg/insn-data.def
> index b524541a7d..ee6e1dc9e5 100644
> --- a/target/s390x/tcg/insn-data.def
> +++ b/target/s390x/tcg/insn-data.def
> @@ -1027,6 +1027,8 @@
>      F(0xe756, VLR,     VRR_a, V,   0, 0, 0, 0, vlr, 0, IF_VEC)
>  /* VECTOR LOAD AND REPLICATE */
>      F(0xe705, VLREP,   VRX,   V,   la2, 0, 0, 0, vlrep, 0, IF_VEC)
> +/* VECTOR LOAD BYTE REVERSED ELEMENTS */
> +    F(0xe606, VLBR,    VRX,   VE2, la2, 0, 0, 0, vlbr, 0, IF_VEC)
>  /* VECTOR LOAD ELEMENT */
>      E(0xe700, VLEB,    VRX,   V,   la2, 0, 0, 0, vle, 0, ES_8, IF_VEC)
>      E(0xe701, VLEH,    VRX,   V,   la2, 0, 0, 0, vle, 0, ES_16, IF_VEC)
> @@ -1079,6 +1081,8 @@
>      F(0xe75f, VSEG,    VRR_a, V,   0, 0, 0, 0, vseg, 0, IF_VEC)
>  /* VECTOR STORE */
>      F(0xe70e, VST,     VRX,   V,   la2, 0, 0, 0, vst, 0, IF_VEC)
> +/* VECTOR STORE BYTE REVERSED ELEMENTS */
> +    F(0xe60e, VSTBR,    VRX,   VE2, la2, 0, 0, 0, vstbr, 0, IF_VEC)
>  /* VECTOR STORE ELEMENT */
>      E(0xe708, VSTEB,   VRX,   V,   la2, 0, 0, 0, vste, 0, ES_8, IF_VEC)
>      E(0xe709, VSTEH,   VRX,   V,   la2, 0, 0, 0, vste, 0, ES_16, IF_VEC)
diff mbox series

Patch

diff --git a/target/s390x/tcg/translate_vx.c.inc b/target/s390x/tcg/translate_vx.c.inc
index ac807122a3..9a82401d71 100644
--- a/target/s390x/tcg/translate_vx.c.inc
+++ b/target/s390x/tcg/translate_vx.c.inc
@@ -457,6 +457,56 @@  static DisasJumpType op_vlrep(DisasContext *s, DisasOps *o)
     return DISAS_NEXT;
 }
 
+static DisasJumpType op_vlbr(DisasContext *s, DisasOps *o)
+{
+    const uint8_t es = get_field(s, m3);
+    TCGv_i64 t0, t1, tt;
+
+    if (es < ES_16 || es > ES_128) {
+        gen_program_exception(s, PGM_SPECIFICATION);
+        return DISAS_NORETURN;
+    }
+
+    t0 = tcg_temp_new_i64();
+    t1 = tcg_temp_new_i64();
+
+    /* Begin with byte reversed doublewords... */
+    tcg_gen_qemu_ld_i64(t0, o->addr1, get_mem_index(s), MO_LEUQ);
+    gen_addi_and_wrap_i64(s, o->addr1, o->addr1, 8);
+    tcg_gen_qemu_ld_i64(t1, o->addr1, get_mem_index(s), MO_LEUQ);
+
+    /*
+     * For 16 and 32-bit elements, the doubleword bswap also reversed
+     * the order of the elements.  Perform a larger order swap to put
+     * them back into place.  For the 128-bit "element", finish the
+     * bswap by swapping the doublewords.
+     */
+    switch (es) {
+    case ES_16:
+        tcg_gen_hswap_i64(t0, t0);
+        tcg_gen_hswap_i64(t1, t1);
+        break;
+    case ES_32:
+        tcg_gen_wswap_i64(t0, t0);
+        tcg_gen_wswap_i64(t1, t1);
+        break;
+    case ES_64:
+        break;
+    case ES_128:
+        tt = t0, t0 = t1, t1 = tt;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    write_vec_element_i64(t0, get_field(s, v1), 0, ES_64);
+    write_vec_element_i64(t1, get_field(s, v1), 1, ES_64);
+
+    tcg_temp_free(t0);
+    tcg_temp_free(t1);
+    return DISAS_NEXT;
+}
+
 static DisasJumpType op_vle(DisasContext *s, DisasOps *o)
 {
     const uint8_t es = s->insn->data;
@@ -998,6 +1048,57 @@  static DisasJumpType op_vst(DisasContext *s, DisasOps *o)
     return DISAS_NEXT;
 }
 
+static DisasJumpType op_vstbr(DisasContext *s, DisasOps *o)
+{
+    const uint8_t es = get_field(s, m3);
+    TCGv_i64 t0, t1, tt;
+
+    if (es < ES_16 || es > ES_128) {
+        gen_program_exception(s, PGM_SPECIFICATION);
+        return DISAS_NORETURN;
+    }
+
+    /* Probe write access before actually modifying memory */
+    gen_helper_probe_write_access(cpu_env, o->addr1, tcg_constant_i64(16));
+
+    t0 = tcg_temp_new_i64();
+    t1 = tcg_temp_new_i64();
+    read_vec_element_i64(t0, get_field(s, v1), 0, ES_64);
+    read_vec_element_i64(t1, get_field(s, v1), 1, ES_64);
+
+    /*
+     * For 16 and 32-bit elements, the doubleword bswap below will
+     * reverse the order of the elements.  Perform a larger order
+     * swap to put them back into place.  For the 128-bit "element",
+     * finish the bswap by swapping the doublewords.
+     */
+    switch (es) {
+    case MO_16:
+        tcg_gen_hswap_i64(t0, t0);
+        tcg_gen_hswap_i64(t1, t1);
+        break;
+    case MO_32:
+        tcg_gen_wswap_i64(t0, t0);
+        tcg_gen_wswap_i64(t1, t1);
+        break;
+    case MO_64:
+        break;
+    case MO_128:
+        tt = t0, t0 = t1, t1 = tt;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    tcg_gen_qemu_st_i64(t0, o->addr1, get_mem_index(s), MO_LEUQ);
+    gen_addi_and_wrap_i64(s, o->addr1, o->addr1, 8);
+    tcg_gen_qemu_st_i64(t1, o->addr1, get_mem_index(s), MO_LEUQ);
+
+    tcg_temp_free(t0);
+    tcg_temp_free(t1);
+    return DISAS_NEXT;
+}
+
 static DisasJumpType op_vste(DisasContext *s, DisasOps *o)
 {
     const uint8_t es = s->insn->data;
diff --git a/target/s390x/tcg/insn-data.def b/target/s390x/tcg/insn-data.def
index b524541a7d..ee6e1dc9e5 100644
--- a/target/s390x/tcg/insn-data.def
+++ b/target/s390x/tcg/insn-data.def
@@ -1027,6 +1027,8 @@ 
     F(0xe756, VLR,     VRR_a, V,   0, 0, 0, 0, vlr, 0, IF_VEC)
 /* VECTOR LOAD AND REPLICATE */
     F(0xe705, VLREP,   VRX,   V,   la2, 0, 0, 0, vlrep, 0, IF_VEC)
+/* VECTOR LOAD BYTE REVERSED ELEMENTS */
+    F(0xe606, VLBR,    VRX,   VE2, la2, 0, 0, 0, vlbr, 0, IF_VEC)
 /* VECTOR LOAD ELEMENT */
     E(0xe700, VLEB,    VRX,   V,   la2, 0, 0, 0, vle, 0, ES_8, IF_VEC)
     E(0xe701, VLEH,    VRX,   V,   la2, 0, 0, 0, vle, 0, ES_16, IF_VEC)
@@ -1079,6 +1081,8 @@ 
     F(0xe75f, VSEG,    VRR_a, V,   0, 0, 0, 0, vseg, 0, IF_VEC)
 /* VECTOR STORE */
     F(0xe70e, VST,     VRX,   V,   la2, 0, 0, 0, vst, 0, IF_VEC)
+/* VECTOR STORE BYTE REVERSED ELEMENTS */
+    F(0xe60e, VSTBR,    VRX,   VE2, la2, 0, 0, 0, vstbr, 0, IF_VEC)
 /* VECTOR STORE ELEMENT */
     E(0xe708, VSTEB,   VRX,   V,   la2, 0, 0, 0, vste, 0, ES_8, IF_VEC)
     E(0xe709, VSTEH,   VRX,   V,   la2, 0, 0, 0, vste, 0, ES_16, IF_VEC)