| Message ID | 20200930145523.71087-4-david@redhat.com |
|---|---|
| State | Accepted |
| Commit | 8c18fa5b3eba2b5c4d1285714682db066ea711fa |
| Series | s390x/tcg: Implement Vector enhancements facility and switch to z14 |
On 9/30/20 9:55 AM, David Hildenbrand wrote:
> +    /* Multiply both even elements from v2 and v3 */
> +    read_vec_element_i64(l1, get_field(s, v2), 0, ES_64);
> +    read_vec_element_i64(h1, get_field(s, v3), 0, ES_64);
> +    tcg_gen_mulu2_i64(l1, h1, l1, h1);
> +    /* Shift result left by one bit if requested */
> +    if (extract32(get_field(s, m6), 3, 1)) {
> +        tcg_gen_extract2_i64(h1, l1, h1, 63);
> +        tcg_gen_shli_i64(l1, l1, 1);
> +    }

Not a bug, but some hosts require 3 insns for extract2 (so 4 total for this
sequence).

This doubling can also be had via add2:

    tcg_gen_add2_i64(l1, h1, l1, h1, l1, h1);

At which point most hosts will require only 2 insns for this sequence. The
two hosts that don't have a carry bit (mips, riscv) will still be able to
perform the add in 3 insns.

So add is never more expensive and sometimes half as expensive.

Regardless,
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>

r~
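For readers following along, here is a minimal standalone C sketch (not part of the thread) of the equivalence Richard describes: doubling a 128-bit value held as a high/low pair of 64-bit words can be done either by shifting the pair left by one bit (the extract2 + shl variant) or by adding the pair to itself, letting the carry out of the low half feed the high half (the add2 variant).

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* arbitrary example value: a 128-bit quantity held as h:l */
        uint64_t l = 0x9000000000000001ULL, h = 0x0000000000000003ULL;

        /* variant 1: extract2 + shl, i.e. shift the pair left by one bit */
        uint64_t h_shift = (h << 1) | (l >> 63);
        uint64_t l_shift = l << 1;

        /* variant 2: add2, i.e. add the pair to itself; the carry out of
         * the low half propagates into the high half */
        uint64_t l_add = l + l;
        uint64_t h_add = h + h + (l_add < l);  /* carry == unsigned overflow */

        printf("%d\n", l_shift == l_add && h_shift == h_add);  /* prints 1 */
        return 0;
    }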
On 01.10.20 17:26, Richard Henderson wrote:
> On 9/30/20 9:55 AM, David Hildenbrand wrote:
>> +    /* Multiply both even elements from v2 and v3 */
>> +    read_vec_element_i64(l1, get_field(s, v2), 0, ES_64);
>> +    read_vec_element_i64(h1, get_field(s, v3), 0, ES_64);
>> +    tcg_gen_mulu2_i64(l1, h1, l1, h1);
>> +    /* Shift result left by one bit if requested */
>> +    if (extract32(get_field(s, m6), 3, 1)) {
>> +        tcg_gen_extract2_i64(h1, l1, h1, 63);
>> +        tcg_gen_shli_i64(l1, l1, 1);
>> +    }
>
> Not a bug, but some hosts require 3 insns for extract2 (so 4 total for this
> sequence).
>
> This doubling can also be had via add2:
>
>     tcg_gen_add2_i64(l1, h1, l1, h1, l1, h1);

Took me longer than it should have to realize that this is really just
doubling the value ... will use tcg_gen_add2_i64() and add a comment.

Thanks!

-- 
Thanks,

David / dhildenb
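To make the follow-up concrete, here is a sketch of how the even-element part of the sequence could look with Richard's suggestion applied (illustrative only: the variable names and helpers come from the patch below, and the comment wording of the final commit may differ):

    /* Multiply both even elements from v2 and v3 */
    read_vec_element_i64(l1, get_field(s, v2), 0, ES_64);
    read_vec_element_i64(h1, get_field(s, v3), 0, ES_64);
    tcg_gen_mulu2_i64(l1, h1, l1, h1);
    /* Shift result left by one bit if requested */
    if (extract32(get_field(s, m6), 3, 1)) {
        /* doubling the 128-bit product: add it to itself */
        tcg_gen_add2_i64(l1, h1, l1, h1, l1, h1);
    }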
diff --git a/target/s390x/insn-data.def b/target/s390x/insn-data.def
index b55cb44f60..da7fe6f21c 100644
--- a/target/s390x/insn-data.def
+++ b/target/s390x/insn-data.def
@@ -1151,6 +1151,8 @@
     F(0xe7a7, VMO,     VRR_c, V,   0, 0, 0, 0, vm, 0, IF_VEC)
 /* VECTOR MULTIPLY LOGICAL ODD */
     F(0xe7a5, VMLO,    VRR_c, V,   0, 0, 0, 0, vm, 0, IF_VEC)
+/* VECTOR MULTIPLY SUM LOGICAL */
+    F(0xe7b8, VMSL,    VRR_d, VE,  0, 0, 0, 0, vmsl, 0, IF_VEC)
 /* VECTOR NAND */
     F(0xe76e, VNN,     VRR_c, VE,  0, 0, 0, 0, vnn, 0, IF_VEC)
 /* VECTOR NOR */
diff --git a/target/s390x/translate_vx.c.inc b/target/s390x/translate_vx.c.inc
index 44f54a79f4..4c1b430013 100644
--- a/target/s390x/translate_vx.c.inc
+++ b/target/s390x/translate_vx.c.inc
@@ -1779,6 +1779,58 @@ static DisasJumpType op_vm(DisasContext *s, DisasOps *o)
     return DISAS_NEXT;
 }
 
+static DisasJumpType op_vmsl(DisasContext *s, DisasOps *o)
+{
+    TCGv_i64 l1, h1, l2, h2;
+
+    if (get_field(s, m4) != ES_64) {
+        gen_program_exception(s, PGM_SPECIFICATION);
+        return DISAS_NORETURN;
+    }
+
+    l1 = tcg_temp_new_i64();
+    h1 = tcg_temp_new_i64();
+    l2 = tcg_temp_new_i64();
+    h2 = tcg_temp_new_i64();
+
+    /* Multiply both even elements from v2 and v3 */
+    read_vec_element_i64(l1, get_field(s, v2), 0, ES_64);
+    read_vec_element_i64(h1, get_field(s, v3), 0, ES_64);
+    tcg_gen_mulu2_i64(l1, h1, l1, h1);
+    /* Shift result left by one bit if requested */
+    if (extract32(get_field(s, m6), 3, 1)) {
+        tcg_gen_extract2_i64(h1, l1, h1, 63);
+        tcg_gen_shli_i64(l1, l1, 1);
+    }
+
+    /* Multiply both odd elements from v2 and v3 */
+    read_vec_element_i64(l2, get_field(s, v2), 1, ES_64);
+    read_vec_element_i64(h2, get_field(s, v3), 1, ES_64);
+    tcg_gen_mulu2_i64(l2, h2, l2, h2);
+    /* Shift result left by one bit if requested */
+    if (extract32(get_field(s, m6), 2, 1)) {
+        tcg_gen_extract2_i64(h2, l2, h2, 63);
+        tcg_gen_shli_i64(l2, l2, 1);
+    }
+
+    /* Add both intermediate results */
+    tcg_gen_add2_i64(l1, h1, l1, h1, l2, h2);
+    /* Add whole v4 */
+    read_vec_element_i64(h2, get_field(s, v4), 0, ES_64);
+    read_vec_element_i64(l2, get_field(s, v4), 1, ES_64);
+    tcg_gen_add2_i64(l1, h1, l1, h1, l2, h2);
+
+    /* Store final result into v1. */
+    write_vec_element_i64(h1, get_field(s, v1), 0, ES_64);
+    write_vec_element_i64(l1, get_field(s, v1), 1, ES_64);
+
+    tcg_temp_free_i64(l1);
+    tcg_temp_free_i64(h1);
+    tcg_temp_free_i64(l2);
+    tcg_temp_free_i64(h2);
+    return DISAS_NEXT;
+}
+
 static DisasJumpType op_vnn(DisasContext *s, DisasOps *o)
 {
     gen_gvec_fn_3(nand, ES_8, get_field(s, v1),
Fortunately, we only need the Doubleword implementation.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 target/s390x/insn-data.def      |  2 ++
 target/s390x/translate_vx.c.inc | 52 +++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+)
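As a cross-check of what the translator in the diff above computes, here is a hedged host-side C model of the doubleword-only VMSL semantics (the struct layout, function name and element ordering are mine for illustration, not QEMU's): multiply the even and odd doubleword pairs of v2 and v3 into 64x64 -> 128-bit products, optionally double each product according to bits 3 and 2 of m6 (mirroring the extract32() calls in the patch), and sum both products plus the full 128-bit v4 modulo 2^128.

    #include <stdint.h>

    typedef unsigned __int128 u128;

    /* Illustrative vector type: d[0] is the even (leftmost) doubleword,
     * d[1] the odd one, matching the element indices used in the patch. */
    typedef struct { uint64_t d[2]; } Vec128;

    Vec128 vmsl_dw_model(Vec128 v2, Vec128 v3, Vec128 v4, unsigned m6)
    {
        /* 64x64 -> 128-bit products of the even and odd element pairs */
        u128 even = (u128)v2.d[0] * v3.d[0];
        u128 odd  = (u128)v2.d[1] * v3.d[1];

        /* optional doubling, controlled by m6 bits 3 and 2 */
        if ((m6 >> 3) & 1) {
            even <<= 1;
        }
        if ((m6 >> 2) & 1) {
            odd <<= 1;
        }

        /* sum both products and the whole 128-bit v4, modulo 2^128 */
        u128 acc = even + odd + (((u128)v4.d[0] << 64) | v4.d[1]);

        return (Vec128){ .d = { (uint64_t)(acc >> 64), (uint64_t)acc } };
    }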