Message ID | 20210503183541.2014496-6-richard.henderson@linaro.org |
---|---|
State | Superseded |
Headers | show |
Series | tcg/s390x: host vector support | expand |
On 03.05.21 20:35, Richard Henderson wrote: > Signed-off-by: Richard Henderson <richard.henderson@linaro.org> > --- > tcg/s390x/tcg-target.c.inc | 117 +++++++++++++++++++++++++++++++++---- > 1 file changed, 105 insertions(+), 12 deletions(-) > > diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc > index 52df20a1ed..6ed9a309c1 100644 > --- a/tcg/s390x/tcg-target.c.inc > +++ b/tcg/s390x/tcg-target.c.inc > @@ -265,6 +265,12 @@ typedef enum S390Opcode { > RX_STC = 0x42, > RX_STH = 0x40, > > + VRX_VL = 0xe706, > + VRX_VLLEZ = 0xe704, > + VRX_VST = 0xe70e, > + VRX_VSTEF = 0xe70b, > + VRX_VSTEG = 0xe70a, > + > NOP = 0x0707, > } S390Opcode; > > @@ -532,6 +538,26 @@ static void tcg_out_insn_RSY(TCGContext *s, S390Opcode op, TCGReg r1, > #define tcg_out_insn_RX tcg_out_insn_RS > #define tcg_out_insn_RXY tcg_out_insn_RSY > > +static int RXB(TCGReg v1, TCGReg v2, TCGReg v3, TCGReg v4) > +{ > + return ((v1 & 16) << (7 - 3)) > + | ((v2 & 16) << (6 - 3)) > + | ((v3 & 16) << (5 - 3)) > + | ((v4 & 16) << (4 - 3)); > +} This is a bit confusing (and maybe wrong?). If we have v1=TCG_REG_V31, corresponding to "32 + 31", we'd get (63 & 16) << (7 - 3) = 16 << 4 = 256 = 0x100 And with v4=TCG_REG_V31 (63 & 16) << (4 - 3) = 16 << 1 = 32 = 0x20 Which doesn't make any sense to me, because the RXB is bits 36-39 in a 48-bit instruction (here: mask 0x0f00 in the last halfword). But maybe I messed up. RXB is 4 bits. Can we just make RXB() return these 4 bits and do any shifting in the caller? At least for me that would be easier to grasp. 
static int RXB(TCGReg v1, TCGReg v2, TCGReg v3, TCGReg v4) { return (v1 - TCG_REG_V0 >= 16) << 3 | (v2 - TCG_REG_V0 >= 16) << 2 | (v3 - TCG_REG_V0 >= 16) << 1 | (v4 - TCG_REG_V0 >= 16); } or without comparisons static uint8_t RXB(TCGReg v1, TCGReg v2, TCGReg v3, TCGReg v4) { return ((v1 & 16) >> (4 - 3)) | ((v2 & 16) >> (4 - 2)) | ((v3 & 16) >> (4 - 1)) | ((v4 & 16) >> (4 - 0)); } And then maybe add a simple static uint16_t RXB_INSTR(TCGReg v1, TCGReg v2, TCGReg v3, TCGReg v4) { return RXB(v1, v2, v3, v4) << 8; } that shifts the RXB into place if necessary. > + > +static void tcg_out_insn_VRX(TCGContext *s, S390Opcode op, TCGReg v1, > + TCGReg b2, TCGReg x2, intptr_t d2, int m3) > +{ > + tcg_debug_assert(v1 >= TCG_REG_V0 && v1 <= TCG_REG_V31); > + tcg_debug_assert(d2 >= 0 && d2 <= 0xfff); > + tcg_debug_assert(x2 <= TCG_REG_R15); > + tcg_debug_assert(b2 <= TCG_REG_R15); > + tcg_out16(s, (op & 0xff00) | ((v1 & 15) << 4) | x2); > + tcg_out16(s, (b2 << 12) | d2); > + tcg_out16(s, (op & 0x00ff) | RXB(v1, 0, 0, 0) | (m3 << 12)); > +} > + > /* Emit an opcode with "type-checking" of the format. */ > #define tcg_out_insn(S, FMT, OP, ...) 
\ > glue(tcg_out_insn_,FMT)(S, glue(glue(FMT,_),OP), ## __VA_ARGS__) > @@ -708,25 +734,92 @@ static void tcg_out_mem(TCGContext *s, S390Opcode opc_rx, S390Opcode opc_rxy, > } > } > > +static void tcg_out_vrx_mem(TCGContext *s, S390Opcode opc_vrx, > + TCGReg data, TCGReg base, TCGReg index, > + tcg_target_long ofs, int m3) > +{ > + if (ofs < 0 || ofs >= 0x1000) { > + if (ofs >= -0x80000 && ofs < 0x80000) { > + tcg_out_insn(s, RXY, LAY, TCG_TMP0, base, index, ofs); > + base = TCG_TMP0; > + index = TCG_REG_NONE; > + ofs = 0; > + } else { > + tcg_out_movi(s, TCG_TYPE_PTR, TCG_TMP0, ofs); > + if (index != TCG_REG_NONE) { > + tcg_out_insn(s, RRE, AGR, TCG_TMP0, index); > + } > + index = TCG_TMP0; > + ofs = 0; > + } > + } > + tcg_out_insn_VRX(s, opc_vrx, data, base, index, ofs, m3); > +} > > /* load data without address translation or endianness conversion */ > -static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg data, > - TCGReg base, intptr_t ofs) > +static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg data, > + TCGReg base, intptr_t ofs) > { > - if (type == TCG_TYPE_I32) { > - tcg_out_mem(s, RX_L, RXY_LY, data, base, TCG_REG_NONE, ofs); > - } else { > - tcg_out_mem(s, 0, RXY_LG, data, base, TCG_REG_NONE, ofs); > + switch (type) { > + case TCG_TYPE_I32: > + if (likely(data < 16)) { > + tcg_out_mem(s, RX_L, RXY_LY, data, base, TCG_REG_NONE, ofs); > + break; > + } > + tcg_out_vrx_mem(s, VRX_VLLEZ, data, base, TCG_REG_NONE, ofs, MO_32); > + break; > + > + case TCG_TYPE_I64: > + if (likely(data < 16)) { > + tcg_out_mem(s, 0, RXY_LG, data, base, TCG_REG_NONE, ofs); > + break; > + } > + /* fallthru */ > + > + case TCG_TYPE_V64: > + tcg_out_vrx_mem(s, VRX_VLLEZ, data, base, TCG_REG_NONE, ofs, MO_64); > + break; > + > + case TCG_TYPE_V128: > + /* Hint quadword aligned. 
*/ > + tcg_out_vrx_mem(s, VRX_VL, data, base, TCG_REG_NONE, ofs, 4); > + break; > + > + default: > + g_assert_not_reached(); > } > } > > -static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg data, > - TCGReg base, intptr_t ofs) > +static void tcg_out_st(TCGContext *s, TCGType type, TCGReg data, > + TCGReg base, intptr_t ofs) > { > - if (type == TCG_TYPE_I32) { > - tcg_out_mem(s, RX_ST, RXY_STY, data, base, TCG_REG_NONE, ofs); > - } else { > - tcg_out_mem(s, 0, RXY_STG, data, base, TCG_REG_NONE, ofs); > + switch (type) { > + case TCG_TYPE_I32: > + if (likely(data < 16)) { > + tcg_out_mem(s, RX_ST, RXY_STY, data, base, TCG_REG_NONE, ofs); > + } else { > + tcg_out_vrx_mem(s, VRX_VSTEF, data, base, TCG_REG_NONE, ofs, 1); > + } > + break; > + > + case TCG_TYPE_I64: > + if (likely(data < 16)) { > + tcg_out_mem(s, 0, RXY_STG, data, base, TCG_REG_NONE, ofs); > + break; > + } > + /* fallthru */ > + > + case TCG_TYPE_V64: > + tcg_out_vrx_mem(s, VRX_VSTEG, data, base, TCG_REG_NONE, ofs, 0); > + break; > + > + case TCG_TYPE_V128: > + /* Hint quadword aligned. */ > + tcg_out_vrx_mem(s, VRX_VST, data, base, TCG_REG_NONE, ofs, 4); > + break; > + > + default: > + g_assert_not_reached(); > } > } > > Remaining stuff LGTM, although I have little experience with that code -- Thanks, David / dhildenb
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc index 52df20a1ed..6ed9a309c1 100644 --- a/tcg/s390x/tcg-target.c.inc +++ b/tcg/s390x/tcg-target.c.inc @@ -265,6 +265,12 @@ typedef enum S390Opcode { RX_STC = 0x42, RX_STH = 0x40, + VRX_VL = 0xe706, + VRX_VLLEZ = 0xe704, + VRX_VST = 0xe70e, + VRX_VSTEF = 0xe70b, + VRX_VSTEG = 0xe70a, + NOP = 0x0707, } S390Opcode; @@ -532,6 +538,26 @@ static void tcg_out_insn_RSY(TCGContext *s, S390Opcode op, TCGReg r1, #define tcg_out_insn_RX tcg_out_insn_RS #define tcg_out_insn_RXY tcg_out_insn_RSY +static int RXB(TCGReg v1, TCGReg v2, TCGReg v3, TCGReg v4) +{ + return ((v1 & 16) << (7 - 3)) + | ((v2 & 16) << (6 - 3)) + | ((v3 & 16) << (5 - 3)) + | ((v4 & 16) << (4 - 3)); +} + +static void tcg_out_insn_VRX(TCGContext *s, S390Opcode op, TCGReg v1, + TCGReg b2, TCGReg x2, intptr_t d2, int m3) +{ + tcg_debug_assert(v1 >= TCG_REG_V0 && v1 <= TCG_REG_V31); + tcg_debug_assert(d2 >= 0 && d2 <= 0xfff); + tcg_debug_assert(x2 <= TCG_REG_R15); + tcg_debug_assert(b2 <= TCG_REG_R15); + tcg_out16(s, (op & 0xff00) | ((v1 & 15) << 4) | x2); + tcg_out16(s, (b2 << 12) | d2); + tcg_out16(s, (op & 0x00ff) | RXB(v1, 0, 0, 0) | (m3 << 12)); +} + /* Emit an opcode with "type-checking" of the format. */ #define tcg_out_insn(S, FMT, OP, ...) 
\ glue(tcg_out_insn_,FMT)(S, glue(glue(FMT,_),OP), ## __VA_ARGS__) @@ -708,25 +734,92 @@ static void tcg_out_mem(TCGContext *s, S390Opcode opc_rx, S390Opcode opc_rxy, } } +static void tcg_out_vrx_mem(TCGContext *s, S390Opcode opc_vrx, + TCGReg data, TCGReg base, TCGReg index, + tcg_target_long ofs, int m3) +{ + if (ofs < 0 || ofs >= 0x1000) { + if (ofs >= -0x80000 && ofs < 0x80000) { + tcg_out_insn(s, RXY, LAY, TCG_TMP0, base, index, ofs); + base = TCG_TMP0; + index = TCG_REG_NONE; + ofs = 0; + } else { + tcg_out_movi(s, TCG_TYPE_PTR, TCG_TMP0, ofs); + if (index != TCG_REG_NONE) { + tcg_out_insn(s, RRE, AGR, TCG_TMP0, index); + } + index = TCG_TMP0; + ofs = 0; + } + } + tcg_out_insn_VRX(s, opc_vrx, data, base, index, ofs, m3); +} /* load data without address translation or endianness conversion */ -static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg data, - TCGReg base, intptr_t ofs) +static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg data, + TCGReg base, intptr_t ofs) { - if (type == TCG_TYPE_I32) { - tcg_out_mem(s, RX_L, RXY_LY, data, base, TCG_REG_NONE, ofs); - } else { - tcg_out_mem(s, 0, RXY_LG, data, base, TCG_REG_NONE, ofs); + switch (type) { + case TCG_TYPE_I32: + if (likely(data < 16)) { + tcg_out_mem(s, RX_L, RXY_LY, data, base, TCG_REG_NONE, ofs); + break; + } + tcg_out_vrx_mem(s, VRX_VLLEZ, data, base, TCG_REG_NONE, ofs, MO_32); + break; + + case TCG_TYPE_I64: + if (likely(data < 16)) { + tcg_out_mem(s, 0, RXY_LG, data, base, TCG_REG_NONE, ofs); + break; + } + /* fallthru */ + + case TCG_TYPE_V64: + tcg_out_vrx_mem(s, VRX_VLLEZ, data, base, TCG_REG_NONE, ofs, MO_64); + break; + + case TCG_TYPE_V128: + /* Hint quadword aligned. 
*/ + tcg_out_vrx_mem(s, VRX_VL, data, base, TCG_REG_NONE, ofs, 4); + break; + + default: + g_assert_not_reached(); } } -static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg data, - TCGReg base, intptr_t ofs) +static void tcg_out_st(TCGContext *s, TCGType type, TCGReg data, + TCGReg base, intptr_t ofs) { - if (type == TCG_TYPE_I32) { - tcg_out_mem(s, RX_ST, RXY_STY, data, base, TCG_REG_NONE, ofs); - } else { - tcg_out_mem(s, 0, RXY_STG, data, base, TCG_REG_NONE, ofs); + switch (type) { + case TCG_TYPE_I32: + if (likely(data < 16)) { + tcg_out_mem(s, RX_ST, RXY_STY, data, base, TCG_REG_NONE, ofs); + } else { + tcg_out_vrx_mem(s, VRX_VSTEF, data, base, TCG_REG_NONE, ofs, 1); + } + break; + + case TCG_TYPE_I64: + if (likely(data < 16)) { + tcg_out_mem(s, 0, RXY_STG, data, base, TCG_REG_NONE, ofs); + break; + } + /* fallthru */ + + case TCG_TYPE_V64: + tcg_out_vrx_mem(s, VRX_VSTEG, data, base, TCG_REG_NONE, ofs, 0); + break; + + case TCG_TYPE_V128: + /* Hint quadword aligned. */ + tcg_out_vrx_mem(s, VRX_VST, data, base, TCG_REG_NONE, ofs, 4); + break; + + default: + g_assert_not_reached(); } }
Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- tcg/s390x/tcg-target.c.inc | 117 +++++++++++++++++++++++++++++++++---- 1 file changed, 105 insertions(+), 12 deletions(-) -- 2.25.1