diff mbox series

[v2,17/29] tcg: Add gvec expanders for vector shift by scalar

Message ID 20190501050536.15580-18-richard.henderson@linaro.org
State New
Headers show
Series tcg vector improvements | expand

Commit Message

Richard Henderson May 1, 2019, 5:05 a.m. UTC
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

---
 tcg/tcg-op-gvec.h |   7 ++
 tcg/tcg-op.h      |   4 +
 tcg/tcg-op-gvec.c | 204 ++++++++++++++++++++++++++++++++++++++++++++++
 tcg/tcg-op-vec.c  |  54 ++++++++++++
 4 files changed, 269 insertions(+)

-- 
2.17.1

Comments

Alex Bennée May 2, 2019, 2:37 p.m. UTC | #1
Richard Henderson <richard.henderson@linaro.org> writes:

> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

> ---

>  tcg/tcg-op-gvec.h |   7 ++

>  tcg/tcg-op.h      |   4 +

>  tcg/tcg-op-gvec.c | 204 ++++++++++++++++++++++++++++++++++++++++++++++

>  tcg/tcg-op-vec.c  |  54 ++++++++++++

>  4 files changed, 269 insertions(+)

>

> diff --git a/tcg/tcg-op-gvec.h b/tcg/tcg-op-gvec.h

> index 84a6247b16..6ee98f3378 100644

> --- a/tcg/tcg-op-gvec.h

> +++ b/tcg/tcg-op-gvec.h

> @@ -318,6 +318,13 @@ void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,

>  void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,

>                         int64_t shift, uint32_t oprsz, uint32_t maxsz);

>

> +void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,

> +                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);

> +void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,

> +                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);

> +void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,

> +                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);

> +

>  /*

>   * Perform vector shift by vector element, modulo the element size.

>   * E.g.  D[i] = A[i] << (B[i] % (8 << vece)).

> diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h

> index 833c6330b5..472b73cb38 100644

> --- a/tcg/tcg-op.h

> +++ b/tcg/tcg-op.h

> @@ -986,6 +986,10 @@ void tcg_gen_shli_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);

>  void tcg_gen_shri_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);

>  void tcg_gen_sari_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);

>

> +void tcg_gen_shls_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 s);

> +void tcg_gen_shrs_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 s);

> +void tcg_gen_sars_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 s);

> +

>  void tcg_gen_shlv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec s);

>  void tcg_gen_shrv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec s);

>  void tcg_gen_sarv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec s);

> diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c

> index 061ef329f1..8fc5ba042b 100644

> --- a/tcg/tcg-op-gvec.c

> +++ b/tcg/tcg-op-gvec.c

> @@ -2555,6 +2555,210 @@ void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,

>      }

>  }

>

> +/*

> + * Specialized generation vector shifts by a non-constant scalar.

> + */

> +

> +static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,

> +                           uint32_t oprsz, uint32_t tysz, TCGType type,

> +                           TCGv_i32 shift,

> +                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))

> +{

> +    TCGv_vec t0 = tcg_temp_new_vec(type);

> +    uint32_t i;

> +

> +    for (i = 0; i < oprsz; i += tysz) {

> +        tcg_gen_ld_vec(t0, cpu_env, aofs + i);

> +        fni(vece, t0, t0, shift);

> +        tcg_gen_st_vec(t0, cpu_env, dofs + i);

> +    }

> +    tcg_temp_free_vec(t0);

> +}

> +

> +static void

> +do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs,

> +               TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz,

> +               void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32),

> +               void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64),

> +               void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32),

> +               void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec),

> +               gen_helper_gvec_2 *fno,


This prototype seems a little heavy, given that we usually just pass
around a reference to the relevant GVecGenFoo structure with the
various options in it. Why the anti-pattern?

> +               const TCGOpcode *s_list, const TCGOpcode *v_list)

> +{

> +    TCGType type;

> +    uint32_t some;

> +

> +    check_size_align(oprsz, maxsz, dofs | aofs);

> +    check_overlap_2(dofs, aofs, maxsz);

> +

> +    /* If the backend has a scalar expansion, great.  */

> +    type = choose_vector_type(s_list, vece, oprsz, vece == MO_64);

> +    if (type) {

> +        const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);

> +        switch (type) {

> +        case TCG_TYPE_V256:

> +            some = QEMU_ALIGN_DOWN(oprsz, 32);

> +            expand_2sh_vec(vece, dofs, aofs, some, 32,

> +                           TCG_TYPE_V256, shift, fniv_s);

> +            if (some == oprsz) {

> +                break;

> +            }

> +            dofs += some;

> +            aofs += some;

> +            oprsz -= some;

> +            maxsz -= some;

> +            /* fallthru */

> +        case TCG_TYPE_V128:

> +            expand_2sh_vec(vece, dofs, aofs, oprsz, 16,

> +                           TCG_TYPE_V128, shift, fniv_s);

> +            break;

> +        case TCG_TYPE_V64:

> +            expand_2sh_vec(vece, dofs, aofs, oprsz, 8,

> +                           TCG_TYPE_V64, shift, fniv_s);

> +            break;

> +        default:

> +            g_assert_not_reached();

> +        }

> +        tcg_swap_vecop_list(hold_list);

> +        goto clear_tail;

> +    }

> +

> +    /* If the backend supports variable vector shifts, also cool.  */

> +    type = choose_vector_type(v_list, vece, oprsz, vece == MO_64);

> +    if (type) {

> +        const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);

> +        TCGv_vec v_shift = tcg_temp_new_vec(type);

> +

> +        if (vece == MO_64) {

> +            TCGv_i64 sh64 = tcg_temp_new_i64();

> +            tcg_gen_extu_i32_i64(sh64, shift);

> +            tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);

> +            tcg_temp_free_i64(sh64);

> +        } else {

> +            tcg_gen_dup_i32_vec(vece, v_shift, shift);

> +        }

> +

> +        switch (type) {

> +        case TCG_TYPE_V256:

> +            some = QEMU_ALIGN_DOWN(oprsz, 32);

> +            expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,

> +                          v_shift, false, fniv_v);

> +            if (some == oprsz) {

> +                break;

> +            }

> +            dofs += some;

> +            aofs += some;

> +            oprsz -= some;

> +            maxsz -= some;

> +            /* fallthru */

> +        case TCG_TYPE_V128:

> +            expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,

> +                          v_shift, false, fniv_v);

> +            break;

> +        case TCG_TYPE_V64:

> +            expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,

> +                          v_shift, false, fniv_v);

> +            break;

> +        default:

> +            g_assert_not_reached();

> +        }

> +        tcg_temp_free_vec(v_shift);

> +        tcg_swap_vecop_list(hold_list);

> +        goto clear_tail;

> +    }

> +

> +    /* Otherwise fall back to integral... */

> +    if (fni4 && check_size_impl(oprsz, 4)) {

> +        expand_2s_i32(dofs, aofs, oprsz, shift, false, fni4);

> +    } else if (fni8 && check_size_impl(oprsz, 8)) {

> +        TCGv_i64 sh64 = tcg_temp_new_i64();

> +        tcg_gen_extu_i32_i64(sh64, shift);

> +        expand_2s_i64(dofs, aofs, oprsz, sh64, false, fni8);

> +        tcg_temp_free_i64(sh64);

> +    } else {

> +        TCGv_ptr a0 = tcg_temp_new_ptr();

> +        TCGv_ptr a1 = tcg_temp_new_ptr();

> +        TCGv_i32 desc = tcg_temp_new_i32();

> +

> +        tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);

> +        tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));

> +        tcg_gen_addi_ptr(a0, cpu_env, dofs);

> +        tcg_gen_addi_ptr(a1, cpu_env, aofs);

> +

> +        fno(a0, a1, desc);

> +

> +        tcg_temp_free_ptr(a0);

> +        tcg_temp_free_ptr(a1);

> +        tcg_temp_free_i32(desc);

> +        return;

> +    }

> +

> + clear_tail:

> +    if (oprsz < maxsz) {

> +        expand_clr(dofs + oprsz, maxsz - oprsz);

> +    }

> +}

> +

> +void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,

> +                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)

> +{

> +    static const TCGOpcode scalar_list[] = { INDEX_op_shls_vec, 0 };

> +    static const TCGOpcode vector_list[] = { INDEX_op_shlv_vec, 0 };

> +    static gen_helper_gvec_2 * const fno[4] = {

> +        gen_helper_gvec_shl8i,

> +        gen_helper_gvec_shl16i,

> +        gen_helper_gvec_shl32i,

> +        gen_helper_gvec_shl64i,

> +    };

> +

> +    tcg_debug_assert(vece <= MO_64);

> +    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz,

> +                   vece == MO_32 ? tcg_gen_shl_i32 : NULL,

> +                   vece == MO_64 ? tcg_gen_shl_i64 : NULL,

> +                   tcg_gen_shls_vec, tcg_gen_shlv_vec, fno[vece],

> +                   scalar_list, vector_list);


Hmm I guess:

    static GVecGenFoo const ops[4] = {
        {
            .fno = gen_helper_gvec_shl8i
        },
        {
            .fno = gen_helper_gvec_shl16i
        },
        {
            .fno = gen_helper_gvec_shl32i,
            .fni4 = tcg_gen_shl_i32
        },
        {
            .fno = gen_helper_gvec_shl64i,
            .fni8 = tcg_gen_shl_i64
        }
    };
    tcg_debug_assert(vece <= MO_64);
    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &ops[vece],
                   tcg_gen_shls_vec, tcg_gen_shlv_vec,
                   scalar_list, vector_list);

gets a little verbose....

> +}

> +

> +void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,

> +                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)

> +{

> +    static const TCGOpcode scalar_list[] = { INDEX_op_shrs_vec, 0 };

> +    static const TCGOpcode vector_list[] = { INDEX_op_shrv_vec, 0 };

> +    static gen_helper_gvec_2 * const fno[4] = {

> +        gen_helper_gvec_shr8i,

> +        gen_helper_gvec_shr16i,

> +        gen_helper_gvec_shr32i,

> +        gen_helper_gvec_shr64i,

> +    };

> +

> +    tcg_debug_assert(vece <= MO_64);

> +    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz,

> +                   vece == MO_32 ? tcg_gen_shr_i32 : NULL,

> +                   vece == MO_64 ? tcg_gen_shr_i64 : NULL,

> +                   tcg_gen_shrs_vec, tcg_gen_shrv_vec, fno[vece],

> +                   scalar_list, vector_list);

> +}

> +

> +void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,

> +                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)

> +{

> +    static const TCGOpcode scalar_list[] = { INDEX_op_sars_vec, 0 };

> +    static const TCGOpcode vector_list[] = { INDEX_op_sarv_vec, 0 };

> +    static gen_helper_gvec_2 * const fno[4] = {

> +        gen_helper_gvec_sar8i,

> +        gen_helper_gvec_sar16i,

> +        gen_helper_gvec_sar32i,

> +        gen_helper_gvec_sar64i,

> +    };

> +

> +    tcg_debug_assert(vece <= MO_64);

> +    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz,

> +                   vece == MO_32 ? tcg_gen_sar_i32 : NULL,

> +                   vece == MO_64 ? tcg_gen_sar_i64 : NULL,

> +                   tcg_gen_sars_vec, tcg_gen_sarv_vec, fno[vece],

> +                   scalar_list, vector_list);

> +}

> +

>  /*

>   * Expand D = A << (B % element bits)

>   *

> diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c

> index 96317dbd10..16062f5995 100644

> --- a/tcg/tcg-op-vec.c

> +++ b/tcg/tcg-op-vec.c

> @@ -598,3 +598,57 @@ void tcg_gen_sarv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)

>  {

>      do_op3(vece, r, a, b, INDEX_op_sarv_vec);

>  }

> +

> +static void do_shifts(unsigned vece, TCGv_vec r, TCGv_vec a,

> +                      TCGv_i32 s, TCGOpcode opc_s, TCGOpcode opc_v)

> +{

> +    TCGTemp *rt = tcgv_vec_temp(r);

> +    TCGTemp *at = tcgv_vec_temp(a);

> +    TCGTemp *st = tcgv_i32_temp(s);

> +    TCGArg ri = temp_arg(rt);

> +    TCGArg ai = temp_arg(at);

> +    TCGArg si = temp_arg(st);

> +    TCGType type = rt->base_type;

> +    const TCGOpcode *hold_list;

> +    int can;

> +

> +    tcg_debug_assert(at->base_type >= type);

> +    tcg_assert_listed_vecop(opc_s);

> +    hold_list = tcg_swap_vecop_list(NULL);

> +

> +    can = tcg_can_emit_vec_op(opc_s, type, vece);

> +    if (can > 0) {

> +        vec_gen_3(opc_s, type, vece, ri, ai, si);

> +    } else if (can < 0) {

> +        tcg_expand_vec_op(opc_s, type, vece, ri, ai, si);

> +    } else {

> +        TCGv_vec vec_s = tcg_temp_new_vec(type);

> +

> +        if (vece == MO_64) {

> +            TCGv_i64 s64 = tcg_temp_new_i64();

> +            tcg_gen_extu_i32_i64(s64, s);

> +            tcg_gen_dup_i64_vec(MO_64, vec_s, s64);

> +            tcg_temp_free_i64(s64);

> +        } else {

> +            tcg_gen_dup_i32_vec(vece, vec_s, s);

> +        }

> +        do_op3(vece, r, a, vec_s, opc_v);

> +        tcg_temp_free_vec(vec_s);

> +    }

> +    tcg_swap_vecop_list(hold_list);

> +}

> +

> +void tcg_gen_shls_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 b)

> +{

> +    do_shifts(vece, r, a, b, INDEX_op_shls_vec, INDEX_op_shlv_vec);

> +}

> +

> +void tcg_gen_shrs_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 b)

> +{

> +    do_shifts(vece, r, a, b, INDEX_op_shrs_vec, INDEX_op_shrv_vec);

> +}

> +

> +void tcg_gen_sars_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 b)

> +{

> +    do_shifts(vece, r, a, b, INDEX_op_sars_vec, INDEX_op_sarv_vec);

> +}



Otherwise:

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>


--
Alex Bennée
Richard Henderson May 2, 2019, 3:46 p.m. UTC | #2
On 5/2/19 7:37 AM, Alex Bennée wrote:
>> +void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,

>> +                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)

>> +{

>> +    static const TCGOpcode scalar_list[] = { INDEX_op_shls_vec, 0 };

>> +    static const TCGOpcode vector_list[] = { INDEX_op_shlv_vec, 0 };

>> +    static gen_helper_gvec_2 * const fno[4] = {

>> +        gen_helper_gvec_shl8i,

>> +        gen_helper_gvec_shl16i,

>> +        gen_helper_gvec_shl32i,

>> +        gen_helper_gvec_shl64i,

>> +    };

>> +

>> +    tcg_debug_assert(vece <= MO_64);

>> +    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz,

>> +                   vece == MO_32 ? tcg_gen_shl_i32 : NULL,

>> +                   vece == MO_64 ? tcg_gen_shl_i64 : NULL,

>> +                   tcg_gen_shls_vec, tcg_gen_shlv_vec, fno[vece],

>> +                   scalar_list, vector_list);

> 

> Hmm I guess:

> 

>     static GVecGenFoo const ops[4] = {

>         {

>             .fno = gen_helper_gvec_shl8i

>         },

>         {

>             .fno = gen_helper_gvec_shl16i

>         },

>         {

>             .fno = gen_helper_gvec_shl32i,

>             .fni4 = tcg_gen_shl_i32

>         },

>         {

>             .fno = gen_helper_gvec_shl64i,

>             .fni8 = tcg_gen_shl_i64

>         }

>     };

>     tcg_debug_assert(vece <= MO_64);

>     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &ops[vece],

>                    tcg_gen_shls_vec, tcg_gen_shlv_vec,

>                    scalar_list, vector_list);

> 

> gets a little verbose....


That's exactly it.

The GVecGenFoo structures were created so that front ends would be able to
define their own.  For that I wanted full generality.  This case didn't seem to
warrant that.

I suppose I could create a denser GVecGenFoo for this case,
which actually seems like a good idea now that I think about it.

r~
diff mbox series

Patch

diff --git a/tcg/tcg-op-gvec.h b/tcg/tcg-op-gvec.h
index 84a6247b16..6ee98f3378 100644
--- a/tcg/tcg-op-gvec.h
+++ b/tcg/tcg-op-gvec.h
@@ -318,6 +318,13 @@  void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
                        int64_t shift, uint32_t oprsz, uint32_t maxsz);
 
+void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
+
 /*
  * Perform vector shift by vector element, modulo the element size.
  * E.g.  D[i] = A[i] << (B[i] % (8 << vece)).
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 833c6330b5..472b73cb38 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -986,6 +986,10 @@  void tcg_gen_shli_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
 void tcg_gen_shri_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
 void tcg_gen_sari_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
 
+void tcg_gen_shls_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 s);
+void tcg_gen_shrs_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 s);
+void tcg_gen_sars_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 s);
+
 void tcg_gen_shlv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec s);
 void tcg_gen_shrv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec s);
 void tcg_gen_sarv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec s);
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index 061ef329f1..8fc5ba042b 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -2555,6 +2555,210 @@  void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
     }
 }
 
+/*
+ * Specialized generation vector shifts by a non-constant scalar.
+ */
+
+static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
+                           uint32_t oprsz, uint32_t tysz, TCGType type,
+                           TCGv_i32 shift,
+                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
+{
+    TCGv_vec t0 = tcg_temp_new_vec(type);
+    uint32_t i;
+
+    for (i = 0; i < oprsz; i += tysz) {
+        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
+        fni(vece, t0, t0, shift);
+        tcg_gen_st_vec(t0, cpu_env, dofs + i);
+    }
+    tcg_temp_free_vec(t0);
+}
+
+static void
+do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs,
+               TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz,
+               void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32),
+               void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64),
+               void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32),
+               void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec),
+               gen_helper_gvec_2 *fno,
+               const TCGOpcode *s_list, const TCGOpcode *v_list)
+{
+    TCGType type;
+    uint32_t some;
+
+    check_size_align(oprsz, maxsz, dofs | aofs);
+    check_overlap_2(dofs, aofs, maxsz);
+
+    /* If the backend has a scalar expansion, great.  */
+    type = choose_vector_type(s_list, vece, oprsz, vece == MO_64);
+    if (type) {
+        const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
+        switch (type) {
+        case TCG_TYPE_V256:
+            some = QEMU_ALIGN_DOWN(oprsz, 32);
+            expand_2sh_vec(vece, dofs, aofs, some, 32,
+                           TCG_TYPE_V256, shift, fniv_s);
+            if (some == oprsz) {
+                break;
+            }
+            dofs += some;
+            aofs += some;
+            oprsz -= some;
+            maxsz -= some;
+            /* fallthru */
+        case TCG_TYPE_V128:
+            expand_2sh_vec(vece, dofs, aofs, oprsz, 16,
+                           TCG_TYPE_V128, shift, fniv_s);
+            break;
+        case TCG_TYPE_V64:
+            expand_2sh_vec(vece, dofs, aofs, oprsz, 8,
+                           TCG_TYPE_V64, shift, fniv_s);
+            break;
+        default:
+            g_assert_not_reached();
+        }
+        tcg_swap_vecop_list(hold_list);
+        goto clear_tail;
+    }
+
+    /* If the backend supports variable vector shifts, also cool.  */
+    type = choose_vector_type(v_list, vece, oprsz, vece == MO_64);
+    if (type) {
+        const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
+        TCGv_vec v_shift = tcg_temp_new_vec(type);
+
+        if (vece == MO_64) {
+            TCGv_i64 sh64 = tcg_temp_new_i64();
+            tcg_gen_extu_i32_i64(sh64, shift);
+            tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);
+            tcg_temp_free_i64(sh64);
+        } else {
+            tcg_gen_dup_i32_vec(vece, v_shift, shift);
+        }
+
+        switch (type) {
+        case TCG_TYPE_V256:
+            some = QEMU_ALIGN_DOWN(oprsz, 32);
+            expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,
+                          v_shift, false, fniv_v);
+            if (some == oprsz) {
+                break;
+            }
+            dofs += some;
+            aofs += some;
+            oprsz -= some;
+            maxsz -= some;
+            /* fallthru */
+        case TCG_TYPE_V128:
+            expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
+                          v_shift, false, fniv_v);
+            break;
+        case TCG_TYPE_V64:
+            expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
+                          v_shift, false, fniv_v);
+            break;
+        default:
+            g_assert_not_reached();
+        }
+        tcg_temp_free_vec(v_shift);
+        tcg_swap_vecop_list(hold_list);
+        goto clear_tail;
+    }
+
+    /* Otherwise fall back to integral... */
+    if (fni4 && check_size_impl(oprsz, 4)) {
+        expand_2s_i32(dofs, aofs, oprsz, shift, false, fni4);
+    } else if (fni8 && check_size_impl(oprsz, 8)) {
+        TCGv_i64 sh64 = tcg_temp_new_i64();
+        tcg_gen_extu_i32_i64(sh64, shift);
+        expand_2s_i64(dofs, aofs, oprsz, sh64, false, fni8);
+        tcg_temp_free_i64(sh64);
+    } else {
+        TCGv_ptr a0 = tcg_temp_new_ptr();
+        TCGv_ptr a1 = tcg_temp_new_ptr();
+        TCGv_i32 desc = tcg_temp_new_i32();
+
+        tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);
+        tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
+        tcg_gen_addi_ptr(a0, cpu_env, dofs);
+        tcg_gen_addi_ptr(a1, cpu_env, aofs);
+
+        fno(a0, a1, desc);
+
+        tcg_temp_free_ptr(a0);
+        tcg_temp_free_ptr(a1);
+        tcg_temp_free_i32(desc);
+        return;
+    }
+
+ clear_tail:
+    if (oprsz < maxsz) {
+        expand_clr(dofs + oprsz, maxsz - oprsz);
+    }
+}
+
+void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
+{
+    static const TCGOpcode scalar_list[] = { INDEX_op_shls_vec, 0 };
+    static const TCGOpcode vector_list[] = { INDEX_op_shlv_vec, 0 };
+    static gen_helper_gvec_2 * const fno[4] = {
+        gen_helper_gvec_shl8i,
+        gen_helper_gvec_shl16i,
+        gen_helper_gvec_shl32i,
+        gen_helper_gvec_shl64i,
+    };
+
+    tcg_debug_assert(vece <= MO_64);
+    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz,
+                   vece == MO_32 ? tcg_gen_shl_i32 : NULL,
+                   vece == MO_64 ? tcg_gen_shl_i64 : NULL,
+                   tcg_gen_shls_vec, tcg_gen_shlv_vec, fno[vece],
+                   scalar_list, vector_list);
+}
+
+void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
+{
+    static const TCGOpcode scalar_list[] = { INDEX_op_shrs_vec, 0 };
+    static const TCGOpcode vector_list[] = { INDEX_op_shrv_vec, 0 };
+    static gen_helper_gvec_2 * const fno[4] = {
+        gen_helper_gvec_shr8i,
+        gen_helper_gvec_shr16i,
+        gen_helper_gvec_shr32i,
+        gen_helper_gvec_shr64i,
+    };
+
+    tcg_debug_assert(vece <= MO_64);
+    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz,
+                   vece == MO_32 ? tcg_gen_shr_i32 : NULL,
+                   vece == MO_64 ? tcg_gen_shr_i64 : NULL,
+                   tcg_gen_shrs_vec, tcg_gen_shrv_vec, fno[vece],
+                   scalar_list, vector_list);
+}
+
+void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
+{
+    static const TCGOpcode scalar_list[] = { INDEX_op_sars_vec, 0 };
+    static const TCGOpcode vector_list[] = { INDEX_op_sarv_vec, 0 };
+    static gen_helper_gvec_2 * const fno[4] = {
+        gen_helper_gvec_sar8i,
+        gen_helper_gvec_sar16i,
+        gen_helper_gvec_sar32i,
+        gen_helper_gvec_sar64i,
+    };
+
+    tcg_debug_assert(vece <= MO_64);
+    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz,
+                   vece == MO_32 ? tcg_gen_sar_i32 : NULL,
+                   vece == MO_64 ? tcg_gen_sar_i64 : NULL,
+                   tcg_gen_sars_vec, tcg_gen_sarv_vec, fno[vece],
+                   scalar_list, vector_list);
+}
+
 /*
  * Expand D = A << (B % element bits)
  *
diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
index 96317dbd10..16062f5995 100644
--- a/tcg/tcg-op-vec.c
+++ b/tcg/tcg-op-vec.c
@@ -598,3 +598,57 @@  void tcg_gen_sarv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
 {
     do_op3(vece, r, a, b, INDEX_op_sarv_vec);
 }
+
+static void do_shifts(unsigned vece, TCGv_vec r, TCGv_vec a,
+                      TCGv_i32 s, TCGOpcode opc_s, TCGOpcode opc_v)
+{
+    TCGTemp *rt = tcgv_vec_temp(r);
+    TCGTemp *at = tcgv_vec_temp(a);
+    TCGTemp *st = tcgv_i32_temp(s);
+    TCGArg ri = temp_arg(rt);
+    TCGArg ai = temp_arg(at);
+    TCGArg si = temp_arg(st);
+    TCGType type = rt->base_type;
+    const TCGOpcode *hold_list;
+    int can;
+
+    tcg_debug_assert(at->base_type >= type);
+    tcg_assert_listed_vecop(opc_s);
+    hold_list = tcg_swap_vecop_list(NULL);
+
+    can = tcg_can_emit_vec_op(opc_s, type, vece);
+    if (can > 0) {
+        vec_gen_3(opc_s, type, vece, ri, ai, si);
+    } else if (can < 0) {
+        tcg_expand_vec_op(opc_s, type, vece, ri, ai, si);
+    } else {
+        TCGv_vec vec_s = tcg_temp_new_vec(type);
+
+        if (vece == MO_64) {
+            TCGv_i64 s64 = tcg_temp_new_i64();
+            tcg_gen_extu_i32_i64(s64, s);
+            tcg_gen_dup_i64_vec(MO_64, vec_s, s64);
+            tcg_temp_free_i64(s64);
+        } else {
+            tcg_gen_dup_i32_vec(vece, vec_s, s);
+        }
+        do_op3(vece, r, a, vec_s, opc_v);
+        tcg_temp_free_vec(vec_s);
+    }
+    tcg_swap_vecop_list(hold_list);
+}
+
+void tcg_gen_shls_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 b)
+{
+    do_shifts(vece, r, a, b, INDEX_op_shls_vec, INDEX_op_shlv_vec);
+}
+
+void tcg_gen_shrs_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 b)
+{
+    do_shifts(vece, r, a, b, INDEX_op_shrs_vec, INDEX_op_shrv_vec);
+}
+
+void tcg_gen_sars_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 b)
+{
+    do_shifts(vece, r, a, b, INDEX_op_sars_vec, INDEX_op_sarv_vec);
+}