===================================================================
@@ -392,6 +392,8 @@ OPTAB_D (set_thread_pointer_optab, "set_
OPTAB_D (gather_load_optab, "gather_load$a")
OPTAB_D (mask_gather_load_optab, "mask_gather_load$a")
+OPTAB_D (scatter_store_optab, "scatter_store$a")
+OPTAB_D (mask_scatter_store_optab, "mask_scatter_store$a")
OPTAB_DC (vec_duplicate_optab, "vec_duplicate$a", VEC_DUPLICATE)
OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
===================================================================
@@ -4934,6 +4934,35 @@ operand 5. Bit @var{i} of the mask is s
of the result should be loaded from memory and clear if element @var{i}
of the result should be set to zero.
+@cindex @code{scatter_store@var{m}} instruction pattern
+@item @samp{scatter_store@var{m}}
+Store a vector of mode @var{m} into several distinct memory locations.
+Operand 0 is a scalar base address and operand 1 is a vector of offsets
+from that base. Operand 4 is the vector of values that should be stored,
+which has the same number of elements as the offset vector. For each element
+index @var{i}:
+
+@itemize @bullet
+@item
+extend the offset element @var{i} to address width, using zero
+extension if operand 2 is 1 and sign extension if operand 2 is 0;
+@item
+multiply the extended offset by operand 3;
+@item
+add the result to the base; and
+@item
+store element @var{i} of operand 4 to that address.
+@end itemize
+
+The value of operand 2 does not matter if the offsets are already
+address width.
+
+@cindex @code{mask_scatter_store@var{m}} instruction pattern
+@item @samp{mask_scatter_store@var{m}}
+Like @samp{scatter_store@var{m}}, but takes an extra mask operand as
+operand 5. Bit @var{i} of the mask is set if element @var{i}
+of operand 4 should be stored to memory.
+
@cindex @code{vec_set@var{m}} instruction pattern
@item @samp{vec_set@var{m}}
Set given field in the vector value. Operand 0 is the vector to modify,
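
For reference, the per-element behaviour documented above corresponds to the
following scalar loop. This is a minimal C sketch with one illustrative choice
of types (32-bit data, 32-bit offsets, four elements per vector); it is not
part of the patch:

  #include <stdint.h>

  /* Sketch of scatter_store<m> semantics for 32-bit data and offsets.
     The element count and type names are illustrative only.  */
  void
  scatter_store_sketch (char *base, const int32_t offset[4],
                        int zero_extend_p, int64_t scale,
                        const int32_t value[4])
  {
    for (int i = 0; i < 4; ++i)
      {
        /* Extend offset element i to address width; operand 2 selects
           zero versus sign extension.  */
        uint64_t ext = (zero_extend_p
                        ? (uint64_t) (uint32_t) offset[i]
                        : (uint64_t) (int64_t) offset[i]);
        /* Multiply by the scale (operand 3), add the base (operand 0)
           and store element i of the value vector (operand 4).  */
        *(int32_t *) (base + ext * scale) = value[i];
      }
  }

mask_scatter_store<m> behaves the same way, except that elements whose bit in
the mask (operand 5) is clear are not stored.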
===================================================================
@@ -239,6 +239,8 @@ main (int argc, const char **argv)
" mode. */\n"
" bool supports_vec_gather_load;\n"
" bool supports_vec_gather_load_cached;\n"
+ " bool supports_vec_scatter_store;\n"
+ " bool supports_vec_scatter_store_cached;\n"
"};\n"
"extern void init_all_optabs (struct target_optabs *);\n"
"\n"
===================================================================
@@ -6319,11 +6319,18 @@ gimple_expr_type (const gimple *stmt)
if (code == GIMPLE_CALL)
{
const gcall *call_stmt = as_a <const gcall *> (stmt);
- if (gimple_call_internal_p (call_stmt)
- && gimple_call_internal_fn (call_stmt) == IFN_MASK_STORE)
- return TREE_TYPE (gimple_call_arg (call_stmt, 3));
- else
- return gimple_call_return_type (call_stmt);
+ if (gimple_call_internal_p (call_stmt))
+ switch (gimple_call_internal_fn (call_stmt))
+ {
+ case IFN_MASK_STORE:
+ case IFN_SCATTER_STORE:
+ return TREE_TYPE (gimple_call_arg (call_stmt, 3));
+ case IFN_MASK_SCATTER_STORE:
+ return TREE_TYPE (gimple_call_arg (call_stmt, 4));
+ default:
+ break;
+ }
+ return gimple_call_return_type (call_stmt);
}
else if (code == GIMPLE_ASSIGN)
{
===================================================================
@@ -52,6 +52,7 @@ along with GCC; see the file COPYING3.
- mask_store: currently just maskstore
- store_lanes: currently just vec_store_lanes
- mask_store_lanes: currently just vec_mask_store_lanes
+ - scatter_store: used for {mask_,}scatter_store
- unary: a normal unary optab, such as vec_reverse_<mode>
- binary: a normal binary optab, such as vec_interleave_lo_<mode>
@@ -115,6 +116,10 @@ DEF_INTERNAL_OPTAB_FN (GATHER_LOAD, ECF_
DEF_INTERNAL_OPTAB_FN (MASK_GATHER_LOAD, ECF_PURE,
mask_gather_load, gather_load)
+DEF_INTERNAL_OPTAB_FN (SCATTER_STORE, 0, scatter_store, scatter_store)
+DEF_INTERNAL_OPTAB_FN (MASK_SCATTER_STORE, 0,
+ mask_scatter_store, scatter_store)
+
DEF_INTERNAL_OPTAB_FN (MASK_STORE, 0, maskstore, mask_store)
DEF_INTERNAL_OPTAB_FN (STORE_LANES, ECF_CONST, vec_store_lanes, store_lanes)
DEF_INTERNAL_OPTAB_FN (MASK_STORE_LANES, 0,
===================================================================
@@ -193,8 +193,10 @@ extern bool set_edom_supported_p (void);
extern internal_fn get_conditional_internal_fn (tree_code, tree);
extern bool internal_load_fn_p (internal_fn);
+extern bool internal_store_fn_p (internal_fn);
extern bool internal_gather_scatter_fn_p (internal_fn);
extern int internal_fn_mask_index (internal_fn);
+extern int internal_fn_stored_value_index (internal_fn);
extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
tree, signop, int);
===================================================================
@@ -87,6 +87,7 @@ #define gather_load_direct { -1, 1, fals
#define mask_store_direct { 3, 2, false }
#define store_lanes_direct { 0, 0, false }
#define mask_store_lanes_direct { 0, 0, false }
+#define scatter_store_direct { 3, 1, false }
#define unary_direct { 0, 0, true }
#define binary_direct { 0, 0, true }
#define cond_unary_direct { 1, 1, true }
@@ -2677,6 +2678,42 @@ expand_LAUNDER (internal_fn, gcall *call
expand_assignment (lhs, gimple_call_arg (call, 0), false);
}
+/* Expand {MASK_,}SCATTER_STORE call CALL using optab OPTAB. */
+
+static void
+expand_scatter_store_optab_fn (internal_fn, gcall *stmt, direct_optab optab)
+{
+ internal_fn ifn = gimple_call_internal_fn (stmt);
+ int rhs_index = internal_fn_stored_value_index (ifn);
+ int mask_index = internal_fn_mask_index (ifn);
+ tree base = gimple_call_arg (stmt, 0);
+ tree offset = gimple_call_arg (stmt, 1);
+ tree scale = gimple_call_arg (stmt, 2);
+ tree rhs = gimple_call_arg (stmt, rhs_index);
+
+ rtx base_rtx = expand_normal (base);
+ rtx offset_rtx = expand_normal (offset);
+ HOST_WIDE_INT scale_int = tree_to_shwi (scale);
+ rtx rhs_rtx = expand_normal (rhs);
+
+ struct expand_operand ops[6];
+ int i = 0;
+ create_address_operand (&ops[i++], base_rtx);
+ create_input_operand (&ops[i++], offset_rtx, TYPE_MODE (TREE_TYPE (offset)));
+ create_integer_operand (&ops[i++], TYPE_UNSIGNED (TREE_TYPE (offset)));
+ create_integer_operand (&ops[i++], scale_int);
+ create_input_operand (&ops[i++], rhs_rtx, TYPE_MODE (TREE_TYPE (rhs)));
+ if (mask_index >= 0)
+ {
+ tree mask = gimple_call_arg (stmt, mask_index);
+ rtx mask_rtx = expand_normal (mask);
+ create_input_operand (&ops[i++], mask_rtx, TYPE_MODE (TREE_TYPE (mask)));
+ }
+
+ insn_code icode = direct_optab_handler (optab, TYPE_MODE (TREE_TYPE (rhs)));
+ expand_insn (icode, i, ops);
+}
+
/* Expand {MASK_,}GATHER_LOAD call CALL using optab OPTAB. */
static void
@@ -2952,6 +2989,7 @@ #define direct_gather_load_optab_support
#define direct_mask_store_optab_supported_p direct_optab_supported_p
#define direct_store_lanes_optab_supported_p multi_vector_optab_supported_p
#define direct_mask_store_lanes_optab_supported_p multi_vector_optab_supported_p
+#define direct_scatter_store_optab_supported_p direct_optab_supported_p
#define direct_while_optab_supported_p convert_optab_supported_p
#define direct_fold_extract_optab_supported_p direct_optab_supported_p
@@ -3094,6 +3132,25 @@ internal_load_fn_p (internal_fn fn)
}
}
+/* Return true if IFN is some form of store to memory. */
+
+bool
+internal_store_fn_p (internal_fn fn)
+{
+ switch (fn)
+ {
+ case IFN_MASK_STORE:
+ case IFN_STORE_LANES:
+ case IFN_MASK_STORE_LANES:
+ case IFN_SCATTER_STORE:
+ case IFN_MASK_SCATTER_STORE:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
/* Return true if IFN is some form of gather load or scatter store. */
bool
@@ -3103,6 +3160,8 @@ internal_gather_scatter_fn_p (internal_f
{
case IFN_GATHER_LOAD:
case IFN_MASK_GATHER_LOAD:
+ case IFN_SCATTER_STORE:
+ case IFN_MASK_SCATTER_STORE:
return true;
default:
@@ -3127,6 +3186,27 @@ internal_fn_mask_index (internal_fn fn)
case IFN_MASK_GATHER_LOAD:
return 3;
+ case IFN_MASK_SCATTER_STORE:
+ return 4;
+
+ default:
+ return -1;
+ }
+}
+
+/* If FN takes a value that should be stored to memory, return the index
+ of that argument, otherwise return -1. */
+
+int
+internal_fn_stored_value_index (internal_fn fn)
+{
+ switch (fn)
+ {
+ case IFN_MASK_STORE:
+ case IFN_SCATTER_STORE:
+ case IFN_MASK_SCATTER_STORE:
+ return 3;
+
default:
return -1;
}
@@ -3151,9 +3231,12 @@ internal_gather_scatter_fn_supported_p (
return false;
optab optab = direct_internal_fn_optab (ifn);
insn_code icode = direct_optab_handler (optab, TYPE_MODE (vector_type));
+ int output_ops = internal_load_fn_p (ifn) ? 1 : 0;
return (icode != CODE_FOR_nothing
- && insn_operand_matches (icode, 3, GEN_INT (offset_sign == UNSIGNED))
- && insn_operand_matches (icode, 4, GEN_INT (scale)));
+ && insn_operand_matches (icode, 2 + output_ops,
+ GEN_INT (offset_sign == UNSIGNED))
+ && insn_operand_matches (icode, 3 + output_ops,
+ GEN_INT (scale)));
}
/* Expand STMT as though it were a call to internal function FN. */
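
As an illustration of how the new internal functions and index helpers fit
together (a fragment relying on GCC's internal API, not code from the patch;
the base, offset, scale, value and mask trees are hypothetical inputs
supplied by the caller):

  /* Build a MASK_SCATTER_STORE call with the argument order assumed by
     internal_fn_mask_index and internal_fn_stored_value_index above
     (base, offset, scale, value, mask), then read the stored value back.  */
  static tree
  build_and_query_scatter (tree base, tree offset, tree scale,
                           tree value, tree mask)
  {
    gcall *call = gimple_build_call_internal (IFN_MASK_SCATTER_STORE, 5,
                                              base, offset, scale,
                                              value, mask);
    /* For IFN_MASK_SCATTER_STORE the helpers return 3 and 4 respectively;
       the unmasked IFN_SCATTER_STORE also stores argument 3 but has
       no mask.  */
    int value_index = internal_fn_stored_value_index (IFN_MASK_SCATTER_STORE);
    gcc_assert (internal_fn_mask_index (IFN_MASK_SCATTER_STORE) == 4);
    return gimple_call_arg (call, value_index);
  }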
===================================================================
@@ -188,6 +188,7 @@ bool can_atomic_exchange_p (machine_mode
bool can_atomic_load_p (machine_mode);
bool lshift_cheap_p (bool);
bool supports_vec_gather_load_p ();
+bool supports_vec_scatter_store_p ();
/* Version of find_widening_optab_handler_and_mode that operates on
specific mode types. */
===================================================================
@@ -650,3 +650,21 @@ supports_vec_gather_load_p ()
return this_fn_optabs->supports_vec_gather_load;
}
+
+/* Return true if vec_scatter_store is available for at least one vector
+ mode. */
+
+bool
+supports_vec_scatter_store_p ()
+{
+ if (this_fn_optabs->supports_vec_scatter_store_cached)
+ return this_fn_optabs->supports_vec_scatter_store;
+
+ this_fn_optabs->supports_vec_scatter_store_cached = true;
+
+ this_fn_optabs->supports_vec_scatter_store
+ = supports_at_least_one_mode_p (scatter_store_optab);
+
+ return this_fn_optabs->supports_vec_scatter_store;
+}
+
===================================================================
@@ -1412,6 +1412,7 @@ extern void vect_finish_replace_stmt (gi
extern void vect_finish_stmt_generation (gimple *, gimple *,
gimple_stmt_iterator *);
extern bool vect_mark_stmts_to_be_vectorized (loop_vec_info);
+extern tree vect_get_store_rhs (gimple *);
extern tree vect_get_vec_def_for_operand_1 (gimple *, enum vect_def_type);
extern tree vect_get_vec_def_for_operand (tree, gimple *, tree = NULL);
extern void vect_get_vec_defs (tree, tree, gimple *, vec<tree> *,
===================================================================
@@ -2659,6 +2659,9 @@ vect_analyze_data_ref_access (struct dat
loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
struct loop *loop = NULL;
+ if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+ return true;
+
if (loop_vinfo)
loop = LOOP_VINFO_LOOP (loop_vinfo);
@@ -3331,7 +3334,7 @@ vect_gather_scatter_fn_p (bool read_p, b
if (read_p)
ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
else
- return false;
+ ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
/* Test whether the target supports this combination. */
if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
@@ -3403,7 +3406,8 @@ vect_check_gather_scatter (gimple *stmt,
/* True if we should aim to use internal functions rather than
built-in functions. */
bool use_ifn_p = (DR_IS_READ (dr)
- && supports_vec_gather_load_p ());
+ ? supports_vec_gather_load_p ()
+ : supports_vec_scatter_store_p ());
base = DR_REF (dr);
/* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
@@ -3716,7 +3720,8 @@ vect_analyze_data_refs (vec_info *vinfo,
bool maybe_scatter
= DR_IS_WRITE (dr)
&& !TREE_THIS_VOLATILE (DR_REF (dr))
- && targetm.vectorize.builtin_scatter != NULL;
+ && (targetm.vectorize.builtin_scatter != NULL
+ || supports_vec_scatter_store_p ());
bool maybe_simd_lane_access
= is_a <loop_vec_info> (vinfo) && loop->simduid;
===================================================================
@@ -4207,10 +4207,6 @@ vect_try_gather_scatter_pattern (gimple
if (!dr || !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
return NULL;
- /* Reject stores for now. */
- if (!DR_IS_READ (dr))
- return NULL;
-
/* Get the boolean that controls whether the load or store happens.
This is null if the operation is unconditional. */
tree mask = vect_get_load_store_mask (stmt);
@@ -4249,8 +4245,16 @@ vect_try_gather_scatter_pattern (gimple
gimple_call_set_lhs (pattern_stmt, load_lhs);
}
else
- /* Not yet supported. */
- gcc_unreachable ();
+ {
+ tree rhs = vect_get_store_rhs (stmt);
+ if (mask != NULL)
+ pattern_stmt = gimple_build_call_internal (IFN_MASK_SCATTER_STORE, 5,
+ base, offset, scale, rhs,
+ mask);
+ else
+ pattern_stmt = gimple_build_call_internal (IFN_SCATTER_STORE, 4,
+ base, offset, scale, rhs);
+ }
gimple_call_set_nothrow (pattern_stmt, true);
/* Copy across relevant vectorization info and associate DR with the
===================================================================
@@ -395,12 +395,13 @@ exist_non_indexing_operands_for_use_p (t
if (mask_index >= 0
&& use == gimple_call_arg (stmt, mask_index))
return true;
+ int stored_value_index = internal_fn_stored_value_index (ifn);
+ if (stored_value_index >= 0
+ && use == gimple_call_arg (stmt, stored_value_index))
+ return true;
if (internal_gather_scatter_fn_p (ifn)
&& use == gimple_call_arg (stmt, 1))
return true;
- if (ifn == IFN_MASK_STORE
- && use == gimple_call_arg (stmt, 3))
- return true;
}
return false;
}
@@ -1763,10 +1764,11 @@ check_load_store_masking (loop_vec_info
if (memory_access_type == VMAT_GATHER_SCATTER)
{
- gcc_assert (is_load);
+ internal_fn ifn = (is_load
+ ? IFN_MASK_GATHER_LOAD
+ : IFN_MASK_SCATTER_STORE);
tree offset_type = TREE_TYPE (gs_info->offset);
- if (!internal_gather_scatter_fn_supported_p (IFN_MASK_GATHER_LOAD,
- vectype,
+ if (!internal_gather_scatter_fn_supported_p (ifn, vectype,
gs_info->memory_type,
TYPE_SIGN (offset_type),
gs_info->scale))
@@ -1775,7 +1777,7 @@ check_load_store_masking (loop_vec_info
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"can't use a fully-masked loop because the"
" target doesn't have an appropriate masked"
- " gather load instruction.\n");
+ " gather load or scatter store instruction.\n");
LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
return;
}
@@ -2059,7 +2061,7 @@ reverse_vector (tree vec_dest, tree inpu
/* STMT is either a masked or unconditional store. Return the value
being stored. */
-static tree
+tree
vect_get_store_rhs (gimple *stmt)
{
if (gassign *assign = dyn_cast <gassign *> (stmt))
@@ -2070,8 +2072,9 @@ vect_get_store_rhs (gimple *stmt)
if (gcall *call = dyn_cast <gcall *> (stmt))
{
internal_fn ifn = gimple_call_internal_fn (call);
- gcc_assert (ifn == IFN_MASK_STORE);
- return gimple_call_arg (stmt, 3);
+ int index = internal_fn_stored_value_index (ifn);
+ gcc_assert (index >= 0);
+ return gimple_call_arg (stmt, index);
}
gcc_unreachable ();
}
@@ -3051,7 +3054,7 @@ vectorizable_call (gimple *gs, gimple_st
if (gimple_call_internal_p (stmt)
&& (internal_load_fn_p (gimple_call_internal_fn (stmt))
- || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
+ || internal_store_fn_p (gimple_call_internal_fn (stmt))))
/* Handled by vectorizable_load and vectorizable_store. */
return false;
@@ -6122,7 +6125,11 @@ vectorizable_store (gimple *stmt, gimple
else
{
gcall *call = dyn_cast <gcall *> (stmt);
- if (!call || !gimple_call_internal_p (call, IFN_MASK_STORE))
+ if (!call || !gimple_call_internal_p (call))
+ return false;
+
+ internal_fn ifn = gimple_call_internal_fn (call);
+ if (!internal_store_fn_p (ifn))
return false;
if (slp_node != NULL)
@@ -6133,10 +6140,13 @@ vectorizable_store (gimple *stmt, gimple
return false;
}
- ref_type = TREE_TYPE (gimple_call_arg (call, 1));
- mask = gimple_call_arg (call, 2);
- if (!vect_check_load_store_mask (stmt, mask, &mask_vectype))
- return false;
+ int mask_index = internal_fn_mask_index (ifn);
+ if (mask_index >= 0)
+ {
+ mask = gimple_call_arg (call, mask_index);
+ if (!vect_check_load_store_mask (stmt, mask, &mask_vectype))
+ return false;
+ }
}
op = vect_get_store_rhs (stmt);
@@ -6198,7 +6208,8 @@ vectorizable_store (gimple *stmt, gimple
TYPE_MODE (mask_vectype), false))
return false;
}
- else if (memory_access_type != VMAT_LOAD_STORE_LANES)
+ else if (memory_access_type != VMAT_LOAD_STORE_LANES
+ && memory_access_type != VMAT_GATHER_SCATTER)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -6214,7 +6225,8 @@ vectorizable_store (gimple *stmt, gimple
return false;
}
- grouped_store = STMT_VINFO_GROUPED_ACCESS (stmt_info);
+ grouped_store = (STMT_VINFO_GROUPED_ACCESS (stmt_info)
+ && memory_access_type != VMAT_GATHER_SCATTER);
if (grouped_store)
{
first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
@@ -6250,7 +6262,7 @@ vectorizable_store (gimple *stmt, gimple
ensure_base_align (dr);
- if (memory_access_type == VMAT_GATHER_SCATTER)
+ if (memory_access_type == VMAT_GATHER_SCATTER && gs_info.decl)
{
tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE, src;
tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
@@ -6397,10 +6409,14 @@ vectorizable_store (gimple *stmt, gimple
return true;
}
- if (grouped_store)
+ if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
{
- GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
+ gimple *group_stmt = GROUP_FIRST_ELEMENT (stmt_info);
+ GROUP_STORE_COUNT (vinfo_for_stmt (group_stmt))++;
+ }
+ if (grouped_store)
+ {
/* FORNOW */
gcc_assert (!loop || !nested_in_vect_loop_p (loop, stmt));
@@ -6700,10 +6716,27 @@ vectorizable_store (gimple *stmt, gimple
|| memory_access_type == VMAT_CONTIGUOUS_REVERSE)
offset = size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1);
- if (memory_access_type == VMAT_LOAD_STORE_LANES)
- aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
+ tree bump;
+ tree vec_offset = NULL_TREE;
+ if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+ {
+ aggr_type = NULL_TREE;
+ bump = NULL_TREE;
+ }
+ else if (memory_access_type == VMAT_GATHER_SCATTER)
+ {
+ aggr_type = elem_type;
+ vect_get_strided_load_store_ops (stmt, loop_vinfo, &gs_info,
+ &bump, &vec_offset);
+ }
else
- aggr_type = vectype;
+ {
+ if (memory_access_type == VMAT_LOAD_STORE_LANES)
+ aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
+ else
+ aggr_type = vectype;
+ bump = vect_get_data_ptr_increment (dr, aggr_type, memory_access_type);
+ }
if (mask)
LOOP_VINFO_HAS_MASK_STORE (loop_vinfo) = true;
@@ -6808,12 +6841,16 @@ vectorizable_store (gimple *stmt, gimple
dataref_offset = build_int_cst (ref_type, 0);
inv_p = false;
}
+ else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+ vect_get_gather_scatter_ops (loop, stmt, &gs_info,
+ &dataref_ptr, &vec_offset);
else
dataref_ptr
= vect_create_data_ref_ptr (first_stmt, aggr_type,
simd_lane_access_p ? loop : NULL,
offset, &dummy, gsi, &ptr_incr,
- simd_lane_access_p, &inv_p);
+ simd_lane_access_p, &inv_p,
+ NULL_TREE, bump);
gcc_assert (bb_vinfo || !inv_p);
}
else
@@ -6840,11 +6877,17 @@ vectorizable_store (gimple *stmt, gimple
}
if (dataref_offset)
dataref_offset
- = int_const_binop (PLUS_EXPR, dataref_offset,
- TYPE_SIZE_UNIT (aggr_type));
+ = int_const_binop (PLUS_EXPR, dataref_offset, bump);
+ else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+ {
+ gimple *def_stmt;
+ vect_def_type dt;
+ vect_is_simple_use (vec_offset, loop_vinfo, &def_stmt, &dt);
+ vec_offset = vect_get_vec_def_for_stmt_copy (dt, vec_offset);
+ }
else
dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
- TYPE_SIZE_UNIT (aggr_type));
+ bump);
}
if (memory_access_type == VMAT_LOAD_STORE_LANES)
@@ -6916,10 +6959,28 @@ vectorizable_store (gimple *stmt, gimple
final_mask = prepare_load_store_mask (mask_vectype, final_mask,
vec_mask, gsi);
+ if (memory_access_type == VMAT_GATHER_SCATTER)
+ {
+ tree scale = size_int (gs_info.scale);
+ gcall *call;
+ if (masked_loop_p)
+ call = gimple_build_call_internal
+ (IFN_MASK_SCATTER_STORE, 5, dataref_ptr, vec_offset,
+ scale, vec_oprnd, final_mask);
+ else
+ call = gimple_build_call_internal
+ (IFN_SCATTER_STORE, 4, dataref_ptr, vec_offset,
+ scale, vec_oprnd);
+ gimple_call_set_nothrow (call, true);
+ new_stmt = call;
+ vect_finish_stmt_generation (stmt, new_stmt, gsi);
+ break;
+ }
+
if (i > 0)
/* Bump the vector pointer. */
dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi,
- stmt, NULL_TREE);
+ stmt, bump);
if (slp)
vec_oprnd = vec_oprnds[i];
@@ -9394,9 +9455,11 @@ vect_transform_stmt (gimple *stmt, gimpl
one are skipped, and there vec_stmt_info shouldn't be freed
meanwhile. */
*grouped_store = true;
- if (STMT_VINFO_VEC_STMT (stmt_info))
+ stmt_vec_info group_info
+ = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info));
+ if (GROUP_STORE_COUNT (group_info) == GROUP_SIZE (group_info))
is_store = true;
- }
+ }
else
is_store = true;
break;
===================================================================
@@ -152,6 +152,7 @@ (define_c_enum "unspec" [
UNSPEC_LD1_SVE
UNSPEC_ST1_SVE
UNSPEC_LD1_GATHER
+ UNSPEC_ST1_SCATTER
UNSPEC_MERGE_PTRUE
UNSPEC_PTEST_PTRUE
UNSPEC_UNPACKSHI
===================================================================
@@ -246,6 +246,63 @@ (define_insn "mask_gather_load<mode>"
ld1d\t%0.d, %5/z, [%1, %2.d, lsl %p4]"
)
+;; Unpredicated scatter store.
+(define_expand "scatter_store<mode>"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_dup 5)
+ (match_operand:DI 0 "aarch64_reg_or_zero")
+ (match_operand:<V_INT_EQUIV> 1 "register_operand")
+ (match_operand:DI 2 "const_int_operand")
+ (match_operand:DI 3 "aarch64_gather_scale_operand_<Vesize>")
+ (match_operand:SVE_SD 4 "register_operand")]
+ UNSPEC_ST1_SCATTER))]
+ "TARGET_SVE"
+ {
+ operands[5] = force_reg (<VPRED>mode, CONSTM1_RTX (<VPRED>mode));
+ }
+)
+
+;; Predicated scatter stores for 32-bit elements. Operand 2 is true for
+;; unsigned extension and false for signed extension.
+(define_insn "mask_scatter_store<mode>"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:<VPRED> 5 "register_operand" "Upl, Upl, Upl, Upl, Upl")
+ (match_operand:DI 0 "aarch64_reg_or_zero" "Z, rk, rk, rk, rk")
+ (match_operand:<V_INT_EQUIV> 1 "register_operand" "w, w, w, w, w")
+ (match_operand:DI 2 "const_int_operand" "i, Z, Ui1, Z, Ui1")
+ (match_operand:DI 3 "aarch64_gather_scale_operand_w" "Ui1, Ui1, Ui1, i, i")
+ (match_operand:SVE_S 4 "register_operand" "w, w, w, w, w")]
+ UNSPEC_ST1_SCATTER))]
+ "TARGET_SVE"
+ "@
+ st1w\t%4.s, %5, [%1.s]
+ st1w\t%4.s, %5, [%0, %1.s, sxtw]
+ st1w\t%4.s, %5, [%0, %1.s, uxtw]
+ st1w\t%4.s, %5, [%0, %1.s, sxtw %p3]
+ st1w\t%4.s, %5, [%0, %1.s, uxtw %p3]"
+)
+
+;; Predicated scatter stores for 64-bit elements. The value of operand 2
+;; doesn't matter in this case.
+(define_insn "mask_scatter_store<mode>"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:<VPRED> 5 "register_operand" "Upl, Upl, Upl")
+ (match_operand:DI 0 "aarch64_reg_or_zero" "Z, rk, rk")
+ (match_operand:<V_INT_EQUIV> 1 "register_operand" "w, w, w")
+ (match_operand:DI 2 "const_int_operand")
+ (match_operand:DI 3 "aarch64_gather_scale_operand_d" "Ui1, Ui1, i")
+ (match_operand:SVE_D 4 "register_operand" "w, w, w")]
+ UNSPEC_ST1_SCATTER))]
+ "TARGET_SVE"
+ "@
+ st1d\t%4.d, %5, [%1.d]
+ st1d\t%4.d, %5, [%0, %1.d]
+ st1d\t%4.d, %5, [%0, %1.d, lsl %p3]"
+)
+
;; SVE structure moves.
(define_expand "mov<mode>"
[(set (match_operand:SVE_STRUCT 0 "nonimmediate_operand")
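
As a concrete illustration (not part of the patch), this is the kind of loop
the tests below vectorize, together with the 32-bit scatter alternative it is
expected to use; the register numbers in the comment are illustrative:

  #include <stdint.h>

  /* With -O2 -ftree-vectorize -march=armv8-a+sve the store below should
     become a predicated scatter such as
         st1w    z0.s, p0, [x0, z1.s, sxtw 2]
     i.e. the "sxtw %p3" alternative of mask_scatter_store<mode> above.  */
  void
  scatter_add_one (int32_t *restrict dest, int32_t *restrict src,
                   int32_t *restrict indices, int n)
  {
    for (int i = 0; i < n; ++i)
      dest[indices[i]] = src[i] + 1;
  }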
===================================================================
@@ -0,0 +1,51 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#ifndef INDEX32
+#define INDEX32 int32_t
+#define INDEX64 int64_t
+#endif
+
+#define TEST_LOOP(DATA_TYPE, CMP_TYPE, BITS) \
+ void \
+ f_##DATA_TYPE##_##CMP_TYPE \
+ (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
+ CMP_TYPE *restrict cmp1, CMP_TYPE *restrict cmp2, \
+ INDEX##BITS *restrict indices, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ if (cmp1[i] == cmp2[i]) \
+ dest[indices[i]] = src[i] + 1; \
+ }
+
+#define TEST32(T, DATA_TYPE) \
+ T (DATA_TYPE, int32_t, 32) \
+ T (DATA_TYPE, uint32_t, 32) \
+ T (DATA_TYPE, float, 32)
+
+#define TEST64(T, DATA_TYPE) \
+ T (DATA_TYPE, int64_t, 64) \
+ T (DATA_TYPE, uint64_t, 64) \
+ T (DATA_TYPE, double, 64)
+
+#define TEST_ALL(T) \
+ TEST32 (T, int32_t) \
+ TEST32 (T, uint32_t) \
+ TEST32 (T, float) \
+ TEST64 (T, int64_t) \
+ TEST64 (T, uint64_t) \
+ TEST64 (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+\.s, sxtw 2\]\n} 9 } } */
+
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 3\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+\.d, lsl 3\]\n} 9 } } */
===================================================================
@@ -0,0 +1,17 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -march=armv8-a+sve --save-temps" } */
+
+#define INDEX32 uint32_t
+#define INDEX64 uint64_t
+
+#include "sve_mask_scatter_store_1.c"
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+\.s, uxtw 2\]\n} 9 } } */
+
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 3\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+\.d, lsl 3\]\n} 9 } } */
===================================================================
@@ -0,0 +1,31 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#ifndef INDEX32
+#define INDEX32 int32_t
+#define INDEX64 int64_t
+#endif
+
+#define TEST_LOOP(DATA_TYPE, BITS) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
+ INDEX##BITS *indices, int n) \
+ { \
+ for (int i = 9; i < n; ++i) \
+ dest[indices[i]] = src[i] + 1; \
+ }
+
+#define TEST_ALL(T) \
+ T (int32_t, 32) \
+ T (uint32_t, 32) \
+ T (float, 32) \
+ T (int64_t, 64) \
+ T (uint64_t, 64) \
+ T (double, 64)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 3 } } */
===================================================================
@@ -0,0 +1,10 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#define INDEX32 uint32_t
+#define INDEX64 uint64_t
+
+#include "sve_scatter_store_1.c"
+
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 3 } } */
===================================================================
@@ -0,0 +1,32 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#ifndef INDEX32
+#define INDEX32 int32_t
+#define INDEX64 int64_t
+#endif
+
+/* Invoked 18 times for each data size. */
+#define TEST_LOOP(DATA_TYPE, BITS) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
+ INDEX##BITS *indices, int n) \
+ { \
+ for (int i = 9; i < n; ++i) \
+ *(DATA_TYPE *) ((char *) dest + indices[i]) = src[i] + 1; \
+ }
+
+#define TEST_ALL(T) \
+ T (int32_t, 32) \
+ T (uint32_t, 32) \
+ T (float, 32) \
+ T (int64_t, 64) \
+ T (uint64_t, 64) \
+ T (double, 64)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d\]\n} 3 } } */
===================================================================
@@ -0,0 +1,10 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#define INDEX32 uint32_t
+#define INDEX64 uint64_t
+
+#include "sve_scatter_store_3.c"
+
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d\]\n} 3 } } */
===================================================================
@@ -0,0 +1,23 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+/* Invoked 18 times for each data size. */
+#define TEST_LOOP(DATA_TYPE) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE (DATA_TYPE *restrict *dest, DATA_TYPE *restrict src, \
+ int n) \
+ { \
+ for (int i = 9; i < n; ++i) \
+ *dest[i] = src[i] + 1; \
+ }
+
+#define TEST_ALL(T) \
+ T (int64_t) \
+ T (uint64_t) \
+ T (double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[z[0-9]+.d\]\n} 3 } } */
===================================================================
@@ -0,0 +1,36 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -fwrapv -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#ifndef INDEX32
+#define INDEX16 int16_t
+#define INDEX32 int32_t
+#endif
+
+/* Invoked 18 times for each data size. */
+#define TEST_LOOP(DATA_TYPE, BITS) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
+ INDEX##BITS *indices, INDEX##BITS mask, int n) \
+ { \
+ for (int i = 9; i < n; ++i) \
+ dest[(INDEX##BITS) (indices[i] | mask)] = src[i] + 1; \
+ }
+
+#define TEST_ALL(T) \
+ T (int32_t, 16) \
+ T (uint32_t, 16) \
+ T (float, 16) \
+ T (int64_t, 32) \
+ T (uint64_t, 32) \
+ T (double, 32)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tsunpkhi\tz[0-9]+\.s, z[0-9]+\.h\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tsunpklo\tz[0-9]+\.s, z[0-9]+\.h\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tsunpkhi\tz[0-9]+\.d, z[0-9]+\.s\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tsunpklo\tz[0-9]+\.d, z[0-9]+\.s\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 6 } } */
===================================================================
@@ -0,0 +1,15 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#define INDEX16 uint16_t
+#define INDEX32 uint32_t
+
+#include "sve_scatter_store_6.c"
+
+/* { dg-final { scan-assembler-times {\tuunpkhi\tz[0-9]+\.s, z[0-9]+\.h\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tuunpklo\tz[0-9]+\.s, z[0-9]+\.h\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tuunpkhi\tz[0-9]+\.d, z[0-9]+\.s\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tuunpklo\tz[0-9]+\.d, z[0-9]+\.s\n} 3 } } */
+/* Either extension type is OK here. */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, [us]xtw 2\]\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 6 } } */
===================================================================
@@ -0,0 +1,40 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#ifndef INDEX8
+#define INDEX8 int8_t
+#define INDEX16 int16_t
+#define INDEX32 int32_t
+#define INDEX64 int64_t
+#endif
+
+#define TEST_LOOP(DATA_TYPE, BITS) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE##_##BITS (DATA_TYPE *restrict dest, \
+ DATA_TYPE *restrict src, \
+ INDEX##BITS stride, INDEX##BITS n) \
+ { \
+ for (INDEX##BITS i = 0; i < n; ++i) \
+ dest[i * stride] = src[i] + 1; \
+ }
+
+#define TEST_TYPE(T, DATA_TYPE) \
+ T (DATA_TYPE, 8) \
+ T (DATA_TYPE, 16) \
+ T (DATA_TYPE, 32) \
+ T (DATA_TYPE, 64)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, int32_t) \
+ TEST_TYPE (T, uint32_t) \
+ TEST_TYPE (T, float) \
+ TEST_TYPE (T, int64_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 12 } } */
===================================================================
@@ -0,0 +1,18 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#define INDEX8 uint8_t
+#define INDEX16 uint16_t
+#define INDEX32 uint32_t
+#define INDEX64 uint64_t
+
+#include "sve_strided_store_1.c"
+
+/* 8 and 16 bits are signed because the multiplication promotes to int.
+ Using uxtw for all 9 would be OK. */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 6 } } */
+/* The 32-bit loop needs to honor the defined overflow in uint32_t,
+ so we vectorize the offset calculation. This means that the
+ 64-bit version needs two copies. */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 15 } } */
===================================================================
@@ -0,0 +1,33 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(DATA_TYPE, OTHER_TYPE) \
+  void __attribute__ ((noinline, noclone)) \
+  f_##DATA_TYPE##_##OTHER_TYPE (DATA_TYPE *restrict dest, \
+ DATA_TYPE *restrict src, \
+ OTHER_TYPE *restrict other, \
+ OTHER_TYPE mask, \
+ int stride, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ dest[i * stride] = src[i] + (OTHER_TYPE) (other[i] | mask); \
+ }
+
+#define TEST_ALL(T) \
+ T (int32_t, int16_t) \
+ T (uint32_t, int16_t) \
+ T (float, int16_t) \
+ T (int64_t, int32_t) \
+ T (uint64_t, int32_t) \
+ T (double, int32_t)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 1\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 6 } } */
+
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 3\]\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 6 } } */
===================================================================
@@ -0,0 +1,33 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(DATA_TYPE, NAME, SCALE) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE##_##NAME (DATA_TYPE *restrict dest, \
+ DATA_TYPE *restrict src, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ dest[i * SCALE] = src[i] + 1; \
+ }
+
+#define TEST_TYPE(T, DATA_TYPE) \
+ T (DATA_TYPE, 5, 5) \
+ T (DATA_TYPE, 7, 7) \
+ T (DATA_TYPE, 11, 11) \
+ T (DATA_TYPE, 200, 200) \
+ T (DATA_TYPE, m100, -100)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, int32_t) \
+ TEST_TYPE (T, uint32_t) \
+ TEST_TYPE (T, float) \
+ TEST_TYPE (T, int64_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 15 } } */
===================================================================
@@ -0,0 +1,34 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve -msve-vector-bits=256 --save-temps" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(DATA_TYPE, NAME, SCALE) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE##_##NAME (DATA_TYPE *restrict dest, \
+ DATA_TYPE *restrict src, long n) \
+ { \
+ for (long i = 0; i < n; ++i) \
+ dest[i * SCALE] = src[i] + 1; \
+ }
+
+#define TEST_TYPE(T, DATA_TYPE) \
+ T (DATA_TYPE, 5, 5) \
+ T (DATA_TYPE, 7, 7) \
+ T (DATA_TYPE, 11, 11) \
+ T (DATA_TYPE, 200, 200) \
+ T (DATA_TYPE, m100, -100)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, int32_t) \
+ TEST_TYPE (T, uint32_t) \
+ TEST_TYPE (T, float) \
+ TEST_TYPE (T, int64_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 12 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d\]\n} 15 } } */
===================================================================
@@ -0,0 +1,7 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve -msve-vector-bits=scalable --save-temps" } */
+
+#include "sve_strided_store_5.c"
+
+/* { dg-final { scan-assembler-not {\[x[0-9]+, z[0-9]+\.s} } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d\]\n} 15 } } */
===================================================================
@@ -0,0 +1,34 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(DATA_TYPE, NAME, SCALE) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE##_##NAME (DATA_TYPE *restrict dest, \
+ DATA_TYPE *restrict src) \
+ { \
+ for (long i = 0; i < 1000; ++i) \
+ dest[i * SCALE] = src[i] + 1; \
+ }
+
+#define TEST_TYPE(T, DATA_TYPE) \
+ T (DATA_TYPE, 5, 5) \
+ T (DATA_TYPE, 7, 7) \
+ T (DATA_TYPE, 11, 11) \
+ T (DATA_TYPE, 200, 200) \
+ T (DATA_TYPE, m100, -100)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, int32_t) \
+ TEST_TYPE (T, uint32_t) \
+ TEST_TYPE (T, float) \
+ TEST_TYPE (T, int64_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 12 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d\]\n} 15 } } */