===================================================================
@@ -1461,7 +1461,7 @@ extern void vect_record_base_alignments
extern tree vect_create_data_ref_ptr (gimple *, tree, struct loop *, tree,
tree *, gimple_stmt_iterator *,
gimple **, bool, bool *,
- tree = NULL_TREE);
+ tree = NULL_TREE, tree = NULL_TREE);
extern tree bump_vector_ptr (tree, gimple *, gimple_stmt_iterator *, gimple *,
tree);
extern tree vect_create_destination_var (tree, tree);
===================================================================
@@ -4362,6 +4362,10 @@ vect_create_addr_base_for_vector_ref (gi
to the initial address accessed by the data-ref in STMT. This is
similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
in bytes.
+ 8. IV_STEP (optional, defaults to NULL): the amount that should be added
+ to the IV during each iteration of the loop. NULL says to move
+ by one copy of AGGR_TYPE up or down, depending on the step of the
+ data reference.
Output:
1. Declare a new ptr to vector_type, and have it point to the base of the
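As an aside (illustration only, not part of the patch): passing IV_STEP = NULL_TREE selects the default step shown below, mirroring the logic added to vect_create_data_ref_ptr further down. The helper name default_iv_step is made up; the calls are the same GCC tree APIs this patch already uses.

static tree
default_iv_step (tree aggr_type, tree dr_step, bool invariant_p)
{
  /* By default the IV advances by one copy of AGGR_TYPE per iteration.  */
  tree step = TYPE_SIZE_UNIT (aggr_type);
  if (invariant_p)
    /* An invariant access does not advance at all.  */
    return size_zero_node;
  if (tree_int_cst_sgn (dr_step) == -1)
    /* A negative data-reference step means moving downwards.  */
    step = fold_build1 (NEGATE_EXPR, TREE_TYPE (step), step);
  return step;
}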
@@ -4394,7 +4398,8 @@ vect_create_addr_base_for_vector_ref (gi
vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
tree offset, tree *initial_address,
gimple_stmt_iterator *gsi, gimple **ptr_incr,
- bool only_init, bool *inv_p, tree byte_offset)
+ bool only_init, bool *inv_p, tree byte_offset,
+ tree iv_step)
{
const char *base_name;
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
@@ -4418,7 +4423,8 @@ vect_create_data_ref_ptr (gimple *stmt,
tree step;
bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
- gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
+ gcc_assert (iv_step != NULL_TREE
+ || TREE_CODE (aggr_type) == ARRAY_TYPE
|| TREE_CODE (aggr_type) == VECTOR_TYPE);
if (loop_vinfo)
@@ -4559,14 +4565,17 @@ vect_create_data_ref_ptr (gimple *stmt,
aptr = aggr_ptr_init;
else
{
- /* The step of the aggregate pointer is the type size. */
- tree iv_step = TYPE_SIZE_UNIT (aggr_type);
- /* One exception to the above is when the scalar step of the load in
- LOOP is zero. In this case the step here is also zero. */
- if (*inv_p)
- iv_step = size_zero_node;
- else if (tree_int_cst_sgn (step) == -1)
- iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
+ if (iv_step == NULL_TREE)
+ {
+ /* The step of the aggregate pointer is the type size. */
+ iv_step = TYPE_SIZE_UNIT (aggr_type);
+ /* One exception to the above is when the scalar step of the load in
+ LOOP is zero. In this case the step here is also zero. */
+ if (*inv_p)
+ iv_step = size_zero_node;
+ else if (tree_int_cst_sgn (step) == -1)
+ iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
+ }
standard_iv_increment_position (loop, &incr_gsi, &insert_after);
@@ -4699,7 +4708,7 @@ bump_vector_ptr (tree dataref_ptr, gimpl
if (use == dataref_ptr)
SET_USE (use_p, new_dataref_ptr);
else
- gcc_assert (tree_int_cst_compare (use, update) == 0);
+ gcc_assert (operand_equal_p (use, update, 0));
}
return new_dataref_ptr;
===================================================================
@@ -1847,6 +1847,43 @@ prepare_load_store_mask (tree mask_type,
return and_res;
}
+/* Return true if we can use gather/scatter internal functions to
+ vectorize STMT, which is a grouped or strided load or store.
+ When returning true, fill in GS_INFO with the information required
+ to perform the operation. */
+
+static bool
+vect_use_strided_gather_scatters_p (gimple *stmt, loop_vec_info loop_vinfo,
+ gather_scatter_info *gs_info)
+{
+ if (!vect_check_gather_scatter (stmt, loop_vinfo, gs_info))
+ return false;
+
+ scalar_mode element_mode = SCALAR_TYPE_MODE (gs_info->element_type);
+ unsigned int element_bits = GET_MODE_BITSIZE (element_mode);
+ tree offset_type = TREE_TYPE (gs_info->offset);
+ unsigned int offset_bits = TYPE_PRECISION (offset_type);
+
+ /* Enforced by vect_check_gather_scatter. */
+ gcc_assert (element_bits >= offset_bits);
+
+ /* If the elements are wider than the offset, convert the offset to the
+ same width, without changing its sign. */
+ if (element_bits > offset_bits)
+ {
+ bool unsigned_p = TYPE_UNSIGNED (offset_type);
+ offset_type = build_nonstandard_integer_type (element_bits, unsigned_p);
+ gs_info->offset = fold_convert (offset_type, gs_info->offset);
+ }
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "using gather/scatter for strided/grouped access,"
+ " scale = %d\n", gs_info->scale);
+
+ return true;
+}
+
/* STMT is a non-strided load or store, meaning that it accesses
elements with a known constant step. Return -1 if that step
is negative, 0 if it is zero, and 1 if it is greater than zero. */
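A short aside on the widening in vect_use_strided_gather_scatters_p (illustration only; the helpers below are hypothetical, not GCC internals): the offset is converted to an integer of the element width without changing its signedness, so for example a 32-bit signed offset used with 64-bit elements becomes a sign-extended 64-bit offset, while an unsigned one becomes zero-extended.

#include <stdint.h>

/* Sketch of the sign-preserving widening applied to gs_info->offset
   when the data elements are wider than the offset.  */
int64_t
widen_signed_offset (int32_t offset)
{
  return (int64_t) offset;    /* signed 32 -> 64 bits: sign-extend */
}

uint64_t
widen_unsigned_offset (uint32_t offset)
{
  return (uint64_t) offset;   /* unsigned 32 -> 64 bits: zero-extend */
}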
@@ -2200,7 +2237,11 @@ get_load_store_type (gimple *stmt, tree
else if (STMT_VINFO_STRIDED_P (stmt_info))
{
gcc_assert (!slp);
- *memory_access_type = VMAT_ELEMENTWISE;
+ if (loop_vinfo
+ && vect_use_strided_gather_scatters_p (stmt, loop_vinfo, gs_info))
+ *memory_access_type = VMAT_GATHER_SCATTER;
+ else
+ *memory_access_type = VMAT_ELEMENTWISE;
}
else
{
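For a concrete picture of what now takes the VMAT_GATHER_SCATTER path (this mirrors the loops in the new tests at the end of the patch; the function name below is made up for illustration): a load whose stride is only known at run time has no constant step, so it is STMT_VINFO_STRIDED_P, and with gather internal functions available it is now vectorized as a gather rather than element by element.

void
strided_accumulate (int *restrict dest, int *restrict src,
                    int stride, int n)
{
  /* src[i * stride] is a strided load with a run-time stride; it is now
     implemented with gather loads instead of VMAT_ELEMENTWISE.  */
  for (int i = 0; i < n; ++i)
    dest[i] += src[i * stride];
}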
@@ -2640,6 +2681,71 @@ vect_get_gather_scatter_ops (struct loop
offset_vectype);
}
+/* Prepare to implement a grouped or strided load or store using
+ the gather load or scatter store operation described by GS_INFO.
+ STMT is the load or store statement.
+
+ Set *DATAREF_BUMP to the amount that should be added to the base
+ address after each copy of the vectorized statement. Set *VEC_OFFSET
+ to an invariant offset vector in which element I has the value
+ I * DR_STEP / SCALE. */
+
+static void
+vect_get_strided_load_store_ops (gimple *stmt, loop_vec_info loop_vinfo,
+ gather_scatter_info *gs_info,
+ tree *dataref_bump, tree *vec_offset)
+{
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
+ struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+ tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+ gimple_seq stmts;
+
+ tree bump = size_binop (MULT_EXPR,
+ fold_convert (sizetype, DR_STEP (dr)),
+ size_int (TYPE_VECTOR_SUBPARTS (vectype)));
+ *dataref_bump = force_gimple_operand (bump, &stmts, true, NULL_TREE);
+ if (stmts)
+ gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
+
+ /* The offset given in GS_INFO can have pointer type, so use the element
+ type of the vector instead. */
+ tree offset_type = TREE_TYPE (gs_info->offset);
+ tree offset_vectype = get_vectype_for_scalar_type (offset_type);
+ offset_type = TREE_TYPE (offset_vectype);
+
+ /* Calculate X = DR_STEP / SCALE and convert it to the appropriate type. */
+ tree step = size_binop (EXACT_DIV_EXPR, DR_STEP (dr),
+ ssize_int (gs_info->scale));
+ step = fold_convert (offset_type, step);
+ step = force_gimple_operand (step, &stmts, true, NULL_TREE);
+
+ /* Create {0, X, X*2, X*3, ...}. */
+ *vec_offset = gimple_build (&stmts, VEC_SERIES_EXPR, offset_vectype,
+ build_zero_cst (offset_type), step);
+ if (stmts)
+ gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
+}
+
+/* Return the amount that should be added to a vector pointer to move
+ to the next or previous copy of AGGR_TYPE. DR is the data reference
+ being vectorized and MEMORY_ACCESS_TYPE describes the type of
+ vectorization. */
+
+static tree
+vect_get_data_ptr_increment (data_reference *dr, tree aggr_type,
+ vect_memory_access_type memory_access_type)
+{
+ if (memory_access_type == VMAT_INVARIANT)
+ return size_zero_node;
+
+ tree iv_step = TYPE_SIZE_UNIT (aggr_type);
+ tree step = vect_dr_behavior (dr)->step;
+ if (tree_int_cst_sgn (step) == -1)
+ iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
+ return iv_step;
+}
+
/* Check and perform vectorization of BUILT_IN_BSWAP{16,32,64}. */
static bool
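To make the arithmetic of vect_get_strided_load_store_ops concrete, here is a hedged scalar emulation of one vectorized copy of a strided load (illustration only: emulate_strided_gather and vf are made-up names, and 32-bit elements with a gather scale of 4 are an assumption matching the 32-bit cases in the new tests). With DR_STEP equal to stride * 4 bytes and a scale of 4, lane I of *VEC_OFFSET is I * stride and *DATAREF_BUMP is stride * 4 * VF bytes.

#include <stddef.h>
#include <stdint.h>

/* Scalar emulation of one vectorized copy of "dest[i] += src[i * stride]"
   for 32-bit elements, following the offsets and bump described above.  */
void
emulate_strided_gather (int32_t *dest, const char *base,
                        ptrdiff_t stride, size_t vf)
{
  for (size_t i = 0; i < vf; ++i)
    {
      ptrdiff_t offset = (ptrdiff_t) i * stride;          /* lane I of *VEC_OFFSET */
      dest[i] += *(const int32_t *) (base + offset * 4);  /* gather scale = 4 */
    }
  /* The caller then advances BASE by *DATAREF_BUMP, i.e. stride * 4 * vf
     bytes, before the next copy.  */
}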
@@ -7417,6 +7523,9 @@ vectorizable_load (gimple *stmt, gimple_
return true;
}
+ if (memory_access_type == VMAT_GATHER_SCATTER)
+ grouped_load = false;
+
if (grouped_load)
{
first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
@@ -7628,13 +7737,29 @@ vectorizable_load (gimple *stmt, gimple_
if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
offset = size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1);
- if (memory_access_type == VMAT_LOAD_STORE_LANES)
- aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
+ tree bump;
+ tree vec_offset = NULL_TREE;
+ if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+ {
+ aggr_type = NULL_TREE;
+ bump = NULL_TREE;
+ }
+ else if (memory_access_type == VMAT_GATHER_SCATTER)
+ {
+ aggr_type = elem_type;
+ vect_get_strided_load_store_ops (stmt, loop_vinfo, &gs_info,
+ &bump, &vec_offset);
+ }
else
- aggr_type = vectype;
+ {
+ if (memory_access_type == VMAT_LOAD_STORE_LANES)
+ aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
+ else
+ aggr_type = vectype;
+ bump = vect_get_data_ptr_increment (dr, aggr_type, memory_access_type);
+ }
tree vec_mask = NULL_TREE;
- tree vec_offset = NULL_TREE;
prev_stmt_info = NULL;
poly_uint64 group_elt = 0;
vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
@@ -7666,7 +7791,7 @@ vectorizable_load (gimple *stmt, gimple_
= vect_create_data_ref_ptr (first_stmt_for_drptr, aggr_type,
at_loop, offset, &dummy, gsi,
&ptr_incr, simd_lane_access_p,
- &inv_p, byte_offset);
+ &inv_p, byte_offset, bump);
/* Adjust the pointer by the difference to first_stmt. */
data_reference_p ptrdr
= STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt_for_drptr));
@@ -7688,7 +7813,7 @@ vectorizable_load (gimple *stmt, gimple_
= vect_create_data_ref_ptr (first_stmt, aggr_type, at_loop,
offset, &dummy, gsi, &ptr_incr,
simd_lane_access_p, &inv_p,
- byte_offset);
+ byte_offset, bump);
if (mask)
vec_mask = vect_get_vec_def_for_operand (mask, stmt,
mask_vectype);
@@ -7697,7 +7822,7 @@ vectorizable_load (gimple *stmt, gimple_
{
if (dataref_offset)
dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset,
- TYPE_SIZE_UNIT (aggr_type));
+ bump);
else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
{
gimple *def_stmt;
@@ -7706,8 +7831,8 @@ vectorizable_load (gimple *stmt, gimple_
vec_offset = vect_get_vec_def_for_stmt_copy (dt, vec_offset);
}
else
- dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
- TYPE_SIZE_UNIT (aggr_type));
+ dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi,
+ stmt, bump);
if (mask)
{
gimple *def_stmt;
@@ -7783,7 +7908,7 @@ vectorizable_load (gimple *stmt, gimple_
if (i > 0)
dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi,
- stmt, NULL_TREE);
+ stmt, bump);
/* 2. Create the vector-load in the loop. */
switch (alignment_support_scheme)
===================================================================
@@ -0,0 +1,40 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#ifndef INDEX8
+#define INDEX8 int8_t
+#define INDEX16 int16_t
+#define INDEX32 int32_t
+#define INDEX64 int64_t
+#endif
+
+#define TEST_LOOP(DATA_TYPE, BITS) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE##_##BITS (DATA_TYPE *restrict dest, \
+ DATA_TYPE *restrict src, \
+ INDEX##BITS stride, INDEX##BITS n) \
+ { \
+ for (INDEX##BITS i = 0; i < n; ++i) \
+ dest[i] += src[i * stride]; \
+ }
+
+#define TEST_TYPE(T, DATA_TYPE) \
+ T (DATA_TYPE, 8) \
+ T (DATA_TYPE, 16) \
+ T (DATA_TYPE, 32) \
+ T (DATA_TYPE, 64)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, int32_t) \
+ TEST_TYPE (T, uint32_t) \
+ TEST_TYPE (T, float) \
+ TEST_TYPE (T, int64_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, sxtw 2\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+\.d, lsl 3\]\n} 12 } } */
===================================================================
@@ -0,0 +1,18 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#define INDEX8 uint8_t
+#define INDEX16 uint16_t
+#define INDEX32 uint32_t
+#define INDEX64 uint64_t
+
+#include "sve_strided_load_1.c"
+
+/* 8 and 16 bits are signed because the multiplication promotes to int.
+ Using uxtw for all 9 would be OK. */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, sxtw 2\]\n} 6 } } */
+/* The 32-bit loop needs to honor the defined overflow in uint32_t,
+ so we vectorize the offset calculation. This means that the
+ 64-bit version needs two copies. */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, uxtw 2\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+\.d, lsl 3\]\n} 15 } } */
===================================================================
@@ -0,0 +1,32 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(DATA_TYPE, OTHER_TYPE) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE##_##OTHER_TYPE (DATA_TYPE *restrict dest, \
+ DATA_TYPE *restrict src, \
+ OTHER_TYPE *restrict other, \
+ OTHER_TYPE mask, \
+ int stride, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ dest[i] = src[i * stride] + (OTHER_TYPE) (other[i] | mask); \
+ }
+
+#define TEST_ALL(T) \
+ T (int32_t, int16_t) \
+ T (uint32_t, int16_t) \
+ T (float, int16_t) \
+ T (int64_t, int32_t) \
+ T (uint64_t, int32_t) \
+ T (double, int32_t)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 1\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, sxtw 2\]\n} 6 } } */
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+\.d, lsl 3\]\n} 6 } } */