===================================================================
@@ -1454,6 +1454,8 @@ extern bool vect_verify_datarefs_alignme
extern bool vect_slp_analyze_and_verify_instance_alignment (slp_instance);
extern bool vect_analyze_data_ref_accesses (vec_info *);
extern bool vect_prune_runtime_alias_test_list (loop_vec_info);
+extern bool vect_gather_scatter_fn_p (bool, bool, tree, tree, unsigned int,
+ signop, int, internal_fn *, tree *);
extern bool vect_check_gather_scatter (gimple *, loop_vec_info,
gather_scatter_info *);
extern bool vect_analyze_data_refs (vec_info *, poly_uint64 *);
===================================================================
@@ -3307,7 +3307,7 @@ vect_prune_runtime_alias_test_list (loop
Return true if the function is supported, storing the function
id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
-static bool
+bool
vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
tree memory_type, unsigned int offset_bits,
signop offset_sign, int scale,
===================================================================
@@ -1847,17 +1847,116 @@ prepare_load_store_mask (tree mask_type,
return and_res;
}
+/* Determine whether we can use a gather load or scatter store to vectorize
+ strided load or store STMT by truncating the current offset to a smaller
+ width. We need to be able to construct an offset vector:
+
+ { 0, X, X*2, X*3, ... }
+
+ without loss of precision, where X is STMT's DR_STEP.
+
+ Return true if this is possible, describing the gather load or scatter
+ store in GS_INFO. MASKED_P is true if the load or store is conditional. */
+
+static bool
+vect_truncate_gather_scatter_offset (gimple *stmt, loop_vec_info loop_vinfo,
+ bool masked_p,
+ gather_scatter_info *gs_info)
+{
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
+ tree step = DR_STEP (dr);
+ if (TREE_CODE (step) != INTEGER_CST)
+ {
+ /* ??? Perhaps we could use range information here? */
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "cannot truncate variable step.\n");
+ return false;
+ }
+
+ /* Get the number of bits in an element, which is also the offset width. */
+ tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+ scalar_mode element_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype));
+ unsigned int element_bits = GET_MODE_BITSIZE (element_mode);
+
+ /* Set COUNT to the upper limit on the number of elements - 1.
+ Start with the maximum vectorization factor. */
+ unsigned HOST_WIDE_INT count = vect_max_vf (loop_vinfo) - 1;
+
+ /* Try lowering COUNT to the maximum number of scalar latch iterations. */
+ struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+ widest_int max_iters;
+ if (max_loop_iterations (loop, &max_iters)
+ && max_iters < count)
+ count = max_iters.to_shwi ();
+
+ /* Try scales of 1 and the element size. */
+ int scales[] = { 1, vect_get_scalar_dr_size (dr) };
+ bool overflow_p = false;
+ for (int i = 0; i < 2; ++i)
+ {
+ int scale = scales[i];
+ widest_int factor;
+ if (!wi::multiple_of_p (wi::to_widest (step), scale, SIGNED, &factor))
+ continue;
+
+ /* See whether we can calculate COUNT * STEP / SCALE, the offset of
+ the last element, in ELEMENT_BITS bits. */
+ widest_int range = wi::mul (count, factor, SIGNED, &overflow_p);
+ if (overflow_p)
+ continue;
+ signop sign = range >= 0 ? UNSIGNED : SIGNED;
+ if (wi::min_precision (range, sign) > element_bits)
+ {
+ overflow_p = true;
+ continue;
+ }
+
+ /* See whether the target supports the operation. */
+ tree memory_type = TREE_TYPE (DR_REF (dr));
+ if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
+ memory_type, element_bits, sign, scale,
+ &gs_info->ifn, &gs_info->element_type))
+ continue;
+
+ tree offset_type = build_nonstandard_integer_type (element_bits,
+ sign == UNSIGNED);
+
+ gs_info->decl = NULL_TREE;
+ /* Logically the sum of DR_BASE_ADDRESS, DR_INIT and DR_OFFSET,
+ but we don't need to store that here. */
+ gs_info->base = NULL_TREE;
+ gs_info->offset = fold_convert (offset_type, step);
+ gs_info->offset_dt = vect_unknown_def_type;
+ gs_info->offset_vectype = NULL_TREE;
+ gs_info->scale = scale;
+ gs_info->memory_type = memory_type;
+ return true;
+ }
+
+ if (overflow_p && dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "truncating gather/scatter offset to %d bits"
+ " might change its value.\n", element_bits);
+
+ return false;
+}
+
/* Return true if we can use gather/scatter internal functions to
vectorize STMT, which is a grouped or strided load or store.
- When returning true, fill in GS_INFO with the information required
- to perform the operation. */
+ MASKED_P is true if the load or store is conditional. When returning
+ true, fill in GS_INFO with the information required to perform the
+ operation. */
static bool
vect_use_strided_gather_scatters_p (gimple *stmt, loop_vec_info loop_vinfo,
+ bool masked_p,
gather_scatter_info *gs_info)
{
if (!vect_check_gather_scatter (stmt, loop_vinfo, gs_info))
- return false;
+ return vect_truncate_gather_scatter_offset (stmt, loop_vinfo,
+ masked_p, gs_info);
scalar_mode element_mode = SCALAR_TYPE_MODE (gs_info->element_type);
unsigned int element_bits = GET_MODE_BITSIZE (element_mode);
@@ -1989,7 +2088,8 @@ vect_get_store_rhs (gimple *stmt)
static bool
get_group_load_store_type (gimple *stmt, tree vectype, bool slp,
bool masked_p, vec_load_store_type vls_type,
- vect_memory_access_type *memory_access_type)
+ vect_memory_access_type *memory_access_type,
+ gather_scatter_info *gs_info)
{
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
vec_info *vinfo = stmt_info->vinfo;
@@ -2104,6 +2204,20 @@ get_group_load_store_type (gimple *stmt,
overrun_p = would_overrun_p;
}
}
+
+ /* As a last resort, try using a gather load or scatter store.
+
+ ??? Although the code can handle all group sizes correctly,
+ it probably isn't a win to use separate strided accesses based
+ on nearby locations. Or, even if it's a win over scalar code,
+ it might not be a win over vectorizing at a lower VF, if that
+ allows us to use contiguous accesses. */
+ if (*memory_access_type == VMAT_ELEMENTWISE
+ && single_element_p
+ && loop_vinfo
+ && vect_use_strided_gather_scatters_p (stmt, loop_vinfo,
+ masked_p, gs_info))
+ *memory_access_type = VMAT_GATHER_SCATTER;
}
if (vls_type != VLS_LOAD && first_stmt == stmt)
@@ -2231,14 +2345,15 @@ get_load_store_type (gimple *stmt, tree
else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
{
if (!get_group_load_store_type (stmt, vectype, slp, masked_p, vls_type,
- memory_access_type))
+ memory_access_type, gs_info))
return false;
}
else if (STMT_VINFO_STRIDED_P (stmt_info))
{
gcc_assert (!slp);
if (loop_vinfo
- && vect_use_strided_gather_scatters_p (stmt, loop_vinfo, gs_info))
+ && vect_use_strided_gather_scatters_p (stmt, loop_vinfo,
+ masked_p, gs_info))
*memory_access_type = VMAT_GATHER_SCATTER;
else
*memory_access_type = VMAT_ELEMENTWISE;
===================================================================
@@ -0,0 +1,33 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(DATA_TYPE, NAME, SCALE) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE##_##NAME (DATA_TYPE *restrict dest, \
+ DATA_TYPE *restrict src, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ dest[i] += src[i * SCALE]; \
+ }
+
+#define TEST_TYPE(T, DATA_TYPE) \
+ T (DATA_TYPE, 5, 5) \
+ T (DATA_TYPE, 7, 7) \
+ T (DATA_TYPE, 11, 11) \
+ T (DATA_TYPE, 200, 200) \
+ T (DATA_TYPE, m100, -100)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, int32_t) \
+ TEST_TYPE (T, uint32_t) \
+ TEST_TYPE (T, float) \
+ TEST_TYPE (T, int64_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 15 } } */
===================================================================
@@ -0,0 +1,34 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve -msve-vector-bits=256 --save-temps" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(DATA_TYPE, NAME, SCALE) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE##_##NAME (DATA_TYPE *restrict dest, \
+ DATA_TYPE *restrict src, long n) \
+ { \
+ for (long i = 0; i < n; ++i) \
+ dest[i] += src[i * SCALE]; \
+ }
+
+#define TEST_TYPE(T, DATA_TYPE) \
+ T (DATA_TYPE, 5, 5) \
+ T (DATA_TYPE, 7, 7) \
+ T (DATA_TYPE, 11, 11) \
+ T (DATA_TYPE, 200, 200) \
+ T (DATA_TYPE, m100, -100)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, int32_t) \
+ TEST_TYPE (T, uint32_t) \
+ TEST_TYPE (T, float) \
+ TEST_TYPE (T, int64_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 12 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d\]\n} 15 } } */
===================================================================
@@ -0,0 +1,7 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve -msve-vector-bits=scalable --save-temps" } */
+
+#include "sve_strided_load_5.c"
+
+/* { dg-final { scan-assembler-not {\[x[0-9]+, z[0-9]+\.s} } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d\]\n} 15 } } */
===================================================================
@@ -0,0 +1,34 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(DATA_TYPE, NAME, SCALE) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE##_##NAME (DATA_TYPE *restrict dest, \
+ DATA_TYPE *restrict src) \
+ { \
+ for (long i = 0; i < 1000; ++i) \
+ dest[i] += src[i * SCALE]; \
+ }
+
+#define TEST_TYPE(T, DATA_TYPE) \
+ T (DATA_TYPE, 5, 5) \
+ T (DATA_TYPE, 7, 7) \
+ T (DATA_TYPE, 11, 11) \
+ T (DATA_TYPE, 200, 200) \
+ T (DATA_TYPE, m100, -100)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, int32_t) \
+ TEST_TYPE (T, uint32_t) \
+ TEST_TYPE (T, float) \
+ TEST_TYPE (T, int64_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 12 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d\]\n} 15 } } */