Use gather loads for strided accesses

Message ID 87efowwoly.fsf@linaro.org
State New
Series Use gather loads for strided accesses

Commit Message

Richard Sandiford Nov. 17, 2017, 10:02 p.m. UTC
This patch tries to use gather loads for strided accesses,
rather than falling back to VMAT_ELEMENTWISE.
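For instance, a loop of the following shape (the same pattern as the new
tests below) previously had to load each element individually:

    void
    f (int *restrict dest, int *restrict src, int stride, int n)
    {
      for (int i = 0; i < n; ++i)
        dest[i] += src[i * stride];
    }

With this change the strided access to src can instead use a gather load.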

Tested on aarch64-linux-gnu (with and without SVE), x86_64-linux-gnu
and powerpc64le-linux-gnu.  OK to install?

Richard


2017-11-17  Richard Sandiford  <richard.sandiford@linaro.org>
	    Alan Hayward  <alan.hayward@arm.com>
	    David Sherwood  <david.sherwood@arm.com>

gcc/
	* tree-vectorizer.h (vect_create_data_ref_ptr): Take an extra
	optional tree argument.
	* tree-vect-data-refs.c (vect_create_data_ref_ptr): Take the
	iv_step as an optional argument, but continue to use the current
	value as a fallback.
	(bump_vector_ptr): Use operand_equal_p rather than tree_int_cst_compare
	to compare the updates.
	* tree-vect-stmts.c (vect_use_strided_gather_scatters_p): New function.
	(get_load_store_type): Use it when handling a strided access.
	(vect_get_strided_load_store_ops): New function.
	(vect_get_data_ptr_increment): Likewise.
	(vectorizable_load): Handle strided gather loads.  Always pass
	a step to vect_create_data_ref_ptr and bump_vector_ptr.

gcc/testsuite/
	* gcc.target/aarch64/sve_strided_load_1.c: New test.
	* gcc.target/aarch64/sve_strided_load_2.c: Likewise.
	* gcc.target/aarch64/sve_strided_load_3.c: Likewise.

Comments

Jeff Law Dec. 13, 2017, 4:47 p.m. UTC | #1
On 11/17/2017 03:02 PM, Richard Sandiford wrote:
> This patch tries to use gather loads for strided accesses,
> rather than falling back to VMAT_ELEMENTWISE.
>
> Tested on aarch64-linux-gnu (with and without SVE), x86_64-linux-gnu
> and powerpc64le-linux-gnu.  OK to install?
>
> Richard
>
>
> 2017-11-17  Richard Sandiford  <richard.sandiford@linaro.org>
> 	    Alan Hayward  <alan.hayward@arm.com>
> 	    David Sherwood  <david.sherwood@arm.com>
>
> gcc/
> 	* tree-vectorizer.h (vect_create_data_ref_ptr): Take an extra
> 	optional tree argument.
> 	* tree-vect-data-refs.c (vect_create_data_ref_ptr): Take the
> 	iv_step as an optional argument, but continue to use the current
> 	value as a fallback.
> 	(bump_vector_ptr): Use operand_equal_p rather than tree_int_cst_compare
> 	to compare the updates.
> 	* tree-vect-stmts.c (vect_use_strided_gather_scatters_p): New function.
> 	(get_load_store_type): Use it when handling a strided access.
> 	(vect_get_strided_load_store_ops): New function.
> 	(vect_get_data_ptr_increment): Likewise.
> 	(vectorizable_load): Handle strided gather loads.  Always pass
> 	a step to vect_create_data_ref_ptr and bump_vector_ptr.
>
> gcc/testsuite/
> 	* gcc.target/aarch64/sve_strided_load_1.c: New test.
> 	* gcc.target/aarch64/sve_strided_load_2.c: Likewise.
> 	* gcc.target/aarch64/sve_strided_load_3.c: Likewise.
OK.
jeff
James Greenhalgh Jan. 7, 2018, 8:57 p.m. UTC | #2
On Wed, Dec 13, 2017 at 04:47:40PM +0000, Jeff Law wrote:
> On 11/17/2017 03:02 PM, Richard Sandiford wrote:
> > This patch tries to use gather loads for strided accesses,
> > rather than falling back to VMAT_ELEMENTWISE.
> >
> > Tested on aarch64-linux-gnu (with and without SVE), x86_64-linux-gnu
> > and powerpc64le-linux-gnu.  OK to install?
> >
> > Richard
> >
> >
> > 2017-11-17  Richard Sandiford  <richard.sandiford@linaro.org>
> > 	    Alan Hayward  <alan.hayward@arm.com>
> > 	    David Sherwood  <david.sherwood@arm.com>
> >
> > gcc/
> > 	* tree-vectorizer.h (vect_create_data_ref_ptr): Take an extra
> > 	optional tree argument.
> > 	* tree-vect-data-refs.c (vect_create_data_ref_ptr): Take the
> > 	iv_step as an optional argument, but continue to use the current
> > 	value as a fallback.
> > 	(bump_vector_ptr): Use operand_equal_p rather than tree_int_cst_compare
> > 	to compare the updates.
> > 	* tree-vect-stmts.c (vect_use_strided_gather_scatters_p): New function.
> > 	(get_load_store_type): Use it when handling a strided access.
> > 	(vect_get_strided_load_store_ops): New function.
> > 	(vect_get_data_ptr_increment): Likewise.
> > 	(vectorizable_load): Handle strided gather loads.  Always pass
> > 	a step to vect_create_data_ref_ptr and bump_vector_ptr.
> >
> > gcc/testsuite/
> > 	* gcc.target/aarch64/sve_strided_load_1.c: New test.
> > 	* gcc.target/aarch64/sve_strided_load_2.c: Likewise.
> > 	* gcc.target/aarch64/sve_strided_load_3.c: Likewise.
> OK.
> jeff

The AArch64 tests are OK.

Thanks,
James
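[Editor's note: the following is an illustrative scalar model of the
transformation, not code from the patch.  Each "vector iteration" gathers
VF elements from the base pointer using an offset vector whose element I
is I * stride, and the base pointer is then bumped by stride * VF -- the
same offset vector and data-ref bump that vect_get_strided_load_store_ops
computes.  VF is a made-up fixed vector length; real SVE code is
length-agnostic and fully predicated, so it needs no scalar epilogue.]

    #include <stddef.h>

    #define VF 4	/* illustrative fixed vector length */

    /* Scalar model of the gather-based vectorization of
       dest[i] += src[i * stride].  */
    void
    strided_gather_model (int *restrict dest, const int *restrict src,
			  ptrdiff_t stride, int n)
    {
      const int *base = src;
      int i = 0;
      for (; i + VF <= n; i += VF)
	{
	  for (int lane = 0; lane < VF; ++lane)
	    dest[i + lane] += base[lane * stride];  /* gather from base + offsets */
	  base += stride * VF;			    /* data-ref bump per copy */
	}
      for (; i < n; ++i)			    /* scalar epilogue (model only) */
	dest[i] += src[(ptrdiff_t) i * stride];
    }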

Patch

Index: gcc/tree-vectorizer.h
===================================================================
--- gcc/tree-vectorizer.h	2017-11-17 21:57:43.920003721 +0000
+++ gcc/tree-vectorizer.h	2017-11-17 21:59:27.828803892 +0000
@@ -1461,7 +1461,7 @@  extern void vect_record_base_alignments
 extern tree vect_create_data_ref_ptr (gimple *, tree, struct loop *, tree,
 				      tree *, gimple_stmt_iterator *,
 				      gimple **, bool, bool *,
-				      tree = NULL_TREE);
+				      tree = NULL_TREE, tree = NULL_TREE);
 extern tree bump_vector_ptr (tree, gimple *, gimple_stmt_iterator *, gimple *,
 			     tree);
 extern tree vect_create_destination_var (tree, tree);
Index: gcc/tree-vect-data-refs.c
===================================================================
--- gcc/tree-vect-data-refs.c	2017-11-17 21:57:43.919003822 +0000
+++ gcc/tree-vect-data-refs.c	2017-11-17 21:59:27.827803892 +0000
@@ -4362,6 +4362,10 @@  vect_create_addr_base_for_vector_ref (gi
 	to the initial address accessed by the data-ref in STMT.  This is
 	similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
 	in bytes.
+   8. IV_STEP (optional, defaults to NULL): the amount that should be added
+	to the IV during each iteration of the loop.  NULL says to move
+	by one copy of AGGR_TYPE up or down, depending on the step of the
+	data reference.
 
    Output:
    1. Declare a new ptr to vector_type, and have it point to the base of the
@@ -4394,7 +4398,8 @@  vect_create_addr_base_for_vector_ref (gi
 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
 			  tree offset, tree *initial_address,
 			  gimple_stmt_iterator *gsi, gimple **ptr_incr,
-			  bool only_init, bool *inv_p, tree byte_offset)
+			  bool only_init, bool *inv_p, tree byte_offset,
+			  tree iv_step)
 {
   const char *base_name;
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
@@ -4418,7 +4423,8 @@  vect_create_data_ref_ptr (gimple *stmt,
   tree step;
   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
 
-  gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
+  gcc_assert (iv_step != NULL_TREE
+	      || TREE_CODE (aggr_type) == ARRAY_TYPE
 	      || TREE_CODE (aggr_type) == VECTOR_TYPE);
 
   if (loop_vinfo)
@@ -4559,14 +4565,17 @@  vect_create_data_ref_ptr (gimple *stmt,
     aptr = aggr_ptr_init;
   else
     {
-      /* The step of the aggregate pointer is the type size.  */
-      tree iv_step = TYPE_SIZE_UNIT (aggr_type);
-      /* One exception to the above is when the scalar step of the load in
-	 LOOP is zero. In this case the step here is also zero.  */
-      if (*inv_p)
-	iv_step = size_zero_node;
-      else if (tree_int_cst_sgn (step) == -1)
-	iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
+      if (iv_step == NULL_TREE)
+	{
+	  /* The step of the aggregate pointer is the type size.  */
+	  iv_step = TYPE_SIZE_UNIT (aggr_type);
+	  /* One exception to the above is when the scalar step of the load in
+	     LOOP is zero. In this case the step here is also zero.  */
+	  if (*inv_p)
+	    iv_step = size_zero_node;
+	  else if (tree_int_cst_sgn (step) == -1)
+	    iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
+	}
 
       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
 
@@ -4699,7 +4708,7 @@  bump_vector_ptr (tree dataref_ptr, gimpl
       if (use == dataref_ptr)
         SET_USE (use_p, new_dataref_ptr);
       else
-        gcc_assert (tree_int_cst_compare (use, update) == 0);
+        gcc_assert (operand_equal_p (use, update, 0));
     }
 
   return new_dataref_ptr;
Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c	2017-11-17 21:57:43.920003721 +0000
+++ gcc/tree-vect-stmts.c	2017-11-17 21:59:27.828803892 +0000
@@ -1847,6 +1847,43 @@  prepare_load_store_mask (tree mask_type,
   return and_res;
 }
 
+/* Return true if we can use gather/scatter internal functions to
+   vectorize STMT, which is a grouped or strided load or store.
+   When returning true, fill in GS_INFO with the information required
+   to perform the operation.  */
+
+static bool
+vect_use_strided_gather_scatters_p (gimple *stmt, loop_vec_info loop_vinfo,
+				    gather_scatter_info *gs_info)
+{
+  if (!vect_check_gather_scatter (stmt, loop_vinfo, gs_info))
+    return false;
+
+  scalar_mode element_mode = SCALAR_TYPE_MODE (gs_info->element_type);
+  unsigned int element_bits = GET_MODE_BITSIZE (element_mode);
+  tree offset_type = TREE_TYPE (gs_info->offset);
+  unsigned int offset_bits = TYPE_PRECISION (offset_type);
+
+  /* Enforced by vect_check_gather_scatter.  */
+  gcc_assert (element_bits >= offset_bits);
+
+  /* If the elements are wider than the offset, convert the offset to the
+     same width, without changing its sign.  */
+  if (element_bits > offset_bits)
+    {
+      bool unsigned_p = TYPE_UNSIGNED (offset_type);
+      offset_type = build_nonstandard_integer_type (element_bits, unsigned_p);
+      gs_info->offset = fold_convert (offset_type, gs_info->offset);
+    }
+
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+		     "using gather/scatter for strided/grouped access,"
+		     " scale = %d\n", gs_info->scale);
+
+  return true;
+}
+
 /* STMT is a non-strided load or store, meaning that it accesses
    elements with a known constant step.  Return -1 if that step
    is negative, 0 if it is zero, and 1 if it is greater than zero.  */
@@ -2200,7 +2237,11 @@  get_load_store_type (gimple *stmt, tree
   else if (STMT_VINFO_STRIDED_P (stmt_info))
     {
       gcc_assert (!slp);
-      *memory_access_type = VMAT_ELEMENTWISE;
+      if (loop_vinfo
+	  && vect_use_strided_gather_scatters_p (stmt, loop_vinfo, gs_info))
+	*memory_access_type = VMAT_GATHER_SCATTER;
+      else
+	*memory_access_type = VMAT_ELEMENTWISE;
     }
   else
     {
@@ -2640,6 +2681,71 @@  vect_get_gather_scatter_ops (struct loop
 					      offset_vectype);
 }
 
+/* Prepare to implement a grouped or strided load or store using
+   the gather load or scatter store operation described by GS_INFO.
+   STMT is the load or store statement.
+
+   Set *DATAREF_BUMP to the amount that should be added to the base
+   address after each copy of the vectorized statement.  Set *VEC_OFFSET
+   to an invariant offset vector in which element I has the value
+   I * DR_STEP / SCALE.  */
+
+static void
+vect_get_strided_load_store_ops (gimple *stmt, loop_vec_info loop_vinfo,
+				 gather_scatter_info *gs_info,
+				 tree *dataref_bump, tree *vec_offset)
+{
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+  gimple_seq stmts;
+
+  tree bump = size_binop (MULT_EXPR,
+			  fold_convert (sizetype, DR_STEP (dr)),
+			  size_int (TYPE_VECTOR_SUBPARTS (vectype)));
+  *dataref_bump = force_gimple_operand (bump, &stmts, true, NULL_TREE);
+  if (stmts)
+    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
+
+  /* The offset given in GS_INFO can have pointer type, so use the element
+     type of the vector instead.  */
+  tree offset_type = TREE_TYPE (gs_info->offset);
+  tree offset_vectype = get_vectype_for_scalar_type (offset_type);
+  offset_type = TREE_TYPE (offset_vectype);
+
+  /* Calculate X = DR_STEP / SCALE and convert it to the appropriate type.  */
+  tree step = size_binop (EXACT_DIV_EXPR, DR_STEP (dr),
+			  ssize_int (gs_info->scale));
+  step = fold_convert (offset_type, step);
+  step = force_gimple_operand (step, &stmts, true, NULL_TREE);
+
+  /* Create {0, X, X*2, X*3, ...}.  */
+  *vec_offset = gimple_build (&stmts, VEC_SERIES_EXPR, offset_vectype,
+			      build_zero_cst (offset_type), step);
+  if (stmts)
+    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
+}
+
+/* Return the amount that should be added to a vector pointer to move
+   to the next or previous copy of AGGR_TYPE.  DR is the data reference
+   being vectorized and MEMORY_ACCESS_TYPE describes the type of
+   vectorization.  */
+
+static tree
+vect_get_data_ptr_increment (data_reference *dr, tree aggr_type,
+			     vect_memory_access_type memory_access_type)
+{
+  if (memory_access_type == VMAT_INVARIANT)
+    return size_zero_node;
+
+  tree iv_step = TYPE_SIZE_UNIT (aggr_type);
+  tree step = vect_dr_behavior (dr)->step;
+  if (tree_int_cst_sgn (step) == -1)
+    iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
+  return iv_step;
+}
+
 /* Check and perform vectorization of BUILT_IN_BSWAP{16,32,64}.  */
 
 static bool
@@ -7417,6 +7523,9 @@  vectorizable_load (gimple *stmt, gimple_
       return true;
     }
 
+  if (memory_access_type == VMAT_GATHER_SCATTER)
+    grouped_load = false;
+
   if (grouped_load)
     {
       first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
@@ -7628,13 +7737,29 @@  vectorizable_load (gimple *stmt, gimple_
   if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
     offset = size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1);
 
-  if (memory_access_type == VMAT_LOAD_STORE_LANES)
-    aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
+  tree bump;
+  tree vec_offset = NULL_TREE;
+  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+    {
+      aggr_type = NULL_TREE;
+      bump = NULL_TREE;
+    }
+  else if (memory_access_type == VMAT_GATHER_SCATTER)
+    {
+      aggr_type = elem_type;
+      vect_get_strided_load_store_ops (stmt, loop_vinfo, &gs_info,
+				       &bump, &vec_offset);
+    }
   else
-    aggr_type = vectype;
+    {
+      if (memory_access_type == VMAT_LOAD_STORE_LANES)
+	aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
+      else
+	aggr_type = vectype;
+      bump = vect_get_data_ptr_increment (dr, aggr_type, memory_access_type);
+    }
 
   tree vec_mask = NULL_TREE;
-  tree vec_offset = NULL_TREE;
   prev_stmt_info = NULL;
   poly_uint64 group_elt = 0;
   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
@@ -7666,7 +7791,7 @@  vectorizable_load (gimple *stmt, gimple_
 		= vect_create_data_ref_ptr (first_stmt_for_drptr, aggr_type,
 					    at_loop, offset, &dummy, gsi,
 					    &ptr_incr, simd_lane_access_p,
-					    &inv_p, byte_offset);
+					    &inv_p, byte_offset, bump);
 	      /* Adjust the pointer by the difference to first_stmt.  */
 	      data_reference_p ptrdr
 		= STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt_for_drptr));
@@ -7688,7 +7813,7 @@  vectorizable_load (gimple *stmt, gimple_
 	      = vect_create_data_ref_ptr (first_stmt, aggr_type, at_loop,
 					  offset, &dummy, gsi, &ptr_incr,
 					  simd_lane_access_p, &inv_p,
-					  byte_offset);
+					  byte_offset, bump);
 	  if (mask)
 	    vec_mask = vect_get_vec_def_for_operand (mask, stmt,
 						     mask_vectype);
@@ -7697,7 +7822,7 @@  vectorizable_load (gimple *stmt, gimple_
 	{
 	  if (dataref_offset)
 	    dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset,
-					      TYPE_SIZE_UNIT (aggr_type));
+					      bump);
 	  else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
 	    {
 	      gimple *def_stmt;
@@ -7706,8 +7831,8 @@  vectorizable_load (gimple *stmt, gimple_
 	      vec_offset = vect_get_vec_def_for_stmt_copy (dt, vec_offset);
 	    }
 	  else
-	    dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
-					   TYPE_SIZE_UNIT (aggr_type));
+	    dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi,
+					   stmt, bump);
 	  if (mask)
 	    {
 	      gimple *def_stmt;
@@ -7783,7 +7908,7 @@  vectorizable_load (gimple *stmt, gimple_
 
 	      if (i > 0)
 		dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi,
-					       stmt, NULL_TREE);
+					       stmt, bump);
 
 	      /* 2. Create the vector-load in the loop.  */
 	      switch (alignment_support_scheme)
Index: gcc/testsuite/gcc.target/aarch64/sve_strided_load_1.c
===================================================================
--- /dev/null	2017-11-14 14:28:07.424493901 +0000
+++ gcc/testsuite/gcc.target/aarch64/sve_strided_load_1.c	2017-11-17 21:59:27.825803893 +0000
@@ -0,0 +1,40 @@ 
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#ifndef INDEX8
+#define INDEX8 int8_t
+#define INDEX16 int16_t
+#define INDEX32 int32_t
+#define INDEX64 int64_t
+#endif
+
+#define TEST_LOOP(DATA_TYPE, BITS)				\
+  void __attribute__ ((noinline, noclone))			\
+  f_##DATA_TYPE##_##BITS (DATA_TYPE *restrict dest,		\
+			  DATA_TYPE *restrict src,		\
+			  INDEX##BITS stride, INDEX##BITS n)	\
+  {								\
+    for (INDEX##BITS i = 0; i < n; ++i)				\
+      dest[i] += src[i * stride];				\
+  }
+
+#define TEST_TYPE(T, DATA_TYPE)			\
+  T (DATA_TYPE, 8)				\
+  T (DATA_TYPE, 16)				\
+  T (DATA_TYPE, 32)				\
+  T (DATA_TYPE, 64)
+
+#define TEST_ALL(T)				\
+  TEST_TYPE (T, int32_t)			\
+  TEST_TYPE (T, uint32_t)			\
+  TEST_TYPE (T, float)				\
+  TEST_TYPE (T, int64_t)			\
+  TEST_TYPE (T, uint64_t)			\
+  TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 12 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve_strided_load_2.c
===================================================================
--- /dev/null	2017-11-14 14:28:07.424493901 +0000
+++ gcc/testsuite/gcc.target/aarch64/sve_strided_load_2.c	2017-11-17 21:59:27.826803893 +0000
@@ -0,0 +1,18 @@ 
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#define INDEX8 uint8_t
+#define INDEX16 uint16_t
+#define INDEX32 uint32_t
+#define INDEX64 uint64_t
+
+#include "sve_strided_load_1.c"
+
+/* 8 and 16 bits are signed because the multiplication promotes to int.
+   Using uxtw for all 9 would be OK.  */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 6 } } */
+/* The 32-bit loop needs to honor the defined overflow in uint32_t,
+   so we vectorize the offset calculation.  This means that the
+   64-bit version needs two copies.  */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 15 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve_strided_load_3.c
===================================================================
--- /dev/null	2017-11-14 14:28:07.424493901 +0000
+++ gcc/testsuite/gcc.target/aarch64/sve_strided_load_3.c	2017-11-17 21:59:27.826803893 +0000
@@ -0,0 +1,32 @@ 
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(DATA_TYPE, OTHER_TYPE)				\
+  void __attribute__ ((noinline, noclone))				\
+  f_##DATA_TYPE##_##BITS (DATA_TYPE *restrict dest,			\
+			  DATA_TYPE *restrict src,			\
+			  OTHER_TYPE *restrict other,			\
+			  OTHER_TYPE mask,				\
+			  int stride, int n)				\
+  {									\
+    for (int i = 0; i < n; ++i)						\
+      dest[i] = src[i * stride] + (OTHER_TYPE) (other[i] | mask);	\
+  }
+
+#define TEST_ALL(T)				\
+  T (int32_t, int16_t)				\
+  T (uint32_t, int16_t)				\
+  T (float, int16_t)				\
+  T (int64_t, int32_t)				\
+  T (uint64_t, int32_t)				\
+  T (double, int32_t)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 1\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 6 } } */
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 6 } } */