Message ID | 87h8upixih.fsf@linaro.org
---|---
State | New
Series | [070/nnn] poly_int: vectorizable_reduction
Richard Sandiford <richard.sandiford@linaro.org> writes:
> This patch makes vectorizable_reduction cope with variable-length vectors.
> We can handle the simple case of an inner loop reduction for which
> the target has native support for the epilogue operation.  For now we
> punt on other cases, but patches after the main SVE submission allow
> SLP and double reductions too.

Here's an updated version that applies on top of the recent removal
of REDUC_*_EXPR.

Thanks,
Richard


2017-11-22  Richard Sandiford  <richard.sandiford@linaro.org>
	    Alan Hayward  <alan.hayward@arm.com>
	    David Sherwood  <david.sherwood@arm.com>

gcc/
	* tree.h (build_index_vector): Declare.
	* tree.c (build_index_vector): New function.
	* tree-vect-loop.c (get_initial_def_for_reduction): Treat the number
	of units as polynomial, forcibly converting it to a constant if
	vectorizable_reduction has already enforced the condition.
	(get_initial_defs_for_reduction): Likewise.
	(vect_create_epilog_for_reduction): Likewise.  Use build_index_vector
	to create a {1,2,3,...} vector.
	(vectorizable_reduction): Treat the number of units as polynomial.
	Choose vectype_in based on the largest scalar element size rather
	than the smallest number of units.  Enforce the restrictions
	relied on above.

Index: gcc/tree.h
===================================================================
--- gcc/tree.h	2017-11-22 18:02:23.618126313 +0000
+++ gcc/tree.h	2017-11-22 18:02:28.724789004 +0000
@@ -4036,6 +4036,7 @@ extern tree build_vector (tree, vec<tree
 extern tree build_vector_from_ctor (tree, vec<constructor_elt, va_gc> *);
 extern tree build_vector_from_val (tree, tree);
 extern tree build_vec_series (tree, tree, tree);
+extern tree build_index_vector (tree, poly_uint64, poly_uint64);
 extern void recompute_constructor_flags (tree);
 extern void verify_constructor_flags (tree);
 extern tree build_constructor (tree, vec<constructor_elt, va_gc> *);
Index: gcc/tree.c
===================================================================
--- gcc/tree.c	2017-11-22 18:02:23.618126313 +0000
+++ gcc/tree.c	2017-11-22 18:02:28.724789004 +0000
@@ -2027,6 +2027,37 @@ build_vec_series (tree type, tree base,
   return build2 (VEC_SERIES_EXPR, type, base, step);
 }
 
+/* Return a vector with the same number of units and number of bits
+   as VEC_TYPE, but in which the elements are a linear series of unsigned
+   integers { BASE, BASE + STEP, BASE + STEP * 2, ... }.  */
+
+tree
+build_index_vector (tree vec_type, poly_uint64 base, poly_uint64 step)
+{
+  tree index_vec_type = vec_type;
+  tree index_elt_type = TREE_TYPE (vec_type);
+  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vec_type);
+  if (!INTEGRAL_TYPE_P (index_elt_type) || !TYPE_UNSIGNED (index_elt_type))
+    {
+      index_elt_type = build_nonstandard_integer_type
+	(GET_MODE_BITSIZE (SCALAR_TYPE_MODE (index_elt_type)), true);
+      index_vec_type = build_vector_type (index_elt_type, nunits);
+    }
+
+  unsigned HOST_WIDE_INT count;
+  if (nunits.is_constant (&count))
+    {
+      auto_vec<tree, 32> v (count);
+      for (unsigned int i = 0; i < count; ++i)
+	v.quick_push (build_int_cstu (index_elt_type, base + i * step));
+      return build_vector (index_vec_type, v);
+    }
+
+  return build_vec_series (index_vec_type,
+			   build_int_cstu (index_elt_type, base),
+			   build_int_cstu (index_elt_type, step));
+}
+
 /* Something has messed with the elements of CONSTRUCTOR C after it was
    built; calculate TREE_CONSTANT and TREE_SIDE_EFFECTS.  */
Index: gcc/tree-vect-loop.c
===================================================================
--- gcc/tree-vect-loop.c	2017-11-22 18:02:23.618126313 +0000
+++ gcc/tree-vect-loop.c	2017-11-22 18:02:28.722773349 +0000
@@ -3997,11 +3997,10 @@ get_initial_def_for_reduction (gimple *s
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   tree scalar_type = TREE_TYPE (init_val);
   tree vectype = get_vectype_for_scalar_type (scalar_type);
-  int nunits;
+  poly_uint64 nunits;
   enum tree_code code = gimple_assign_rhs_code (stmt);
   tree def_for_init;
   tree init_def;
-  int i;
   bool nested_in_vect_loop = false;
   REAL_VALUE_TYPE real_init_val = dconst0;
   int int_init_val = 0;
@@ -4082,9 +4081,13 @@ get_initial_def_for_reduction (gimple *s
       else
	{
	  /* Option2: the first element is INIT_VAL.  */
-	  auto_vec<tree, 32> elts (nunits);
+
+	  /* Enforced by vectorizable_reduction (which disallows double
+	     reductions with variable-length vectors).  */
+	  unsigned int count = nunits.to_constant ();
+	  auto_vec<tree, 32> elts (count);
	  elts.quick_push (init_val);
-	  for (i = 1; i < nunits; ++i)
+	  for (unsigned int i = 1; i < count; ++i)
	    elts.quick_push (def_for_init);
	  init_def = gimple_build_vector (&stmts, vectype, elts);
	}
@@ -4144,6 +4147,8 @@ get_initial_defs_for_reduction (slp_tree
 
   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
   scalar_type = TREE_TYPE (vector_type);
+  /* vectorizable_reduction has already rejected SLP reductions on
+     variable-length vectors.  */
   nunits = TYPE_VECTOR_SUBPARTS (vector_type);
 
   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
@@ -4510,8 +4515,7 @@ vect_create_epilog_for_reduction (vec<tr
   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
     {
       tree indx_before_incr, indx_after_incr;
-      int nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
-      int k;
+      poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
 
       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
@@ -4527,10 +4531,7 @@ vect_create_epilog_for_reduction (vec<tr
	 vector size (STEP).  */
 
       /* Create a {1,2,3,...} vector.  */
-      auto_vec<tree, 32> vtemp (nunits_out);
-      for (k = 0; k < nunits_out; ++k)
-	vtemp.quick_push (build_int_cst (cr_index_scalar_type, k + 1));
-      tree series_vect = build_vector (cr_index_vector_type, vtemp);
+      tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
 
       /* Create a vector of the step value.  */
       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
@@ -4908,8 +4909,11 @@ vect_create_epilog_for_reduction (vec<tr
       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
-      unsigned HOST_WIDE_INT v_size
-	= el_size * TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
+      poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
+      /* Enforced by vectorizable_reduction, which ensures we have target
+	 support before allowing a conditional reduction on variable-length
+	 vectors.  */
+      unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
       tree idx_val = NULL_TREE, val = NULL_TREE;
       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
	{
@@ -5026,6 +5030,9 @@ vect_create_epilog_for_reduction (vec<tr
     {
       bool reduce_with_shift = have_whole_vector_shift (mode);
       int element_bitsize = tree_to_uhwi (bitsize);
+      /* Enforced by vectorizable_reduction, which disallows SLP reductions
+	 for variable-length vectors and also requires direct target support
+	 for loop reductions.  */
       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
       tree vec_temp;
 
@@ -5706,10 +5713,10 @@ vectorizable_reduction (gimple *stmt, gi
	  if (k == 1 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
	    continue;
 
-	  tem = get_vectype_for_scalar_type (TREE_TYPE (op));
-	  if (! vectype_in
-	      || TYPE_VECTOR_SUBPARTS (tem) < TYPE_VECTOR_SUBPARTS (vectype_in))
-	    vectype_in = tem;
+	  if (!vectype_in
+	      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
+		  < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
+	    vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
	  break;
	}
       gcc_assert (vectype_in);
@@ -5875,7 +5882,8 @@ vectorizable_reduction (gimple *stmt, gi
       /* To properly compute ncopies we are interested in the widest
	 input type in case we're looking at a widening accumulation.  */
       if (!vectype_in
-	  || TYPE_VECTOR_SUBPARTS (vectype_in) > TYPE_VECTOR_SUBPARTS (tem))
+	  || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
+	      < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
	vectype_in = tem;
     }
 
@@ -6022,6 +6030,7 @@ vectorizable_reduction (gimple *stmt, gi
   gcc_assert (ncopies >= 1);
 
   vec_mode = TYPE_MODE (vectype_in);
+  poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
 
   if (code == COND_EXPR)
     {
@@ -6203,14 +6212,23 @@ vectorizable_reduction (gimple *stmt, gi
	  int scalar_precision
	    = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
	  cr_index_scalar_type = make_unsigned_type (scalar_precision);
-	  cr_index_vector_type = build_vector_type
-	    (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
+	  cr_index_vector_type = build_vector_type (cr_index_scalar_type,
+						    nunits_out);
 
	  if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
					      OPTIMIZE_FOR_SPEED))
	    reduc_fn = IFN_REDUC_MAX;
	}
 
+  if (reduc_fn == IFN_LAST && !nunits_out.is_constant ())
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "missing target support for reduction on"
+			 " variable-length vectors.\n");
+      return false;
+    }
+
   if ((double_reduc
        || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != TREE_CODE_REDUCTION)
       && ncopies > 1)
@@ -6222,6 +6240,27 @@ vectorizable_reduction (gimple *stmt, gi
       return false;
     }
 
+  if (double_reduc && !nunits_out.is_constant ())
+    {
+      /* The current double-reduction code creates the initial value
+	 element-by-element.  */
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "double reduction not supported for variable-length"
+			 " vectors.\n");
+      return false;
+    }
+
+  if (slp_node && !nunits_out.is_constant ())
+    {
+      /* The current SLP code creates the initial value element-by-element.  */
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "SLP reduction not supported for variable-length"
+			 " vectors.\n");
+      return false;
+    }
+
   /* In case of widenning multiplication by a constant, we update the type
      of the constant to be the type of the other operand.  We check that the
      constant fits the type in the pattern recognition pass.  */
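[For context, an illustrative example, not taken from the patch or its
testsuite: the "simple case of an inner loop reduction" described above is
a loop of roughly the following shape.  Vectorizing it for variable-length
vectors needs direct target support for the epilogue operation, since the
epilogue can no longer be open-coded element by element.]

/* Illustrative only: an inner-loop sum reduction of the kind
   vectorizable_reduction can now handle for variable-length vectors.
   The vectorized loop accumulates partial sums in a vector; the
   epilogue must then reduce that vector to a scalar, which for a
   variable-length vector requires a direct target reduction pattern
   (e.g. IFN_REDUC_PLUS, implemented on SVE by instructions such as
   ADDV).  */
int
sum (int *a, int n)
{
  int res = 0;
  for (int i = 0; i < n; ++i)
    res += a[i];
  return res;
}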
On 11/22/2017 11:09 AM, Richard Sandiford wrote:
> Richard Sandiford <richard.sandiford@linaro.org> writes:
>> This patch makes vectorizable_reduction cope with variable-length vectors.
>> We can handle the simple case of an inner loop reduction for which
>> the target has native support for the epilogue operation.  For now we
>> punt on other cases, but patches after the main SVE submission allow
>> SLP and double reductions too.
>
> Here's an updated version that applies on top of the recent removal
> of REDUC_*_EXPR.
>
> Thanks,
> Richard
>
>
> 2017-11-22  Richard Sandiford  <richard.sandiford@linaro.org>
>	      Alan Hayward  <alan.hayward@arm.com>
>	      David Sherwood  <david.sherwood@arm.com>
>
> gcc/
>	* tree.h (build_index_vector): Declare.
>	* tree.c (build_index_vector): New function.
>	* tree-vect-loop.c (get_initial_def_for_reduction): Treat the number
>	of units as polynomial, forcibly converting it to a constant if
>	vectorizable_reduction has already enforced the condition.
>	(get_initial_defs_for_reduction): Likewise.
>	(vect_create_epilog_for_reduction): Likewise.  Use build_index_vector
>	to create a {1,2,3,...} vector.
>	(vectorizable_reduction): Treat the number of units as polynomial.
>	Choose vectype_in based on the largest scalar element size rather
>	than the smallest number of units.  Enforce the restrictions
>	relied on above.

I assume you'll work with Richi to address any conflicts with his patch
to allow the target to specify a preferred mode for final reductions
using shifts or extractions.

OK.
jeff
Index: gcc/tree.h
===================================================================
--- gcc/tree.h	2017-10-23 17:22:32.736226191 +0100
+++ gcc/tree.h	2017-10-23 17:22:35.831905077 +0100
@@ -4050,6 +4050,7 @@ extern tree build_vector (tree, vec<tree
 extern tree build_vector_from_ctor (tree, vec<constructor_elt, va_gc> *);
 extern tree build_vector_from_val (tree, tree);
 extern tree build_vec_series (tree, tree, tree);
+extern tree build_index_vector (tree, poly_uint64, poly_uint64);
 extern void recompute_constructor_flags (tree);
 extern void verify_constructor_flags (tree);
 extern tree build_constructor (tree, vec<constructor_elt, va_gc> *);
Index: gcc/tree.c
===================================================================
--- gcc/tree.c	2017-10-23 17:22:32.734226398 +0100
+++ gcc/tree.c	2017-10-23 17:22:35.830905181 +0100
@@ -1974,6 +1974,37 @@ build_vec_series (tree type, tree base,
   return build2 (VEC_SERIES_EXPR, type, base, step);
 }
 
+/* Return a vector with the same number of units and number of bits
+   as VEC_TYPE, but in which the elements are a linear series of unsigned
+   integers { BASE, BASE + STEP, BASE + STEP * 2, ... }.  */
+
+tree
+build_index_vector (tree vec_type, poly_uint64 base, poly_uint64 step)
+{
+  tree index_vec_type = vec_type;
+  tree index_elt_type = TREE_TYPE (vec_type);
+  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vec_type);
+  if (!INTEGRAL_TYPE_P (index_elt_type) || !TYPE_UNSIGNED (index_elt_type))
+    {
+      index_elt_type = build_nonstandard_integer_type
+	(GET_MODE_BITSIZE (SCALAR_TYPE_MODE (index_elt_type)), true);
+      index_vec_type = build_vector_type (index_elt_type, nunits);
+    }
+
+  unsigned HOST_WIDE_INT count;
+  if (nunits.is_constant (&count))
+    {
+      auto_vec<tree, 32> v (count);
+      for (unsigned int i = 0; i < count; ++i)
+	v.quick_push (build_int_cstu (index_elt_type, base + i * step));
+      return build_vector (index_vec_type, v);
+    }
+
+  return build_vec_series (index_vec_type,
+			   build_int_cstu (index_elt_type, base),
+			   build_int_cstu (index_elt_type, step));
+}
+
 /* Something has messed with the elements of CONSTRUCTOR C after it was
    built; calculate TREE_CONSTANT and TREE_SIDE_EFFECTS.  */
Index: gcc/tree-vect-loop.c
===================================================================
--- gcc/tree-vect-loop.c	2017-10-23 17:22:32.727227124 +0100
+++ gcc/tree-vect-loop.c	2017-10-23 17:22:35.829905285 +0100
@@ -3997,11 +3997,10 @@ get_initial_def_for_reduction (gimple *s
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   tree scalar_type = TREE_TYPE (init_val);
   tree vectype = get_vectype_for_scalar_type (scalar_type);
-  int nunits;
+  poly_uint64 nunits;
   enum tree_code code = gimple_assign_rhs_code (stmt);
   tree def_for_init;
   tree init_def;
-  int i;
   bool nested_in_vect_loop = false;
   REAL_VALUE_TYPE real_init_val = dconst0;
   int int_init_val = 0;
@@ -4082,9 +4081,13 @@ get_initial_def_for_reduction (gimple *s
       else
	{
	  /* Option2: the first element is INIT_VAL.  */
-	  auto_vec<tree, 32> elts (nunits);
+
+	  /* Enforced by vectorizable_reduction (which disallows double
+	     reductions with variable-length vectors).  */
+	  unsigned int count = nunits.to_constant ();
+	  auto_vec<tree, 32> elts (count);
	  elts.quick_push (init_val);
-	  for (i = 1; i < nunits; ++i)
+	  for (unsigned int i = 1; i < count; ++i)
	    elts.quick_push (def_for_init);
	  init_def = gimple_build_vector (&stmts, vectype, elts);
	}
@@ -4144,6 +4147,8 @@ get_initial_defs_for_reduction (slp_tree
 
   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
   scalar_type = TREE_TYPE (vector_type);
+  /* vectorizable_reduction has already rejected SLP reductions on
+     variable-length vectors.  */
   nunits = TYPE_VECTOR_SUBPARTS (vector_type);
 
   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
@@ -4510,8 +4515,7 @@ vect_create_epilog_for_reduction (vec<tr
   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
     {
       tree indx_before_incr, indx_after_incr;
-      int nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
-      int k;
+      poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
 
       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
@@ -4527,10 +4531,7 @@ vect_create_epilog_for_reduction (vec<tr
	 vector size (STEP).  */
 
       /* Create a {1,2,3,...} vector.  */
-      auto_vec<tree, 32> vtemp (nunits_out);
-      for (k = 0; k < nunits_out; ++k)
-	vtemp.quick_push (build_int_cst (cr_index_scalar_type, k + 1));
-      tree series_vect = build_vector (cr_index_vector_type, vtemp);
+      tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
 
       /* Create a vector of the step value.  */
       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
@@ -4911,8 +4912,11 @@ vect_create_epilog_for_reduction (vec<tr
       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
-      unsigned HOST_WIDE_INT v_size
-	= el_size * TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
+      poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
+      /* Enforced by vectorizable_reduction, which ensures we have target
+	 support before allowing a conditional reduction on variable-length
+	 vectors.  */
+      unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
       tree idx_val = NULL_TREE, val = NULL_TREE;
       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
	{
@@ -5024,6 +5028,9 @@ vect_create_epilog_for_reduction (vec<tr
     {
       bool reduce_with_shift = have_whole_vector_shift (mode);
       int element_bitsize = tree_to_uhwi (bitsize);
+      /* Enforced by vectorizable_reduction, which disallows SLP reductions
+	 for variable-length vectors and also requires direct target support
+	 for loop reductions.  */
       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
       tree vec_temp;
 
@@ -5703,10 +5710,10 @@ vectorizable_reduction (gimple *stmt, gi
	  if (k == 1 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
	    continue;
 
-	  tem = get_vectype_for_scalar_type (TREE_TYPE (op));
-	  if (! vectype_in
-	      || TYPE_VECTOR_SUBPARTS (tem) < TYPE_VECTOR_SUBPARTS (vectype_in))
-	    vectype_in = tem;
+	  if (!vectype_in
+	      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
+		  < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
+	    vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
	  break;
	}
       gcc_assert (vectype_in);
@@ -6016,6 +6023,7 @@ vectorizable_reduction (gimple *stmt, gi
   gcc_assert (ncopies >= 1);
 
   vec_mode = TYPE_MODE (vectype_in);
+  poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
 
   if (code == COND_EXPR)
     {
@@ -6205,8 +6213,8 @@ vectorizable_reduction (gimple *stmt, gi
	  int scalar_precision
	    = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
	  cr_index_scalar_type = make_unsigned_type (scalar_precision);
-	  cr_index_vector_type = build_vector_type
-	    (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
+	  cr_index_vector_type = build_vector_type (cr_index_scalar_type,
+						    nunits_out);
 
	  optab = optab_for_tree_code (REDUC_MAX_EXPR, cr_index_vector_type,
				       optab_default);
@@ -6215,6 +6223,15 @@ vectorizable_reduction (gimple *stmt, gi
	    epilog_reduc_code = REDUC_MAX_EXPR;
	}
 
+  if (epilog_reduc_code == ERROR_MARK && !nunits_out.is_constant ())
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "missing target support for reduction on"
+			 " variable-length vectors.\n");
+      return false;
+    }
+
   if ((double_reduc
       || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != TREE_CODE_REDUCTION)
       && ncopies > 1)
@@ -6226,6 +6243,27 @@ vectorizable_reduction (gimple *stmt, gi
       return false;
     }
 
+  if (double_reduc && !nunits_out.is_constant ())
+    {
+      /* The current double-reduction code creates the initial value
+	 element-by-element.  */
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "double reduction not supported for variable-length"
+			 " vectors.\n");
+      return false;
+    }
+
+  if (slp_node && !nunits_out.is_constant ())
+    {
+      /* The current SLP code creates the initial value element-by-element.  */
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "SLP reduction not supported for variable-length"
+			 " vectors.\n");
+      return false;
+    }
+
   /* In case of widenning multiplication by a constant, we update the type
      of the constant to be the type of the other operand.  We check that the
      constant fits the type in the pattern recognition pass.  */
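[A sketch of the new helper's behaviour, as illustrative commentary rather
than part of the patch; the identifiers are the GCC-internal APIs the patch
itself uses.  The point is that callers such as the COND_REDUCTION code can
build a {1,2,3,...} index vector without knowing the vector length at
compile time.]

/* Sketch: build_index_vector folds to a VECTOR_CST when the number of
   units is a compile-time constant, and falls back to a VEC_SERIES_EXPR
   otherwise.  */
tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
/* Fixed-length case, e.g. a 4 x unsigned int type: nunits.is_constant ()
   succeeds and the result is the VECTOR_CST {1, 2, 3, 4}.
   Variable-length case, e.g. an SVE type with 4 + 4x units:
   is_constant () fails and the result is VEC_SERIES_EXPR <1, 1>, which
   the target expands later (on SVE, typically to an INDEX instruction).  */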