
Handle peeling for alignment with masking

Message ID 878tf5ym3f.fsf@linaro.org
State New
Series Handle peeling for alignment with masking

Commit Message

Richard Sandiford Nov. 17, 2017, 3:13 p.m. UTC
This patch adds support for aligning vectors by using a partial
first iteration.  E.g. if the start pointer is 3 elements beyond
an aligned address, the first iteration will have a mask in which
the first three elements are false.
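
To make the idea concrete, here is a small illustrative sketch (not part of
the patch; the 256-bit vector size, int32_t elements and 32-byte target
alignment are assumptions) of the misalignment calculation and the
resulting first-iteration mask:

  #include <stdint.h>
  #include <stdio.h>

  /* Illustration only: roughly what get_misalign_in_elems computes, for a
     hypothetical 256-bit vector of int32_t (8 lanes, 32-byte alignment).  */
  int
  main (void)
  {
    enum { TARGET_ALIGN = 32, ELEM_SIZE = 4, VF = 8 };
    int32_t buf[16] __attribute__ ((aligned (TARGET_ALIGN)));
    int32_t *start = buf + 3;  /* Start pointer is 3 elements past alignment.  */

    uintptr_t misalign_in_bytes = (uintptr_t) start & (TARGET_ALIGN - 1);
    unsigned int misalign_in_elems = misalign_in_bytes / ELEM_SIZE;

    /* The first vector iteration is anchored at the previous aligned address
       (start - misalign_in_elems); its leading lanes are inactive.  */
    for (unsigned int lane = 0; lane < VF; ++lane)
      printf ("lane %u: %s\n", lane,
              lane < misalign_in_elems ? "false" : "true");
    return 0;
  }

The data references are then moved back by this amount (the MINUS_EXPR case
of vect_update_init_of_dr below), so the vector accesses themselves are
aligned.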

On SVE, the optimisation is only useful for vector-length-specific
code.  Vector-length-agnostic code doesn't try to align vectors
since the vector length might not be a power of 2.

Tested on aarch64-linux-gnu (with and without SVE), x86_64-linux-gnu
and powerpc64le-linux-gnu.  OK to install?

Richard


2017-11-17  Richard Sandiford  <richard.sandiford@linaro.org>
	    Alan Hayward  <alan.hayward@arm.com>
	    David Sherwood  <david.sherwood@arm.com>

gcc/
	* tree-vectorizer.h (_loop_vec_info::mask_skip_niters): New field.
	(LOOP_VINFO_MASK_SKIP_NITERS): New macro.
	(vect_use_loop_mask_for_alignment_p): New function.
	(vect_prepare_for_masked_peels, vect_gen_while_not): Declare.
	* tree-vect-loop-manip.c (vect_set_loop_masks_directly): Add an
	niters_skip argument.  Make sure that the first niters_skip elements
	of the first iteration are inactive.
	(vect_set_loop_condition_masked): Handle LOOP_VINFO_MASK_SKIP_NITERS.
	Update call to vect_set_loop_masks_directly.
	(get_misalign_in_elems): New function, split out from...
	(vect_gen_prolog_loop_niters): ...here.
	(vect_update_init_of_dr): Take a code argument that specifies whether
	the adjustment should be added or subtracted.
	(vect_update_init_of_drs): Likewise.
	(vect_prepare_for_masked_peels): New function.
	(vect_do_peeling): Skip prologue peeling if we're using a mask
	instead.  Update call to vect_update_inits_of_drs.
	* tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize
	mask_skip_niters.
	(vect_analyze_loop_2): Allow fully-masked loops with peeling for
	alignment.  Do not include the number of peeled iterations in
	the minimum threshold in that case.
	(vectorizable_induction): Adjust the start value down by
	LOOP_VINFO_MASK_SKIP_NITERS iterations.
	(vect_transform_loop): Call vect_prepare_for_masked_peels.
	Take the number of skipped iterations into account when calculating
	the loop bounds.
	* tree-vect-stmts.c (vect_gen_while_not): New function.

gcc/testsuite/
	* gcc.target/aarch64/sve_nopeel_1.c: New test.
	* gcc.target/aarch64/sve_peel_ind_1.c: Likewise.
	* gcc.target/aarch64/sve_peel_ind_1_run.c: Likewise.
	* gcc.target/aarch64/sve_peel_ind_2.c: Likewise.
	* gcc.target/aarch64/sve_peel_ind_2_run.c: Likewise.
	* gcc.target/aarch64/sve_peel_ind_3.c: Likewise.
	* gcc.target/aarch64/sve_peel_ind_3_run.c: Likewise.
	* gcc.target/aarch64/sve_peel_ind_4.c: Likewise.
	* gcc.target/aarch64/sve_peel_ind_4_run.c: Likewise.

Comments

Jeff Law Dec. 14, 2017, 12:12 a.m. UTC | #1
On 11/17/2017 08:13 AM, Richard Sandiford wrote:
> This patch adds support for aligning vectors by using a partial
> first iteration.  E.g. if the start pointer is 3 elements beyond
> an aligned address, the first iteration will have a mask in which
> the first three elements are false.
> 
> On SVE, the optimisation is only useful for vector-length-specific
> code.  Vector-length-agnostic code doesn't try to align vectors
> since the vector length might not be a power of 2.
> 
> Tested on aarch64-linux-gnu (with and without SVE), x86_64-linux-gnu
> and powerpc64le-linux-gnu.  OK to install?
> 
> Richard
> 
> 
> 2017-11-17  Richard Sandiford  <richard.sandiford@linaro.org>
> 	    Alan Hayward  <alan.hayward@arm.com>
> 	    David Sherwood  <david.sherwood@arm.com>
> 
> gcc/
> 	* tree-vectorizer.h (_loop_vec_info::mask_skip_niters): New field.
> 	(LOOP_VINFO_MASK_SKIP_NITERS): New macro.
> 	(vect_use_loop_mask_for_alignment_p): New function.
> 	(vect_prepare_for_masked_peels, vect_gen_while_not): Declare.
> 	* tree-vect-loop-manip.c (vect_set_loop_masks_directly): Add an
> 	niters_skip argument.  Make sure that the first niters_skip elements
> 	of the first iteration are inactive.
> 	(vect_set_loop_condition_masked): Handle LOOP_VINFO_MASK_SKIP_NITERS.
> 	Update call to vect_set_loop_masks_directly.
> 	(get_misalign_in_elems): New function, split out from...
> 	(vect_gen_prolog_loop_niters): ...here.
> 	(vect_update_init_of_dr): Take a code argument that specifies whether
> 	the adjustment should be added or subtracted.
> 	(vect_update_init_of_drs): Likewise.
> 	(vect_prepare_for_masked_peels): New function.
> 	(vect_do_peeling): Skip prologue peeling if we're using a mask
> 	instead.  Update call to vect_update_inits_of_drs.
> 	* tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize
> 	mask_skip_niters.
> 	(vect_analyze_loop_2): Allow fully-masked loops with peeling for
> 	alignment.  Do not include the number of peeled iterations in
> 	the minimum threshold in that case.
> 	(vectorizable_induction): Adjust the start value down by
> 	LOOP_VINFO_MASK_SKIP_NITERS iterations.
> 	(vect_transform_loop): Call vect_prepare_for_masked_peels.
> 	Take the number of skipped iterations into account when calculating
> 	the loop bounds.
> 	* tree-vect-stmts.c (vect_gen_while_not): New function.

OK.
jeff
James Greenhalgh Jan. 7, 2018, 8:54 p.m. UTC | #2
On Thu, Dec 14, 2017 at 12:12:01AM +0000, Jeff Law wrote:
> On 11/17/2017 08:13 AM, Richard Sandiford wrote:
> > This patch adds support for aligning vectors by using a partial
> > first iteration.  E.g. if the start pointer is 3 elements beyond
> > an aligned address, the first iteration will have a mask in which
> > the first three elements are false.
> > 
> > On SVE, the optimisation is only useful for vector-length-specific
> > code.  Vector-length-agnostic code doesn't try to align vectors
> > since the vector length might not be a power of 2.
> > 
> > Tested on aarch64-linux-gnu (with and without SVE), x86_64-linux-gnu
> > and powerpc64le-linux-gnu.  OK to install?
> > 
> > Richard
> > 
> > 
> > 2017-11-17  Richard Sandiford  <richard.sandiford@linaro.org>
> > 	    Alan Hayward  <alan.hayward@arm.com>
> > 	    David Sherwood  <david.sherwood@arm.com>
> > 
> > gcc/
> > 	* tree-vectorizer.h (_loop_vec_info::mask_skip_niters): New field.
> > 	(LOOP_VINFO_MASK_SKIP_NITERS): New macro.
> > 	(vect_use_loop_mask_for_alignment_p): New function.
> > 	(vect_prepare_for_masked_peels, vect_gen_while_not): Declare.
> > 	* tree-vect-loop-manip.c (vect_set_loop_masks_directly): Add an
> > 	niters_skip argument.  Make sure that the first niters_skip elements
> > 	of the first iteration are inactive.
> > 	(vect_set_loop_condition_masked): Handle LOOP_VINFO_MASK_SKIP_NITERS.
> > 	Update call to vect_set_loop_masks_directly.
> > 	(get_misalign_in_elems): New function, split out from...
> > 	(vect_gen_prolog_loop_niters): ...here.
> > 	(vect_update_init_of_dr): Take a code argument that specifies whether
> > 	the adjustment should be added or subtracted.
> > 	(vect_update_init_of_drs): Likewise.
> > 	(vect_prepare_for_masked_peels): New function.
> > 	(vect_do_peeling): Skip prologue peeling if we're using a mask
> > 	instead.  Update call to vect_update_inits_of_drs.
> > 	* tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize
> > 	mask_skip_niters.
> > 	(vect_analyze_loop_2): Allow fully-masked loops with peeling for
> > 	alignment.  Do not include the number of peeled iterations in
> > 	the minimum threshold in that case.
> > 	(vectorizable_induction): Adjust the start value down by
> > 	LOOP_VINFO_MASK_SKIP_NITERS iterations.
> > 	(vect_transform_loop): Call vect_prepare_for_masked_peels.
> > 	Take the number of skipped iterations into account when calculating
> > 	the loop bounds.
> > 	* tree-vect-stmts.c (vect_gen_while_not): New function.
> OK.
> jeff

The AArch64 tests are OK, but:

> > Index: gcc/testsuite/gcc.target/aarch64/sve_peel_ind_2_run.c
> > ===================================================================
> > --- /dev/null	2017-11-14 14:28:07.424493901 +0000
> > +++ gcc/testsuite/gcc.target/aarch64/sve_peel_ind_2_run.c	2017-11-17 15:11:51.121849349 +0000
> > @@ -0,0 +1,18 @@
> > +/* { dg-do run { target aarch64_sve_hw } } */
> > +/* { dg-options "-O3 -march=armv8-a+sve -mtune=thunderx" } */
> > +/* { dg-options "-O3 -march=armv8-a+sve -mtune=thunderx -msve-vector-bits=256" { target aarch64_sve256_hw } } */
> > +

I'd put the comment from sve_peel_ind_2.c as to why we have
the -mtune=thunderx here too.

Thanks,
James

Patch

Index: gcc/tree-vectorizer.h
===================================================================
--- gcc/tree-vectorizer.h	2017-11-17 15:07:59.275435265 +0000
+++ gcc/tree-vectorizer.h	2017-11-17 15:11:51.124849349 +0000
@@ -351,6 +351,12 @@  typedef struct _loop_vec_info : public v
      on inactive scalars.  */
   vec_loop_masks masks;
 
+  /* If we are using a loop mask to align memory addresses, this variable
+     contains the number of vector elements that we should skip in the
+     first iteration of the vector loop (i.e. the number of leading
+     elements that should be false in the first mask).  */
+  tree mask_skip_niters;
+
   /* Type of the variables to use in the WHILE_ULT call for fully-masked
      loops.  */
   tree mask_compare_type;
@@ -480,6 +486,7 @@  #define LOOP_VINFO_FULLY_MASKED_P(L)
 #define LOOP_VINFO_VECT_FACTOR(L)          (L)->vectorization_factor
 #define LOOP_VINFO_MAX_VECT_FACTOR(L)      (L)->max_vectorization_factor
 #define LOOP_VINFO_MASKS(L)                (L)->masks
+#define LOOP_VINFO_MASK_SKIP_NITERS(L)     (L)->mask_skip_niters
 #define LOOP_VINFO_MASK_COMPARE_TYPE(L)    (L)->mask_compare_type
 #define LOOP_VINFO_PTR_MASK(L)             (L)->ptr_mask
 #define LOOP_VINFO_LOOP_NEST(L)            (L)->loop_nest
@@ -1230,6 +1237,17 @@  unlimited_cost_model (loop_p loop)
   return (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED);
 }
 
+/* Return true if the loop described by LOOP_VINFO is fully-masked and
+   if the first iteration should use a partial mask in order to achieve
+   alignment.  */
+
+static inline bool
+vect_use_loop_mask_for_alignment_p (loop_vec_info loop_vinfo)
+{
+  return (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+	  && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
+}
+
 /* Return the number of vectors of type VECTYPE that are needed to get
    NUNITS elements.  NUNITS should be based on the vectorization factor,
    so it is always a known multiple of the number of elements in VECTYPE.  */
@@ -1328,6 +1346,7 @@  extern void vect_loop_versioning (loop_v
 				  poly_uint64);
 extern struct loop *vect_do_peeling (loop_vec_info, tree, tree,
 				     tree *, tree *, tree *, int, bool, bool);
+extern void vect_prepare_for_masked_peels (loop_vec_info);
 extern source_location find_loop_location (struct loop *);
 extern bool vect_can_advance_ivs_p (loop_vec_info);
 
@@ -1392,6 +1411,7 @@  extern tree vect_gen_perm_mask_any (tree
 extern tree vect_gen_perm_mask_checked (tree, vec_perm_indices);
 extern void optimize_mask_stores (struct loop*);
 extern gcall *vect_gen_while (tree, tree, tree);
+extern tree vect_gen_while_not (gimple_seq *, tree, tree, tree);
 
 /* In tree-vect-data-refs.c.  */
 extern bool vect_can_force_dr_alignment_p (const_tree, unsigned int);
Index: gcc/tree-vect-loop-manip.c
===================================================================
--- gcc/tree-vect-loop-manip.c	2017-11-17 15:07:59.273608779 +0000
+++ gcc/tree-vect-loop-manip.c	2017-11-17 15:11:51.122849349 +0000
@@ -362,6 +362,11 @@  vect_maybe_permute_loop_masks (gimple_se
    times and has been vectorized according to LOOP_VINFO.  Each iteration
    of the vectorized loop handles VF iterations of the scalar loop.
 
+   If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
+   starts with NITERS_SKIP dummy iterations of the scalar loop before
+   the real work starts.  The mask elements for these dummy iterations
+   must be 0, to ensure that the extra iterations do not have an effect.
+
    It is known that:
 
      NITERS * RGM->max_nscalars_per_iter
@@ -373,7 +378,7 @@  vect_maybe_permute_loop_masks (gimple_se
 
    might overflow before hitting a value above:
 
-     NITERS * RGM->max_nscalars_per_iter
+     (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter
 
    This means that we cannot guarantee that such an induction variable
    would ever hit a value that produces a set of all-false masks for RGM.  */
@@ -383,7 +388,8 @@  vect_set_loop_masks_directly (struct loo
 			      gimple_seq *preheader_seq,
 			      gimple_stmt_iterator loop_cond_gsi,
 			      rgroup_masks *rgm, tree vf,
-			      tree niters, bool might_wrap_p)
+			      tree niters, tree niters_skip,
+			      bool might_wrap_p)
 {
   tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
   tree mask_type = rgm->mask_type;
@@ -391,10 +397,12 @@  vect_set_loop_masks_directly (struct loo
   poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
 
   /* Calculate the maximum number of scalar values that the rgroup
-     handles in total and the number that it handles for each iteration
-     of the vector loop.  */
+     handles in total, the number that it handles for each iteration
+     of the vector loop, and the number that it should skip during the
+     first iteration of the vector loop.  */
   tree nscalars_total = niters;
   tree nscalars_step = vf;
+  tree nscalars_skip = niters_skip;
   if (nscalars_per_iter != 1)
     {
       /* We checked before choosing to use a fully-masked loop that these
@@ -404,6 +412,9 @@  vect_set_loop_masks_directly (struct loo
 				     nscalars_total, factor);
       nscalars_step = gimple_build (preheader_seq, MULT_EXPR, compare_type,
 				    nscalars_step, factor);
+      if (nscalars_skip)
+	nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
+				      nscalars_skip, factor);
     }
 
   /* Create an induction variable that counts the number of scalars
@@ -416,29 +427,66 @@  vect_set_loop_masks_directly (struct loo
   create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
 	     insert_after, &index_before_incr, &index_after_incr);
 
-  tree test_index, test_limit;
+  tree test_index, test_limit, first_limit;
   gimple_stmt_iterator *test_gsi;
   if (might_wrap_p)
     {
       /* In principle the loop should stop iterating once the incremented
-	 IV reaches a value greater than or equal to NSCALAR_TOTAL.
-	 However, there's no guarantee that the IV hits a value above
-	 this value before wrapping around.  We therefore adjust the
-	 limit down by one IV step:
+	 IV reaches a value greater than or equal to:
+
+	   NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
+
+	 However, there's no guarantee that this addition doesn't overflow
+	 the comparison type, or that the IV hits a value above it before
+	 wrapping around.  We therefore adjust the limit down by one
+	 IV step:
 
-	   NSCALARS_TOTAL -[infinite-prec] NSCALARS_STEP
+	   (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
+	   -[infinite-prec] NSCALARS_STEP
 
 	 and compare the IV against this limit _before_ incrementing it.
 	 Since the comparison type is unsigned, we actually want the
 	 subtraction to saturate at zero:
 
-	   NSCALARS_TOTAL -[sat] NSCALARS_STEP.  */
+	   (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
+	   -[sat] NSCALARS_STEP
+
+	 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
+
+	   NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
+
+	 where the rightmost subtraction can be done directly in
+	 COMPARE_TYPE.  */
       test_index = index_before_incr;
+      tree adjust = nscalars_step;
+      if (nscalars_skip)
+	adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
+			       adjust, nscalars_skip);
       test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
-				 nscalars_total, nscalars_step);
+				 nscalars_total, adjust);
       test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
-				 test_limit, nscalars_step);
+				 test_limit, adjust);
       test_gsi = &incr_gsi;
+
+      /* Get a safe limit for the first iteration.  */
+      if (nscalars_skip)
+	{
+	  /* The first vector iteration can handle at most NSCALARS_STEP
+	     scalars.  NSCALARS_STEP <= CONST_LIMIT, and adding
+	     NSCALARS_SKIP to that cannot overflow.  */
+	  tree const_limit = build_int_cst (compare_type,
+					    LOOP_VINFO_VECT_FACTOR (loop_vinfo)
+					    * nscalars_per_iter);
+	  first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
+				      nscalars_total, const_limit);
+	  first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
+				      first_limit, nscalars_skip);
+	}
+      else
+	/* For the first iteration it doesn't matter whether the IV hits
+	   a value above NSCALARS_TOTAL.  That only matters for the latch
+	   condition.  */
+	first_limit = nscalars_total;
     }
   else
     {
@@ -446,7 +494,12 @@  vect_set_loop_masks_directly (struct loo
 	 the bound before wrapping.  */
       test_index = index_after_incr;
       test_limit = nscalars_total;
+      if (nscalars_skip)
+	test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
+				   test_limit, nscalars_skip);
       test_gsi = &loop_cond_gsi;
+
+      first_limit = test_limit;
     }
 
   /* Provide a definition of each mask in the group.  */
@@ -465,7 +518,7 @@  vect_set_loop_masks_directly (struct loo
 	 to have a full mask.  */
       poly_uint64 const_limit;
       bool first_iteration_full
-	= (poly_int_tree_p (nscalars_total, &const_limit)
+	= (poly_int_tree_p (first_limit, &const_limit)
 	   && must_ge (const_limit, (i + 1) * nscalars_per_mask));
 
       /* Rather than have a new IV that starts at BIAS and goes up to
@@ -482,12 +535,13 @@  vect_set_loop_masks_directly (struct loo
 					  bias_tree);
 	}
 
-      /* Create the initial mask.  */
+      /* Create the initial mask.  First include all scalars that
+	 are within the loop limit.  */
       tree init_mask = NULL_TREE;
       if (!first_iteration_full)
 	{
 	  tree start, end;
-	  if (nscalars_total == test_limit)
+	  if (first_limit == test_limit)
 	    {
 	      /* Use a natural test between zero (the initial IV value)
 		 and the loop limit.  The "else" block would be valid too,
@@ -498,8 +552,11 @@  vect_set_loop_masks_directly (struct loo
 	    }
 	  else
 	    {
+	      /* FIRST_LIMIT is the maximum number of scalars handled by the
+		 first iteration of the vector loop.  Test the portion
+		 associated with this mask.  */
 	      start = bias_tree;
-	      end = nscalars_total;
+	      end = first_limit;
 	    }
 
 	  init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
@@ -507,6 +564,22 @@  vect_set_loop_masks_directly (struct loo
 	  gimple_seq_add_stmt (preheader_seq, tmp_stmt);
 	}
 
+      /* Now AND out the bits that are within the number of skipped
+	 scalars.  */
+      poly_uint64 const_skip;
+      if (nscalars_skip
+	  && !(poly_int_tree_p (nscalars_skip, &const_skip)
+	       && must_le (const_skip, bias)))
+	{
+	  tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
+						    bias_tree, nscalars_skip);
+	  if (init_mask)
+	    init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
+				      init_mask, unskipped_mask);
+	  else
+	    init_mask = unskipped_mask;
+	}
+
       if (!init_mask)
 	/* First iteration is full.  */
 	init_mask = build_minus_one_cst (mask_type);
@@ -564,6 +637,9 @@  vect_set_loop_condition_masked (struct l
   else
     niters = gimple_convert (&preheader_seq, compare_type, niters);
 
+  /* Convert skip_niters to the right type.  */
+  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+
   /* Now calculate the value that the induction variable must be able
      to hit in order to ensure that we end the loop with an all-false mask.
      This involves adding the maximum number of inactive trailing scalar
@@ -572,6 +648,15 @@  vect_set_loop_condition_masked (struct l
   bool known_max_iters = max_loop_iterations (loop, &iv_limit);
   if (known_max_iters)
     {
+      if (niters_skip)
+	{
+	  /* Add the maximum number of skipped iterations to the
+	     maximum iteration count.  */
+	  if (TREE_CODE (niters_skip) == INTEGER_CST)
+	    iv_limit += wi::to_widest (niters_skip);
+	  else
+	    iv_limit += max_vf - 1;
+	}
       /* IV_LIMIT is the maximum number of latch iterations, which is also
 	 the maximum in-range IV value.  Round this value down to the previous
 	 vector alignment boundary and then add an extra full iteration.  */
@@ -617,7 +702,8 @@  vect_set_loop_condition_masked (struct l
 	test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
 						  &preheader_seq,
 						  loop_cond_gsi, rgm, vf,
-						  niters, might_wrap_p);
+						  niters, niters_skip,
+						  might_wrap_p);
       }
 
   /* Emit all accumulated statements.  */
@@ -1439,6 +1525,46 @@  vect_update_ivs_after_vectorizer (loop_v
     }
 }
 
+/* Return a gimple value containing the misalignment (measured in vector
+   elements) for the loop described by LOOP_VINFO, i.e. how many elements
+   it is away from a perfectly aligned address.  Add any new statements
+   to SEQ.  */
+
+static tree
+get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
+{
+  struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
+  gimple *dr_stmt = DR_STMT (dr);
+  stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
+  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+
+  unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
+  gcc_assert (target_align != 0);
+
+  bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
+  tree offset = (negative
+		 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
+		 : size_zero_node);
+  tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt, seq,
+							  offset);
+  tree type = unsigned_type_for (TREE_TYPE (start_addr));
+  tree target_align_minus_1 = build_int_cst (type, target_align - 1);
+  HOST_WIDE_INT elem_size
+    = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
+  tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
+
+  /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
+  tree int_start_addr = fold_convert (type, start_addr);
+  tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
+					target_align_minus_1);
+
+  /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
+  tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
+					elem_size_log);
+
+  return misalign_in_elems;
+}
+
 /* Function vect_gen_prolog_loop_niters
 
    Generate the number of iterations which should be peeled as prolog for the
@@ -1450,7 +1576,7 @@  vect_update_ivs_after_vectorizer (loop_v
    If the misalignment of DR is known at compile time:
      addr_mis = int mis = DR_MISALIGNMENT (dr);
    Else, compute address misalignment in bytes:
-     addr_mis = addr & (vectype_align - 1)
+     addr_mis = addr & (target_align - 1)
 
    prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
 
@@ -1497,33 +1623,17 @@  vect_gen_prolog_loop_niters (loop_vec_in
     }
   else
     {
-      bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
-      tree offset = negative
-	  ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
-      tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
-							      &stmts, offset);
-      tree type = unsigned_type_for (TREE_TYPE (start_addr));
-      tree target_align_minus_1 = build_int_cst (type, target_align - 1);
+      tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
+      tree type = TREE_TYPE (misalign_in_elems);
       HOST_WIDE_INT elem_size
 	= int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
-      tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
       HOST_WIDE_INT align_in_elems = target_align / elem_size;
       tree align_in_elems_minus_1 = build_int_cst (type, align_in_elems - 1);
       tree align_in_elems_tree = build_int_cst (type, align_in_elems);
-      tree misalign_in_bytes;
-      tree misalign_in_elems;
-
-      /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
-      misalign_in_bytes
-	= fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr),
-		       target_align_minus_1);
-
-      /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
-      misalign_in_elems
-	= fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes, elem_size_log);
 
       /* Create:  (niters_type) ((align_in_elems - misalign_in_elems)
 				 & (align_in_elems - 1)).  */
+      bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
       if (negative)
 	iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
 			     align_in_elems_tree);
@@ -1563,20 +1673,22 @@  vect_gen_prolog_loop_niters (loop_vec_in
 
 /* Function vect_update_init_of_dr
 
-   NITERS iterations were peeled from LOOP.  DR represents a data reference
-   in LOOP.  This function updates the information recorded in DR to
-   account for the fact that the first NITERS iterations had already been
-   executed.  Specifically, it updates the OFFSET field of DR.  */
+   If CODE is PLUS, the vector loop starts NITERS iterations after the
+   scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
+   iterations before the scalar one (using masking to skip inactive
+   elements).  This function updates the information recorded in DR to
+   account for the difference.  Specifically, it updates the OFFSET
+   field of DR.  */
 
 static void
-vect_update_init_of_dr (struct data_reference *dr, tree niters)
+vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
 {
   tree offset = DR_OFFSET (dr);
 
   niters = fold_build2 (MULT_EXPR, sizetype,
 			fold_convert (sizetype, niters),
 			fold_convert (sizetype, DR_STEP (dr)));
-  offset = fold_build2 (PLUS_EXPR, sizetype,
+  offset = fold_build2 (code, sizetype,
 			fold_convert (sizetype, offset), niters);
   DR_OFFSET (dr) = offset;
 }
@@ -1584,14 +1696,12 @@  vect_update_init_of_dr (struct data_refe
 
 /* Function vect_update_inits_of_drs
 
-   NITERS iterations were peeled from the loop represented by LOOP_VINFO.
-   This function updates the information recorded for the data references in
-   the loop to account for the fact that the first NITERS iterations had
-   already been executed.  Specifically, it updates the initial_condition of
-   the access_function of all the data_references in the loop.  */
+   Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
+   CODE and NITERS are as for vect_update_inits_of_dr.  */
 
 static void
-vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
+vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
+			  tree_code code)
 {
   unsigned int i;
   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
@@ -1618,9 +1728,57 @@  vect_update_inits_of_drs (loop_vec_info
     }
 
   FOR_EACH_VEC_ELT (datarefs, i, dr)
-    vect_update_init_of_dr (dr, niters);
+    vect_update_init_of_dr (dr, niters, code);
 }
 
+/* For the information recorded in LOOP_VINFO prepare the loop for peeling
+   by masking.  This involves calculating the number of iterations to
+   be peeled and then aligning all memory references appropriately.  */
+
+void
+vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
+{
+  tree misalign_in_elems;
+  tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
+
+  gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
+
+  /* From the information recorded in LOOP_VINFO get the number of iterations
+     that need to be skipped via masking.  */
+  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
+    {
+      poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
+			     - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
+      misalign_in_elems = build_int_cst (type, misalign);
+    }
+  else
+    {
+      gimple_seq seq1 = NULL, seq2 = NULL;
+      misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
+      misalign_in_elems = fold_convert (type, misalign_in_elems);
+      misalign_in_elems = force_gimple_operand (misalign_in_elems,
+						&seq2, true, NULL_TREE);
+      gimple_seq_add_seq (&seq1, seq2);
+      if (seq1)
+	{
+	  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
+	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
+	  gcc_assert (!new_bb);
+	}
+    }
+
+  if (dump_enabled_p ())
+    {
+      dump_printf_loc (MSG_NOTE, vect_location,
+		       "misalignment for fully-masked loop: ");
+      dump_generic_expr (MSG_NOTE, TDF_SLIM, misalign_in_elems);
+      dump_printf (MSG_NOTE, "\n");
+    }
+
+  LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
+
+  vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
+}
 
 /* This function builds ni_name = number of iterations.  Statements
    are emitted on the loop preheader edge.  If NEW_VAR_P is not NULL, set
@@ -2226,7 +2384,9 @@  vect_do_peeling (loop_vec_info loop_vinf
   int bound_prolog = 0;
   poly_uint64 bound_scalar = 0;
   int estimated_vf;
-  int prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
+  int prolog_peeling = 0;
+  if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
+    prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
   bool epilog_peeling = (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
 			 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
 
@@ -2343,7 +2503,7 @@  vect_do_peeling (loop_vec_info loop_vinf
 	  scale_loop_profile (prolog, prob_prolog, bound_prolog);
 	}
       /* Update init address of DRs.  */
-      vect_update_inits_of_drs (loop_vinfo, niters_prolog);
+      vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
       /* Update niters for vector loop.  */
       LOOP_VINFO_NITERS (loop_vinfo)
 	= fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
Index: gcc/tree-vect-loop.c
===================================================================
--- gcc/tree-vect-loop.c	2017-11-17 15:09:28.969330125 +0000
+++ gcc/tree-vect-loop.c	2017-11-17 15:11:51.123849349 +0000
@@ -1119,6 +1119,7 @@  _loop_vec_info::_loop_vec_info (struct l
     versioning_threshold (0),
     vectorization_factor (0),
     max_vectorization_factor (0),
+    mask_skip_niters (NULL_TREE),
     mask_compare_type (NULL_TREE),
     unaligned_dr (NULL),
     peeling_for_alignment (0),
@@ -2266,16 +2267,6 @@  vect_analyze_loop_2 (loop_vec_info loop_
 			 " gaps is required.\n");
     }
 
-  if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
-      && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
-    {
-      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
-      if (dump_enabled_p ())
-	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			 "can't use a fully-masked loop because peeling for"
-			 " alignment is required.\n");
-    }
-
   /* Decide whether to use a fully-masked loop for this vectorization
      factor.  */
   LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
@@ -2376,18 +2367,21 @@  vect_analyze_loop_2 (loop_vec_info loop_
      increase threshold for this case if necessary.  */
   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
     {
-      poly_uint64 niters_th;
+      poly_uint64 niters_th = 0;
 
-      /* Niters for peeled prolog loop.  */
-      if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
+      if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
 	{
-	  struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
-	  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
-
-	  niters_th = TYPE_VECTOR_SUBPARTS (vectype) - 1;
+	  /* Niters for peeled prolog loop.  */
+	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
+	    {
+	      struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
+	      tree vectype
+		= STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
+	      niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
+	    }
+	  else
+	    niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
 	}
-      else
-	niters_th = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
 
       /* Niters for at least one iteration of vectorized loop.  */
       if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
@@ -7167,9 +7161,28 @@  vectorizable_induction (gimple *phi,
   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
 				     loop_preheader_edge (iv_loop));
 
-  /* Convert the step to the desired type.  */
+  /* Convert the initial value and step to the desired type.  */
   stmts = NULL;
+  init_expr = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
+
+  /* If we are using the loop mask to "peel" for alignment then we need
+     to adjust the start value here.  */
+  tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+  if (skip_niters != NULL_TREE)
+    {
+      if (FLOAT_TYPE_P (vectype))
+	skip_niters = gimple_build (&stmts, FLOAT_EXPR, TREE_TYPE (vectype),
+				    skip_niters);
+      else
+	skip_niters = gimple_convert (&stmts, TREE_TYPE (vectype),
+				      skip_niters);
+      tree skip_step = gimple_build (&stmts, MULT_EXPR, TREE_TYPE (vectype),
+				     skip_niters, step_expr);
+      init_expr = gimple_build (&stmts, MINUS_EXPR, TREE_TYPE (vectype),
+				init_expr, skip_step);
+    }
+
   if (stmts)
     {
       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
@@ -8040,6 +8053,11 @@  vect_transform_loop (loop_vec_info loop_
 
   split_edge (loop_preheader_edge (loop));
 
+  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+      && vect_use_loop_mask_for_alignment_p (loop_vinfo))
+    /* This will deal with any possible peeling.  */
+    vect_prepare_for_masked_peels (loop_vinfo);
+
   /* FORNOW: the vectorizer supports only loops which body consist
      of one basic block (header + empty latch). When the vectorizer will
      support more involved loop forms, the order by which the BBs are
@@ -8319,29 +8337,40 @@  vect_transform_loop (loop_vec_info loop_
   /* +1 to convert latch counts to loop iteration counts,
      -min_epilogue_iters to remove iterations that cannot be performed
        by the vector code.  */
-  int bias = 1 - min_epilogue_iters;
+  int bias_for_lowest = 1 - min_epilogue_iters;
+  int bias_for_assumed = bias_for_lowest;
+  int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
+  if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+    {
+      /* When the amount of peeling is known at compile time, the first
+	 iteration will have exactly alignment_npeels active elements.
+	 In the worst case it will have at least one.  */
+      int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
+      bias_for_lowest += lowest_vf - min_first_active;
+      bias_for_assumed += assumed_vf - min_first_active;
+    }
   /* In these calculations the "- 1" converts loop iteration counts
      back to latch counts.  */
   if (loop->any_upper_bound)
     loop->nb_iterations_upper_bound
       = (final_iter_may_be_partial
-	 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias,
+	 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
 			  lowest_vf) - 1
-	 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias,
+	 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
 			   lowest_vf) - 1);
   if (loop->any_likely_upper_bound)
     loop->nb_iterations_likely_upper_bound
       = (final_iter_may_be_partial
-	 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound + bias,
-			  lowest_vf) - 1
-	 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias,
-			   lowest_vf) - 1);
+	 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
+			  + bias_for_lowest, lowest_vf) - 1
+	 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
+			   + bias_for_lowest, lowest_vf) - 1);
   if (loop->any_estimate)
     loop->nb_iterations_estimate
       = (final_iter_may_be_partial
-	 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias,
+	 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
 			  assumed_vf) - 1
-	 : wi::udiv_floor (loop->nb_iterations_estimate + bias,
+	 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
 			   assumed_vf) - 1);
 
   if (dump_enabled_p ())
Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c	2017-11-17 15:07:59.275435265 +0000
+++ gcc/tree-vect-stmts.c	2017-11-17 15:11:51.124849349 +0000
@@ -9975,3 +9975,16 @@  vect_gen_while (tree mask, tree start_in
   gimple_call_set_lhs (call, mask);
   return call;
 }
+
+/* Generate a vector mask of type MASK_TYPE for which index I is false iff
+   J + START_INDEX < END_INDEX for all J <= I.  Add the statements to SEQ.  */
+
+tree
+vect_gen_while_not (gimple_seq *seq, tree mask_type, tree start_index,
+		    tree end_index)
+{
+  tree tmp = make_ssa_name (mask_type);
+  gcall *call = vect_gen_while (tmp, start_index, end_index);
+  gimple_seq_add_stmt (seq, call);
+  return gimple_build (seq, BIT_NOT_EXPR, mask_type, tmp);
+}
Index: gcc/testsuite/gcc.target/aarch64/sve_nopeel_1.c
===================================================================
--- /dev/null	2017-11-14 14:28:07.424493901 +0000
+++ gcc/testsuite/gcc.target/aarch64/sve_nopeel_1.c	2017-11-17 15:11:51.120849349 +0000
@@ -0,0 +1,39 @@ 
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+#define TEST(NAME, TYPE)			\
+ void						\
+ NAME##1 (TYPE *x, int n)			\
+ {						\
+   for (int i = 0; i < n; ++i)			\
+     x[i] += 1;					\
+ }						\
+ TYPE NAME##_array[1024];			\
+ void						\
+ NAME##2 (void)					\
+ {						\
+   for (int i = 1; i < 200; ++i)		\
+     NAME##_array[i] += 1;			\
+ }
+
+TEST (s8, int8_t)
+TEST (u8, uint8_t)
+TEST (s16, int16_t)
+TEST (u16, uint16_t)
+TEST (s32, int32_t)
+TEST (u32, uint32_t)
+TEST (s64, int64_t)
+TEST (u64, uint64_t)
+TEST (f16, _Float16)
+TEST (f32, float)
+TEST (f64, double)
+
+/* No scalar memory accesses.  */
+/* { dg-final { scan-assembler-not {[wx][0-9]*, \[} } } */
+/* 2 for each NAME##1 test, one in the header and one in the main loop
+   and 1 for each NAME##2 test, in the main loop only.  */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b,} 6 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h,} 6 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s,} 9 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 9 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve_peel_ind_1.c
===================================================================
--- /dev/null	2017-11-14 14:28:07.424493901 +0000
+++ gcc/testsuite/gcc.target/aarch64/sve_peel_ind_1.c	2017-11-17 15:11:51.121849349 +0000
@@ -0,0 +1,27 @@ 
+/* { dg-do compile } */
+/* Pick an arbitrary target for which unaligned accesses are more
+   expensive.  */
+/* { dg-options "-O3 -march=armv8-a+sve -msve-vector-bits=256 -mtune=thunderx" } */
+
+#define N 512
+#define START 1
+#define END 505
+
+int x[N] __attribute__((aligned(32)));
+
+void __attribute__((noinline, noclone))
+foo (void)
+{
+  unsigned int v = 0;
+  for (unsigned int i = START; i < END; ++i)
+    {
+      x[i] = v;
+      v += 5;
+    }
+}
+
+/* We should operate on aligned vectors.  */
+/* { dg-final { scan-assembler {\tadrp\tx[0-9]+, x\n} } } */
+/* We should use an induction that starts at -5, with only the last
+   7 elements of the first iteration being active.  */
+/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #-5, #5\n} } } */
Index: gcc/testsuite/gcc.target/aarch64/sve_peel_ind_1_run.c
===================================================================
--- /dev/null	2017-11-14 14:28:07.424493901 +0000
+++ gcc/testsuite/gcc.target/aarch64/sve_peel_ind_1_run.c	2017-11-17 15:11:51.121849349 +0000
@@ -0,0 +1,18 @@ 
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O3 -march=armv8-a+sve -mtune=thunderx" } */
+/* { dg-options "-O3 -march=armv8-a+sve -mtune=thunderx -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#include "sve_peel_ind_1.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  foo ();
+  for (int i = 0; i < N; ++i)
+    {
+      if (x[i] != (i < START || i >= END ? 0 : (i - START) * 5))
+	__builtin_abort ();
+      asm volatile ("" ::: "memory");
+    }
+  return 0;
+}
Index: gcc/testsuite/gcc.target/aarch64/sve_peel_ind_2.c
===================================================================
--- /dev/null	2017-11-14 14:28:07.424493901 +0000
+++ gcc/testsuite/gcc.target/aarch64/sve_peel_ind_2.c	2017-11-17 15:11:51.121849349 +0000
@@ -0,0 +1,22 @@ 
+/* { dg-do compile } */
+/* Pick an arbitrary target for which unaligned accesses are more
+   expensive.  */
+/* { dg-options "-O3 -march=armv8-a+sve -msve-vector-bits=256 -mtune=thunderx" } */
+
+#define N 512
+#define START 7
+#define END 22
+
+int x[N] __attribute__((aligned(32)));
+
+void __attribute__((noinline, noclone))
+foo (void)
+{
+  for (unsigned int i = START; i < END; ++i)
+    x[i] = i;
+}
+
+/* We should operate on aligned vectors.  */
+/* { dg-final { scan-assembler {\tadrp\tx[0-9]+, x\n} } } */
+/* We should unroll the loop three times.  */
+/* { dg-final { scan-assembler-times "\tst1w\t" 3 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve_peel_ind_2_run.c
===================================================================
--- /dev/null	2017-11-14 14:28:07.424493901 +0000
+++ gcc/testsuite/gcc.target/aarch64/sve_peel_ind_2_run.c	2017-11-17 15:11:51.121849349 +0000
@@ -0,0 +1,18 @@ 
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O3 -march=armv8-a+sve -mtune=thunderx" } */
+/* { dg-options "-O3 -march=armv8-a+sve -mtune=thunderx -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#include "sve_peel_ind_2.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  foo ();
+  for (int i = 0; i < N; ++i)
+    {
+      if (x[i] != (i < START || i >= END ? 0 : i))
+	__builtin_abort ();
+      asm volatile ("" ::: "memory");
+    }
+  return 0;
+}
Index: gcc/testsuite/gcc.target/aarch64/sve_peel_ind_3.c
===================================================================
--- /dev/null	2017-11-14 14:28:07.424493901 +0000
+++ gcc/testsuite/gcc.target/aarch64/sve_peel_ind_3.c	2017-11-17 15:11:51.121849349 +0000
@@ -0,0 +1,21 @@ 
+/* { dg-do compile } */
+/* Pick an arbitrary target for which unaligned accesses are more
+   expensive.  */
+/* { dg-options "-O3 -march=armv8-a+sve -msve-vector-bits=256 -mtune=thunderx" } */
+
+#define N 32
+#define MAX_START 8
+#define COUNT 16
+
+int x[MAX_START][N] __attribute__((aligned(32)));
+
+void __attribute__((noinline, noclone))
+foo (int start)
+{
+  for (int i = start; i < start + COUNT; ++i)
+    x[start][i] = i;
+}
+
+/* We should operate on aligned vectors.  */
+/* { dg-final { scan-assembler {\tadrp\tx[0-9]+, x\n} } } */
+/* { dg-final { scan-assembler {\tubfx\t} } } */
Index: gcc/testsuite/gcc.target/aarch64/sve_peel_ind_3_run.c
===================================================================
--- /dev/null	2017-11-14 14:28:07.424493901 +0000
+++ gcc/testsuite/gcc.target/aarch64/sve_peel_ind_3_run.c	2017-11-17 15:11:51.121849349 +0000
@@ -0,0 +1,21 @@ 
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O3 -march=armv8-a+sve -mtune=thunderx" } */
+/* { dg-options "-O3 -march=armv8-a+sve -mtune=thunderx -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#include "sve_peel_ind_3.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  for (int start = 0; start < MAX_START; ++start)
+    {
+      foo (start);
+      for (int i = 0; i < N; ++i)
+	{
+	  if (x[start][i] != (i < start || i >= start + COUNT ? 0 : i))
+	    __builtin_abort ();
+	  asm volatile ("" ::: "memory");
+	}
+    }
+  return 0;
+}
Index: gcc/testsuite/gcc.target/aarch64/sve_peel_ind_4.c
===================================================================
--- /dev/null	2017-11-14 14:28:07.424493901 +0000
+++ gcc/testsuite/gcc.target/aarch64/sve_peel_ind_4.c	2017-11-17 15:11:51.121849349 +0000
@@ -0,0 +1,21 @@ 
+/* { dg-do compile } */
+/* Pick an arbitrary target for which unaligned accesses are more
+   expensive.  */
+/* { dg-options "-Ofast -march=armv8-a+sve -msve-vector-bits=256 -mtune=thunderx -fno-vect-cost-model" } */
+
+#define START 1
+#define END 505
+
+void __attribute__((noinline, noclone))
+foo (double *x)
+{
+  double v = 10.0;
+  for (unsigned int i = START; i < END; ++i)
+    {
+      x[i] = v;
+      v += 5.0;
+    }
+}
+
+/* We should operate on aligned vectors.  */
+/* { dg-final { scan-assembler {\tubfx\t} } } */
Index: gcc/testsuite/gcc.target/aarch64/sve_peel_ind_4_run.c
===================================================================
--- /dev/null	2017-11-14 14:28:07.424493901 +0000
+++ gcc/testsuite/gcc.target/aarch64/sve_peel_ind_4_run.c	2017-11-17 15:11:51.121849349 +0000
@@ -0,0 +1,29 @@ 
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast -march=armv8-a+sve -mtune=thunderx" } */
+/* { dg-options "-Ofast -march=armv8-a+sve -mtune=thunderx -mtune=thunderx" { target aarch64_sve256_hw } } */
+
+#include "sve_peel_ind_4.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  double x[END + 1];
+  for (int i = 0; i < END + 1; ++i)
+    {
+      x[i] = i;
+      asm volatile ("" ::: "memory");
+    }
+  foo (x);
+  for (int i = 0; i < END + 1; ++i)
+    {
+      double expected;
+      if (i < START || i >= END)
+	expected = i;
+      else
+	expected = 10 + (i - START) * 5;
+      if (x[i] != expected)
+	__builtin_abort ();
+      asm volatile ("" ::: "memory");
+    }
+  return 0;
+}