/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2022 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit (at) il.ibm.com>
   and Ira Rosen <irar (at) il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#define INCLUDE_ALGORITHM
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "predict.h"
#include "memmodel.h"
#include "tm_p.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "cgraph.h"
#include "dumpfile.h"
#include "alias.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "tree-eh.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "expr.h"
#include "builtins.h"
#include "tree-cfg.h"
#include "tree-hash-traits.h"
#include "vec-perm-indices.h"
#include "internal-fn.h"
#include "gimple-fold.h"

/* Return true if load- or store-lanes optab OPTAB is implemented for
   COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */

static bool
vect_lanes_optab_supported_p (const char *name, convert_optab optab,
                              tree vectype, unsigned HOST_WIDE_INT count)
{
  machine_mode mode, array_mode;
  bool limit_p;

  mode = TYPE_MODE (vectype);
  if (!targetm.array_mode (mode, count).exists (&array_mode))
    {
      poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
      limit_p = !targetm.array_mode_supported_p (mode, count);
      if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "no array mode for %s[%wu]\n",
                             GET_MODE_NAME (mode), count);
          return false;
        }
    }

  if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "cannot use %s<%s><%s>\n", name,
                         GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
                     GET_MODE_NAME (mode));

  return true;
}
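
/* For example (illustrative only, not target-specific): with V4SI vectors
   and COUNT == 3 the "array mode" groups three vectors into one 48-byte
   block, and this predicate asks whether the target provides
   vec_load_lanes / vec_store_lanes patterns for that combination, as used
   when vectorizing interleaved accesses such as

     for (i = 0; i < n; i++)
       {
         r[i] = rgb[3*i];
         g[i] = rgb[3*i + 1];
         b[i] = rgb[3*i + 2];
       }  */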


/* Return the smallest scalar part of STMT_INFO.
   This is used to determine the vectype of the stmt.  We generally set the
   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different than the type of the arguments (e.g., demotion,
   promotion), vectype will be reset appropriately (later).  Note that we have
   to visit the smallest datatype in this function, because that determines
   the VF.  If the smallest datatype in the loop is present only as the rhs
   of a promotion operation - we'd miss it.
   Such a case, where a variable of this datatype does not appear in the lhs
   anywhere in the loop, can only occur if it's an invariant: e.g.:
   'int_x = (int) short_inv', which we'd expect to have been optimized away
   by invariant motion.  However, we cannot rely on invariant motion to
   always take invariants out of the loop, and so in the case of promotion
   we also have to check the rhs.  */

tree
vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
{
  HOST_WIDE_INT lhs, rhs;

  /* During the analysis phase, this function is called on arbitrary
     statements that might not have scalar results.  */
  if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
    return scalar_type;

  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));

  gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
  if (assign)
    {
      scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
      if (gimple_assign_cast_p (assign)
          || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
          || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
          || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
          || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
          || gimple_assign_rhs_code (assign) == WIDEN_PLUS_EXPR
          || gimple_assign_rhs_code (assign) == WIDEN_MINUS_EXPR
          || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
        {
          tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));

          rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
          if (rhs < lhs)
            scalar_type = rhs_type;
        }
    }
  else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
    {
      unsigned int i = 0;
      if (gimple_call_internal_p (call))
        {
          internal_fn ifn = gimple_call_internal_fn (call);
          if (internal_load_fn_p (ifn))
            /* For loads the LHS type does the trick.  */
            i = ~0U;
          else if (internal_store_fn_p (ifn))
            {
              /* For stores use the type of the stored value.  */
              i = internal_fn_stored_value_index (ifn);
              scalar_type = TREE_TYPE (gimple_call_arg (call, i));
              i = ~0U;
            }
          else if (internal_fn_mask_index (ifn) == 0)
            i = 1;
        }
      if (i < gimple_call_num_args (call))
        {
          tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
          if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
            {
              rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
              if (rhs < lhs)
                scalar_type = rhs_type;
            }
        }
    }

  return scalar_type;
}
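
/* Illustrative example (not part of the implementation): in a loop like

     short *s; int *d;
     for (i = 0; i < n; i++)
       d[i] = (int) s[i] + 1;

   only 'int' ever appears on a lhs, but the smallest scalar type is
   'short', and it is the short elements that determine the vectorization
   factor (e.g. 8 rather than 4 lanes for 128-bit vectors).  */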


/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
   tested at run-time.  Return TRUE if DDR was successfully inserted.
   Return false if versioning is not supported.  */

static opt_result
vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  if ((unsigned) param_vect_max_version_for_alias_checks == 0)
    return opt_result::failure_at (vect_location,
                                   "will not create alias checks, as"
                                   " --param vect-max-version-for-alias-checks"
                                   " == 0\n");

  opt_result res
    = runtime_alias_check_p (ddr, loop,
                             optimize_loop_nest_for_speed_p (loop));
  if (!res)
    return res;

  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
  return opt_result::success ();
}

/* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */

static void
vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
{
  const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
  for (unsigned int i = 0; i < checks.length (); ++i)
    if (checks[i] == value)
      return;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "need run-time check that %T is nonzero\n",
                     value);
  LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
}

/* Return true if we know that the order of vectorized DR_INFO_A and
   vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
   DR_INFO_B.  At least one of the accesses is a write.  */

static bool
vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
{
  stmt_vec_info stmtinfo_a = dr_info_a->stmt;
  stmt_vec_info stmtinfo_b = dr_info_b->stmt;

  /* Single statements are always kept in their original order.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
      && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
    return true;

  /* STMT_A and STMT_B belong to overlapping groups.  All loads are
     emitted at the position of the first scalar load.
     Stores in a group are emitted at the position of the last scalar store.
     Compute that position and check whether the resulting order matches
     the current one.  */
  stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
  if (il_a)
    {
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
        for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
             s = DR_GROUP_NEXT_ELEMENT (s))
          il_a = get_later_stmt (il_a, s);
      else /* DR_IS_READ */
        for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
             s = DR_GROUP_NEXT_ELEMENT (s))
          if (get_later_stmt (il_a, s) == il_a)
            il_a = s;
    }
  else
    il_a = stmtinfo_a;
  stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
  if (il_b)
    {
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
        for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
             s = DR_GROUP_NEXT_ELEMENT (s))
          il_b = get_later_stmt (il_b, s);
      else /* DR_IS_READ */
        for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
             s = DR_GROUP_NEXT_ELEMENT (s))
          if (get_later_stmt (il_b, s) == il_b)
            il_b = s;
    }
  else
    il_b = stmtinfo_b;
  bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
  return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
}
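
/* For instance (illustrative only): in a group like

     x = a[i];      <-- vectorized load of { a[i], a[i+1] } is emitted here
     a[i] = y;
     z = a[i+1];

   the load of a[i+1] would be moved before the store to a[i] by the
   transform; the function above detects that the scalar order of the read
   and the write is not preserved in such cases.  */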

/* A subroutine of vect_analyze_data_ref_dependence.  Handle
   DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
   distances.  These distances are conservatively correct but they don't
   reflect a guaranteed dependence.

   Return true if this function does all the work necessary to avoid
   an alias or false if the caller should use the dependence distances
   to limit the vectorization factor in the usual way.  LOOP_DEPTH is
   the depth of the loop described by LOOP_VINFO and the other arguments
   are as for vect_analyze_data_ref_dependence.  */

static bool
vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
                                       loop_vec_info loop_vinfo,
                                       int loop_depth, unsigned int *max_vf)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
    {
      int dist = dist_v[loop_depth];
      if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
        {
          /* If the user asserted safelen >= DIST consecutive iterations
             can be executed concurrently, assume independence.

             ??? An alternative would be to add the alias check even
             in this case, and vectorize the fallback loop with the
             maximum VF set to safelen.  However, if the user has
             explicitly given a length, it's less likely that that
             would be a win.  */
          if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
            {
              if ((unsigned int) loop->safelen < *max_vf)
                *max_vf = loop->safelen;
              LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
              continue;
            }

          /* For dependence distances of 2 or more, we have the option
             of limiting VF or checking for an alias at runtime.
             Prefer to check at runtime if we can, to avoid limiting
             the VF unnecessarily when the bases are in fact independent.

             Note that the alias checks will be removed if the VF ends up
             being small enough.  */
          dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
          dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
          return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
                  && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
                  && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
        }
    }
  return true;
}
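
/* As an illustration (not part of the implementation): for

     #pragma omp simd safelen(8)
     for (i = 0; i < n; i++)
       a[i] = a[i + 4] * 2;

   the conservatively computed distance of 4 is covered by the asserted
   safelen of 8, so independence is assumed as long as the vectorization
   factor is capped at 8.  */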


/* Function vect_analyze_data_ref_dependence.

   FIXME: I needed to change the sense of the returned flag.

   Return FALSE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return TRUE.  Adjust *MAX_VF according to
   the data dependence.  */

static opt_result
vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
                                  loop_vec_info loop_vinfo,
                                  unsigned int *max_vf)
{
  unsigned int i;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
  dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
  stmt_vec_info stmtinfo_a = dr_info_a->stmt;
  stmt_vec_info stmtinfo_b = dr_info_b->stmt;
  lambda_vector dist_v;
  unsigned int loop_depth;

  /* If user asserted safelen consecutive iterations can be
     executed concurrently, assume independence.  */
  auto apply_safelen = [&]()
    {
      if (loop->safelen >= 2)
        {
          if ((unsigned int) loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return true;
        }
      return false;
    };

  /* In loop analysis all data references should be vectorizable.  */
  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
    gcc_unreachable ();

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return opt_result::success ();

  if (dra == drb
      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
    return opt_result::success ();

  /* We do not have to consider dependences between accesses that belong
     to the same group, unless the stride could be smaller than the
     group size.  */
  if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
      && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
          == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
      && !STMT_VINFO_STRIDED_P (stmtinfo_a))
    return opt_result::success ();

  /* Even if we have an anti-dependence then, as the vectorized loop covers at
     least two scalar iterations, there is always also a true dependence.
     As the vectorizer does not re-order loads and stores we can ignore
     the anti-dependence if TBAA can disambiguate both DRs similar to the
     case with known negative distance anti-dependences (positive
     distance anti-dependences would violate TBAA constraints).  */
  if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
       || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
      && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
                                 get_alias_set (DR_REF (drb))))
    return opt_result::success ();

  if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
      || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
    {
      if (apply_safelen ())
        return opt_result::success ();

      return opt_result::failure_at
        (stmtinfo_a->stmt,
         "possible alias involving gather/scatter between %T and %T\n",
         DR_REF (dra), DR_REF (drb));
    }

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (apply_safelen ())
        return opt_result::success ();

      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
                         "versioning for alias required: "
                         "can't determine dependence between %T and %T\n",
                         DR_REF (dra), DR_REF (drb));

      /* Add to list of ddrs that need to be tested at run-time.  */
      return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
    {
      if (apply_safelen ())
        return opt_result::success ();

      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
                         "versioning for alias required: "
                         "bad dist vector for %T and %T\n",
                         DR_REF (dra), DR_REF (drb));
      /* Add to list of ddrs that need to be tested at run-time.  */
      return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));

  if (DDR_COULD_BE_INDEPENDENT_P (ddr)
      && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
                                                loop_depth, max_vf))
    return opt_result::success ();

  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "dependence distance = %d.\n", dist);

      if (dist == 0)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance == 0 between %T and %T\n",
                             DR_REF (dra), DR_REF (drb));

          /* When we perform grouped accesses and perform implicit CSE
             by detecting equal accesses and doing disambiguation with
             runtime alias tests like for
                .. = a[i];
                .. = a[i+1];
                a[i] = ..;
                a[i+1] = ..;
                *p = ..;
                .. = a[i];
                .. = a[i+1];
             where we will end up loading { a[i], a[i+1] } once, make
             sure that inserting group loads before the first load and
             stores after the last store will do the right thing.
             Similar for groups like
                a[i] = ...;
                ... = a[i];
                a[i+1] = ...;
             where loads from the group interleave with the store.  */
          if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
            return opt_result::failure_at (stmtinfo_a->stmt,
                                           "READ_WRITE dependence"
                                           " in interleaving.\n");

          if (loop->safelen < 2)
            {
              tree indicator = dr_zero_step_indicator (dra);
              if (!indicator || integer_zerop (indicator))
                return opt_result::failure_at (stmtinfo_a->stmt,
                                               "access also has a zero step\n");
              else if (TREE_CODE (indicator) != INTEGER_CST)
                vect_check_nonzero_value (loop_vinfo, indicator);
            }
          continue;
        }

      if (dist > 0 && DDR_REVERSED_P (ddr))
        {
          /* If DDR_REVERSED_P the order of the data-refs in DDR was
             reversed (to make distance vector positive), and the actual
             distance is negative.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance negative.\n");
          /* When doing outer loop vectorization, we need to check if there is
             a backward dependence at the inner loop level if the dependence
             at the outer loop is reversed.  See PR81740.  */
          if (nested_in_vect_loop_p (loop, stmtinfo_a)
              || nested_in_vect_loop_p (loop, stmtinfo_b))
            {
              unsigned inner_depth = index_in_loop_nest (loop->inner->num,
                                                         DDR_LOOP_NEST (ddr));
              if (dist_v[inner_depth] < 0)
                return opt_result::failure_at (stmtinfo_a->stmt,
                                               "not vectorized, dependence "
                                               "between data-refs %T and %T\n",
                                               DR_REF (dra), DR_REF (drb));
            }
          /* Record a negative dependence distance to later limit the
             amount of stmt copying / unrolling we can perform.
             Only need to handle read-after-write dependence.  */
          if (DR_IS_READ (drb)
              && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
                  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
            STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
          continue;
        }

      unsigned int abs_dist = abs (dist);
      if (abs_dist >= 2 && abs_dist < *max_vf)
        {
          /* The dependence distance requires reduction of the maximal
             vectorization factor.  */
          *max_vf = abs_dist;
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "adjusting maximal vectorization factor to %i\n",
                             *max_vf);
        }

      if (abs_dist >= *max_vf)
        {
          /* Dependence distance does not create dependence, as far as
             vectorization is concerned, in this case.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance >= VF.\n");
          continue;
        }

      return opt_result::failure_at (stmtinfo_a->stmt,
                                     "not vectorized, possible dependence "
                                     "between data-refs %T and %T\n",
                                     DR_REF (dra), DR_REF (drb));
    }

  return opt_result::success ();
}
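
/* Example (illustrative only): in

     for (i = 3; i < n; i++)
       a[i] = a[i - 3] + 1;

   the read-after-write dependence distance is 3, so the code above lowers
   *max_vf to 3; a distance greater than or equal to the maximum VF does not
   constrain vectorization at all, and a distance of 0 (as for
   a[i] = a[i] + 1) is harmless as long as the scalar order is preserved.  */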

/* Function vect_analyze_data_ref_dependences.

   Examine all the data references in the loop, and make sure there do not
   exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */

opt_result
vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
                                   unsigned int *max_vf)
{
  unsigned int i;
  struct data_dependence_relation *ddr;

  DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");

  if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
    {
      LOOP_VINFO_DDRS (loop_vinfo)
        .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
                 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
      /* We do not need read-read dependences.  */
      bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
                                          &LOOP_VINFO_DDRS (loop_vinfo),
                                          LOOP_VINFO_LOOP_NEST (loop_vinfo),
                                          false);
      gcc_assert (res);
    }

  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;

  /* For epilogues we either have no aliases or alias versioning
     was applied to original loop.  Therefore we may just get max_vf
     using VF of original loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
  else
    FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
      {
        opt_result res
          = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
        if (!res)
          return res;
      }

  return opt_result::success ();
}


/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB for VINFO.  When versioning for alias
   may check a dependence at run-time, return FALSE.  Adjust *MAX_VF
   according to the data dependence.  */

static bool
vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
                                      struct data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
  dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);

  /* We need to check dependences of statements marked as unvectorizable
     as well, they still can prohibit vectorization.  */

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb)
    return false;

  /* Read-read is OK.  */
  if (DR_IS_READ (dra) && DR_IS_READ (drb))
    return false;

  /* If dra and drb are part of the same interleaving chain consider
     them independent.  */
  if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
      && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
          == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "can't determine dependence between %T and %T\n",
                         DR_REF (dra), DR_REF (drb));
    }
  else if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "determined dependence between %T and %T\n",
                     DR_REF (dra), DR_REF (drb));

  return true;
}


/* Analyze dependences involved in the transform of a store SLP NODE.  */

static bool
vect_slp_analyze_store_dependences (vec_info *vinfo, slp_tree node)
{
  /* This walks over all stmts involved in the SLP store done
     in NODE verifying we can sink them up to the last stmt in the
     group.  */
  stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
  gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info)));

  for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
    {
      stmt_vec_info access_info
        = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
      if (access_info == last_access_info)
        continue;
      data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
      ao_ref ref;
      bool ref_initialized_p = false;
      for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
           gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
        {
          gimple *stmt = gsi_stmt (gsi);
          if (! gimple_vuse (stmt))
            continue;

          /* If we couldn't record a (single) data reference for this
             stmt we have to resort to the alias oracle.  */
          stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
          data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
          if (!dr_b)
            {
              /* We are moving a store - this means
                 we cannot use TBAA for disambiguation.  */
              if (!ref_initialized_p)
                ao_ref_init (&ref, DR_REF (dr_a));
              if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
                  || ref_maybe_used_by_stmt_p (stmt, &ref, false))
                return false;
              continue;
            }

          gcc_assert (!gimple_visited_p (stmt));

          ddr_p ddr = initialize_data_dependence_relation (dr_a,
                                                           dr_b, vNULL);
          bool dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
          free_dependence_relation (ddr);
          if (dependent)
            return false;
        }
    }
  return true;
}
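
/* For example (illustrative only): for an SLP store group

     a[0] = x;
     *p = 1;      <-- intervening store
     a[1] = y;

   the vectorized store of { x, y } is emitted at the position of a[1], so
   the walk above has to prove that a[0] = x may be moved past *p = 1,
   i.e. that p cannot point at a[0].  */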

/* Analyze dependences involved in the transform of a load SLP NODE.  STORES
   contain the vector of scalar stores of this instance if we are
   disambiguating the loads.  */

static bool
vect_slp_analyze_load_dependences (vec_info *vinfo, slp_tree node,
                                   vec<stmt_vec_info> stores,
                                   stmt_vec_info last_store_info)
{
  /* This walks over all stmts involved in the SLP load done
     in NODE verifying we can hoist them up to the first stmt in the
     group.  */
  stmt_vec_info first_access_info = vect_find_first_scalar_stmt_in_slp (node);
  gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info)));

  for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
    {
      stmt_vec_info access_info
        = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
      if (access_info == first_access_info)
        continue;
      data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
      ao_ref ref;
      bool ref_initialized_p = false;
      hash_set<stmt_vec_info> grp_visited;
      for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
           gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
        {
          gimple *stmt = gsi_stmt (gsi);
          if (! gimple_vdef (stmt))
            continue;

          stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);

          /* If we run into a store of this same instance (we've just
             marked those) then delay dependence checking until we run
             into the last store because this is where it will have
             been sunk to (and we verified that we can do that already).  */
          if (gimple_visited_p (stmt))
            {
              if (stmt_info != last_store_info)
                continue;

              for (stmt_vec_info &store_info : stores)
                {
                  data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
                  ddr_p ddr = initialize_data_dependence_relation
                                (dr_a, store_dr, vNULL);
                  bool dependent
                    = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
                  free_dependence_relation (ddr);
                  if (dependent)
                    return false;
                }
              continue;
            }

          auto check_hoist = [&] (stmt_vec_info stmt_info) -> bool
            {
              /* We are hoisting a load - this means we can use TBAA for
                 disambiguation.  */
              if (!ref_initialized_p)
                ao_ref_init (&ref, DR_REF (dr_a));
              if (stmt_may_clobber_ref_p_1 (stmt_info->stmt, &ref, true))
                {
                  /* If we couldn't record a (single) data reference for this
                     stmt we have to give up now.  */
                  data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
                  if (!dr_b)
                    return false;
                  ddr_p ddr = initialize_data_dependence_relation (dr_a,
                                                                   dr_b, vNULL);
                  bool dependent
                    = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
                  free_dependence_relation (ddr);
                  if (dependent)
                    return false;
                }
              /* No dependence.  */
              return true;
            };
          if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
            {
              /* When we run into a store group we have to honor
                 that earlier stores might be moved here.  We don't
                 know exactly which and where to since we lack a
                 back-mapping from DR to SLP node, so assume all
                 earlier stores are sunk here.  It's enough to
                 consider the last stmt of a group for this.
                 ??? Both this and the fact that we disregard that
                 the conflicting instance might be removed later
                 is overly conservative.  */
              if (!grp_visited.add (DR_GROUP_FIRST_ELEMENT (stmt_info)))
                for (auto store_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
                     store_info != NULL;
                     store_info = DR_GROUP_NEXT_ELEMENT (store_info))
                  if ((store_info == stmt_info
                       || get_later_stmt (store_info, stmt_info) == stmt_info)
                      && !check_hoist (store_info))
                    return false;
            }
          else
            {
              if (!check_hoist (stmt_info))
                return false;
            }
        }
    }
  return true;
}


/* Function vect_slp_analyze_instance_dependence.

   Examine the data references of SLP instance INSTANCE and make sure that
   the transform (sinking the stores to the last store and hoisting the
   loads to the first load of their respective groups) does not violate
   any dependence.  */

bool
vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
{
  DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");

  /* The stores of this instance are at the root of the SLP tree.  */
  slp_tree store = NULL;
  if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
    store = SLP_INSTANCE_TREE (instance);

  /* Verify we can sink stores to the vectorized stmt insert location.  */
  stmt_vec_info last_store_info = NULL;
  if (store)
    {
      if (! vect_slp_analyze_store_dependences (vinfo, store))
        return false;

      /* Mark stores in this instance and remember the last one.  */
      last_store_info = vect_find_last_scalar_stmt_in_slp (store);
      for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
        gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
    }

  bool res = true;

  /* Verify we can sink loads to the vectorized stmt insert location,
     special-casing stores of this instance.  */
  for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
    if (! vect_slp_analyze_load_dependences (vinfo, load,
                                             store
                                             ? SLP_TREE_SCALAR_STMTS (store)
                                             : vNULL, last_store_info))
      {
        res = false;
        break;
      }

  /* Unset the visited flag.  */
  if (store)
    for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
      gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);

  return res;
}

/* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
   applied.  */

int
dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
{
  HOST_WIDE_INT diff = 0;
  /* Alignment is only analyzed for the first element of a DR group,
     use that but adjust misalignment by the offset of the access.  */
  if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
    {
      dr_vec_info *first_dr
        = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
      /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
         INTEGER_CSTs and the first element in the group has the lowest
         address.  */
      diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
              - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
      gcc_assert (diff >= 0);
      dr_info = first_dr;
    }

  int misalign = dr_info->misalignment;
  gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
  if (misalign == DR_MISALIGNMENT_UNKNOWN)
    return misalign;

  /* If the access is only aligned for a vector type with smaller alignment
     requirement the access has unknown misalignment.  */
  if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
                targetm.vectorize.preferred_vector_alignment (vectype)))
    return DR_MISALIGNMENT_UNKNOWN;

  /* Apply the offset from the DR group start and the externally supplied
     offset which can for example result from a negative stride access.  */
  poly_int64 misalignment = misalign + diff + offset;

  /* vect_compute_data_ref_alignment will have ensured that target_alignment
     is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN.  */
  unsigned HOST_WIDE_INT target_alignment_c
    = dr_info->target_alignment.to_constant ();
  if (!known_misalignment (misalignment, target_alignment_c, &misalign))
    return DR_MISALIGNMENT_UNKNOWN;
  return misalign;
}
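
/* Worked example (illustrative only): with a target alignment of 16 bytes,
   a group leader whose recorded misalignment is 4 and an access 8 bytes
   further into the group, a caller-supplied OFFSET of 0 yields
   (4 + 8 + 0) mod 16 = 12 as the returned misalignment.  */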

/* Record the base alignment guarantee given by DRB, which occurs
   in STMT_INFO.  */

static void
vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
                            innermost_loop_behavior *drb)
{
  bool existed;
  std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
    = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
  if (!existed || entry.second->base_alignment < drb->base_alignment)
    {
      entry = std::make_pair (stmt_info, drb);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "recording new base alignment for %T\n"
                         "  alignment:    %d\n"
                         "  misalignment: %d\n"
                         "  based on:     %G",
                         drb->base_address,
                         drb->base_alignment,
                         drb->base_misalignment,
                         stmt_info->stmt);
    }
}

/* If the region we're going to vectorize is reached, all unconditional
   data references occur at least once.  We can therefore pool the base
   alignment guarantees from each unconditional reference.  Do this by
   going through all the data references in VINFO and checking whether
   the containing statement makes the reference unconditionally.  If so,
   record the alignment of the base address in VINFO so that it can be
   used for all other references with the same base.  */

void
vect_record_base_alignments (vec_info *vinfo)
{
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
  for (data_reference *dr : vinfo->shared->datarefs)
    {
      dr_vec_info *dr_info = vinfo->lookup_dr (dr);
      stmt_vec_info stmt_info = dr_info->stmt;
      if (!DR_IS_CONDITIONAL_IN_STMT (dr)
          && STMT_VINFO_VECTORIZABLE (stmt_info)
          && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
        {
          vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));

          /* If DR is nested in the loop that is being vectorized, we can also
             record the alignment of the base wrt the outer loop.  */
          if (loop && nested_in_vect_loop_p (loop, stmt_info))
            vect_record_base_alignment
              (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
        }
    }
}

/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR_INFO when vectorizing
   with VECTYPE.

   Output:
   1. initialized misalignment info for DR_INFO

   FOR NOW: No analysis is actually performed.  Misalignment is calculated
   only for trivial cases.  TODO.  */

static void
vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
                                 tree vectype)
{
  stmt_vec_info stmt_info = dr_info->stmt;
  vec_base_alignments *base_alignments = &vinfo->base_alignments;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  class loop *loop = NULL;
  tree ref = DR_REF (dr_info->dr);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_compute_data_ref_alignment:\n");

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.  */
  SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);

  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
    return;

  innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
  bool step_preserves_misalignment_p;

  poly_uint64 vector_alignment
    = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
                 BITS_PER_UNIT);
  SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);

  /* If the main loop has peeled for alignment we have no way of knowing
     whether the data accesses in the epilogues are aligned.  We can't at
     compile time answer the question whether we have entered the main loop or
     not.  Fixes PR 92351.  */
  if (loop_vinfo)
    {
      loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
      if (orig_loop_vinfo
          && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
        return;
    }

  unsigned HOST_WIDE_INT vect_align_c;
  if (!vector_alignment.is_constant (&vect_align_c))
    return;

  /* No step for BB vectorization.  */
  if (!loop)
    {
      gcc_assert (integer_zerop (drb->step));
      step_preserves_misalignment_p = true;
    }

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop evenly
     divides by the vector alignment.  */
  else if (nested_in_vect_loop_p (loop, stmt_info))
    {
      step_preserves_misalignment_p
        = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;

      if (dump_enabled_p ())
        {
          if (step_preserves_misalignment_p)
            dump_printf_loc (MSG_NOTE, vect_location,
                             "inner step divides the vector alignment.\n");
          else
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "inner step doesn't divide the vector"
                             " alignment.\n");
        }
    }

  /* Similarly we can only use base and misalignment information relative to
     an innermost loop if the misalignment stays the same throughout the
     execution of the loop.  As above, this is the case if the stride of
     the dataref evenly divides by the alignment.  */
  else
    {
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      step_preserves_misalignment_p
        = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);

      if (!step_preserves_misalignment_p && dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "step doesn't divide the vector alignment.\n");
    }

  unsigned int base_alignment = drb->base_alignment;
  unsigned int base_misalignment = drb->base_misalignment;

  /* Calculate the maximum of the pooled base address alignment and the
     alignment that we can compute for DR itself.  */
  std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
    = base_alignments->get (drb->base_address);
  if (entry
      && base_alignment < (*entry).second->base_alignment
      && (loop_vinfo
          || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
                              gimple_bb (entry->first->stmt))
              && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
                  || (entry->first->dr_aux.group <= dr_info->group)))))
    {
      base_alignment = entry->second->base_alignment;
      base_misalignment = entry->second->base_misalignment;
    }

  if (drb->offset_alignment < vect_align_c
      || !step_preserves_misalignment_p
      /* We need to know whether the step wrt the vectorized loop is
         negative when computing the starting misalignment below.  */
      || TREE_CODE (drb->step) != INTEGER_CST)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Unknown alignment for access: %T\n", ref);
      return;
    }

  if (base_alignment < vect_align_c)
    {
      unsigned int max_alignment;
      tree base = get_base_for_alignment (drb->base_address, &max_alignment);
      if (max_alignment < vect_align_c
          || !vect_can_force_dr_alignment_p (base,
                                             vect_align_c * BITS_PER_UNIT))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "can't force alignment of ref: %T\n", ref);
          return;
        }

      /* Force the alignment of the decl.
         NOTE: This is the only change to the code we make during
         the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "force alignment of %T\n", ref);

      dr_info->base_decl = base;
      dr_info->base_misaligned = true;
      base_misalignment = 0;
    }
  poly_int64 misalignment
    = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();

  unsigned int const_misalignment;
  if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Non-constant misalignment for access: %T\n", ref);
      return;
    }

  SET_DR_MISALIGNMENT (dr_info, const_misalignment);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                     "misalign = %d bytes of ref %T\n",
                     const_misalignment, ref);

  return;
}

/* Return whether DR_INFO, which is related to DR_PEEL_INFO in
   that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
   is made aligned via peeling.  */

static bool
vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
                                         dr_vec_info *dr_peel_info)
{
  if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
                  DR_TARGET_ALIGNMENT (dr_info)))
    {
      poly_offset_int diff
        = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
           - wi::to_poly_offset (DR_INIT (dr_info->dr)));
      if (known_eq (diff, 0)
          || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
        return true;
    }
  return false;
}

/* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
   aligned via peeling.  */

static bool
vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
                                 dr_vec_info *dr_peel_info)
{
  if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
                        DR_BASE_ADDRESS (dr_peel_info->dr), 0)
      || !operand_equal_p (DR_OFFSET (dr_info->dr),
                           DR_OFFSET (dr_peel_info->dr), 0)
      || !operand_equal_p (DR_STEP (dr_info->dr),
                           DR_STEP (dr_peel_info->dr), 0))
    return false;

  return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
}

/* Compute the value for dr_info->misalign so that the access appears
   aligned.  This is used by peeling to compensate for dr_misalignment
   applying the offset for negative step.  */

int
vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
{
  if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
    return 0;

  tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
  poly_int64 misalignment
    = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
       * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));

  unsigned HOST_WIDE_INT target_alignment_c;
  int misalign;
  if (!dr_info->target_alignment.is_constant (&target_alignment_c)
      || !known_misalignment (misalignment, target_alignment_c, &misalign))
    return DR_MISALIGNMENT_UNKNOWN;
  return misalign;
}
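
/* For instance (illustrative only): with V4SI vectors (4 lanes of 4 bytes)
   and a 16-byte target alignment, the value computed above is
   (4 - 1) * 4 = 12; recording that as the misalignment of a negative-step
   access makes dr_misalignment, which applies the negative-step offset,
   report the access as aligned.  */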

/* Function vect_update_misalignment_for_peel.
   Sets DR_INFO's misalignment
   - to 0 if it has the same alignment as DR_PEEL_INFO,
   - to the misalignment computed using NPEEL if DR_INFO's misalignment is
     known,
   - to -1 (unknown) otherwise.

   DR_INFO - the data reference whose misalignment is to be adjusted.
   DR_PEEL_INFO - the data reference whose misalignment is being made
                  zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
           of DR_PEEL_INFO is known at compile time.  */

static void
vect_update_misalignment_for_peel (dr_vec_info *dr_info,
                                   dr_vec_info *dr_peel_info, int npeel)
{
  /* If dr_info is aligned whenever dr_peel_info is, then mark it so.  */
  if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
    {
      SET_DR_MISALIGNMENT (dr_info,
                           vect_dr_misalign_for_aligned_access (dr_peel_info));
      return;
    }

  unsigned HOST_WIDE_INT alignment;
  if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
      && known_alignment_for_access_p (dr_info,
                                       STMT_VINFO_VECTYPE (dr_info->stmt))
      && known_alignment_for_access_p (dr_peel_info,
                                       STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
    {
      int misal = dr_info->misalignment;
      misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
      misal &= alignment - 1;
      set_dr_misalignment (dr_info, misal);
      return;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment "
                     "to unknown (-1).\n");
  SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
}
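
/* Numeric example (illustrative only): with a 16-byte target alignment, a
   4-byte step, a peel count NPEEL of 2 and a starting misalignment of 12,
   the updated misalignment is (12 + 2 * 4) & 15 = 4.  */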

/* Return true if alignment is relevant for DR_INFO.  */

static bool
vect_relevant_for_alignment_p (dr_vec_info *dr_info)
{
  stmt_vec_info stmt_info = dr_info->stmt;

  if (!STMT_VINFO_RELEVANT_P (stmt_info))
    return false;

  /* For interleaving, only the alignment of the first access matters.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
    return false;

  /* Scatter-gather and invariant accesses continue to address individual
     scalars, so vector-level alignment is irrelevant.  */
  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
      || integer_zerop (DR_STEP (dr_info->dr)))
    return false;

  /* Strided accesses perform only component accesses, alignment is
     irrelevant for them.  */
  if (STMT_VINFO_STRIDED_P (stmt_info)
      && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
    return false;

  return true;
}
1315 1.1 mrg
1316 1.1 mrg /* Given an memory reference EXP return whether its alignment is less
1317 1.1 mrg than its size. */
1318 1.1 mrg
1319 1.1 mrg static bool
1320 1.1 mrg not_size_aligned (tree exp)
1321 1.1 mrg {
1322 1.1 mrg if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1323 1.1 mrg return true;
1324 1.1 mrg
1325 1.1 mrg return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1326 1.1 mrg > get_object_alignment (exp));
1327 1.1 mrg }
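
/* Hypothetical stand-alone illustration of the test above (not part of the
   vectorizer): a reference is "size aligned" when its known object
   alignment, in bits, is at least the bit size of its type; e.g. a 32-bit
   int known to be only 16-bit aligned is not size aligned.  */

static bool
example_not_size_aligned (unsigned type_size_bits, unsigned object_align_bits)
{
  return type_size_bits > object_align_bits;
}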
1328 1.1 mrg
1329 1.1 mrg /* Function vector_alignment_reachable_p
1330 1.1 mrg
1331 1.1 mrg Return true if vector alignment for DR_INFO is reachable by peeling
1332 1.1 mrg a few loop iterations. Return false otherwise. */
1333 1.1 mrg
1334 1.1 mrg static bool
1335 1.1 mrg vector_alignment_reachable_p (dr_vec_info *dr_info)
1336 1.1 mrg {
1337 1.1 mrg stmt_vec_info stmt_info = dr_info->stmt;
1338 1.1 mrg tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1339 1.1 mrg
1340 1.1 mrg if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1341 1.1 mrg {
      /* For an interleaved access we peel only if the number of iterations
	 in the prolog loop (VF - misalignment) is a multiple of the number
	 of interleaved accesses (the group size).  */
1345 1.1 mrg int elem_size, mis_in_elements;
1346 1.1 mrg
1347 1.1 mrg /* FORNOW: handle only known alignment. */
1348 1.1 mrg if (!known_alignment_for_access_p (dr_info, vectype))
1349 1.1 mrg return false;
1350 1.1 mrg
1351 1.1 mrg poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1352 1.1 mrg poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1353 1.1 mrg elem_size = vector_element_size (vector_size, nelements);
1354 1.1 mrg mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1355 1.1 mrg
1356 1.1 mrg if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1357 1.1 mrg return false;
1358 1.1 mrg }
1359 1.1 mrg
  /* If the misalignment is known at compile time then allow peeling
1361 1.1 mrg only if natural alignment is reachable through peeling. */
1362 1.1 mrg if (known_alignment_for_access_p (dr_info, vectype)
1363 1.1 mrg && !aligned_access_p (dr_info, vectype))
1364 1.1 mrg {
1365 1.1 mrg HOST_WIDE_INT elmsize =
1366 1.1 mrg int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1367 1.1 mrg if (dump_enabled_p ())
1368 1.1 mrg {
1369 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
1370 1.1 mrg "data size = %wd. misalignment = %d.\n", elmsize,
1371 1.1 mrg dr_misalignment (dr_info, vectype));
1372 1.1 mrg }
1373 1.1 mrg if (dr_misalignment (dr_info, vectype) % elmsize)
1374 1.1 mrg {
1375 1.1 mrg if (dump_enabled_p ())
1376 1.1 mrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1377 1.1 mrg "data size does not divide the misalignment.\n");
1378 1.1 mrg return false;
1379 1.1 mrg }
1380 1.1 mrg }
1381 1.1 mrg
1382 1.1 mrg if (!known_alignment_for_access_p (dr_info, vectype))
1383 1.1 mrg {
1384 1.1 mrg tree type = TREE_TYPE (DR_REF (dr_info->dr));
1385 1.1 mrg bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1386 1.1 mrg if (dump_enabled_p ())
1387 1.1 mrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1388 1.1 mrg "Unknown misalignment, %snaturally aligned\n",
1389 1.1 mrg is_packed ? "not " : "");
1390 1.1 mrg return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1391 1.1 mrg }
1392 1.1 mrg
1393 1.1 mrg return true;
1394 1.1 mrg }
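
/* Hypothetical stand-alone sketch of the reachability checks above; every
   quantity is a plain compile-time constant here, whereas the real code
   also copes with poly_ints and unknown misalignments.  GROUP_SIZE == 1
   stands for a non-grouped access.  */

static bool
example_alignment_reachable_p (unsigned nelements, unsigned group_size,
			       unsigned elem_size_bytes,
			       unsigned misalign_bytes)
{
  /* Peeling advances the access one element per scalar iteration, so a
     misalignment that is not a whole number of elements can never be
     peeled away.  */
  if (misalign_bytes % elem_size_bytes != 0)
    return false;

  /* For an interleaved group the prolog executes whole group instances,
     so the number of peeled elements must be divisible by the group
     size.  */
  unsigned mis_in_elements = misalign_bytes / elem_size_bytes;
  if (group_size > 1
      && (nelements - mis_in_elements) % group_size != 0)
    return false;

  return true;
}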
1395 1.1 mrg
1396 1.1 mrg
1397 1.1 mrg /* Calculate the cost of the memory access represented by DR_INFO. */
1398 1.1 mrg
1399 1.1 mrg static void
1400 1.1 mrg vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1401 1.1 mrg dr_alignment_support alignment_support_scheme,
1402 1.1 mrg int misalignment,
1403 1.1 mrg unsigned int *inside_cost,
1404 1.1 mrg unsigned int *outside_cost,
1405 1.1 mrg stmt_vector_for_cost *body_cost_vec,
1406 1.1 mrg stmt_vector_for_cost *prologue_cost_vec)
1407 1.1 mrg {
1408 1.1 mrg stmt_vec_info stmt_info = dr_info->stmt;
1409 1.1 mrg loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1410 1.1 mrg int ncopies;
1411 1.1 mrg
1412 1.1 mrg if (PURE_SLP_STMT (stmt_info))
1413 1.1 mrg ncopies = 1;
1414 1.1 mrg else
1415 1.1 mrg ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1416 1.1 mrg
1417 1.1 mrg if (DR_IS_READ (dr_info->dr))
1418 1.1 mrg vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1419 1.1 mrg misalignment, true, inside_cost,
1420 1.1 mrg outside_cost, prologue_cost_vec, body_cost_vec, false);
1421 1.1 mrg else
    vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1423 1.1 mrg misalignment, inside_cost, body_cost_vec);
1424 1.1 mrg
1425 1.1 mrg if (dump_enabled_p ())
1426 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
1427 1.1 mrg "vect_get_data_access_cost: inside_cost = %d, "
1428 1.1 mrg "outside_cost = %d.\n", *inside_cost, *outside_cost);
1429 1.1 mrg }
1430 1.1 mrg
1431 1.1 mrg
1432 1.1 mrg typedef struct _vect_peel_info
1433 1.1 mrg {
1434 1.1 mrg dr_vec_info *dr_info;
1435 1.1 mrg int npeel;
1436 1.1 mrg unsigned int count;
1437 1.1 mrg } *vect_peel_info;
1438 1.1 mrg
1439 1.1 mrg typedef struct _vect_peel_extended_info
1440 1.1 mrg {
1441 1.1 mrg vec_info *vinfo;
1442 1.1 mrg struct _vect_peel_info peel_info;
1443 1.1 mrg unsigned int inside_cost;
1444 1.1 mrg unsigned int outside_cost;
1445 1.1 mrg } *vect_peel_extended_info;
1446 1.1 mrg
1447 1.1 mrg
1448 1.1 mrg /* Peeling hashtable helpers. */
1449 1.1 mrg
1450 1.1 mrg struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1451 1.1 mrg {
1452 1.1 mrg static inline hashval_t hash (const _vect_peel_info *);
1453 1.1 mrg static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1454 1.1 mrg };
1455 1.1 mrg
1456 1.1 mrg inline hashval_t
1457 1.1 mrg peel_info_hasher::hash (const _vect_peel_info *peel_info)
1458 1.1 mrg {
1459 1.1 mrg return (hashval_t) peel_info->npeel;
1460 1.1 mrg }
1461 1.1 mrg
1462 1.1 mrg inline bool
1463 1.1 mrg peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1464 1.1 mrg {
1465 1.1 mrg return (a->npeel == b->npeel);
1466 1.1 mrg }
1467 1.1 mrg
1468 1.1 mrg
1469 1.1 mrg /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1470 1.1 mrg
1471 1.1 mrg static void
1472 1.1 mrg vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1473 1.1 mrg loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1474 1.1 mrg int npeel, bool supportable_if_not_aligned)
1475 1.1 mrg {
1476 1.1 mrg struct _vect_peel_info elem, *slot;
1477 1.1 mrg _vect_peel_info **new_slot;
1478 1.1 mrg
1479 1.1 mrg elem.npeel = npeel;
1480 1.1 mrg slot = peeling_htab->find (&elem);
1481 1.1 mrg if (slot)
1482 1.1 mrg slot->count++;
1483 1.1 mrg else
1484 1.1 mrg {
1485 1.1 mrg slot = XNEW (struct _vect_peel_info);
1486 1.1 mrg slot->npeel = npeel;
1487 1.1 mrg slot->dr_info = dr_info;
1488 1.1 mrg slot->count = 1;
1489 1.1 mrg new_slot = peeling_htab->find_slot (slot, INSERT);
1490 1.1 mrg *new_slot = slot;
1491 1.1 mrg }
1492 1.1 mrg
  /* If this DR is not supported with an unknown misalignment then heavily
     bias this slot when the cost model is disabled, so that the
     most-frequent heuristic prefers peel amounts that align it.  */
1495 1.1 mrg if (!supportable_if_not_aligned
1496 1.1 mrg && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1497 1.1 mrg slot->count += VECT_MAX_COST;
1498 1.1 mrg }
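
/* Hypothetical stand-alone model (not meant to be compiled into this
   translation unit) of the bookkeeping above: entries keyed by the peel
   amount count how many data references that peel would align, and an
   access that is unsupportable when misaligned gets a huge bias so the
   "most frequent" heuristic effectively always prefers a peel that aligns
   it.  EXAMPLE_MAX_COST plays the role of VECT_MAX_COST; the container is
   a plain std::unordered_map instead of GCC's hash_table.  */

#include <unordered_map>

struct example_peel_entry { unsigned count = 0; };
static const unsigned EXAMPLE_MAX_COST = 1000;

static void
example_peeling_insert (std::unordered_map<int, example_peel_entry> &htab,
			int npeel, bool supportable_if_not_aligned,
			bool unlimited_cost_model)
{
  /* operator[] creates the slot with count == 0 if it is missing, so a
     fresh entry ends up with count == 1 just like the real code.  */
  example_peel_entry &e = htab[npeel];
  e.count++;

  /* Bias peel amounts that rescue an otherwise unsupportable access when
     only the frequency heuristic will be used.  */
  if (!supportable_if_not_aligned && unlimited_cost_model)
    e.count += EXAMPLE_MAX_COST;
}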
1499 1.1 mrg
1500 1.1 mrg
/* Traverse the peeling hash table to find the peeling option that aligns the
   maximum number of data accesses.  */
1503 1.1 mrg
1504 1.1 mrg int
1505 1.1 mrg vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1506 1.1 mrg _vect_peel_extended_info *max)
1507 1.1 mrg {
1508 1.1 mrg vect_peel_info elem = *slot;
1509 1.1 mrg
1510 1.1 mrg if (elem->count > max->peel_info.count
1511 1.1 mrg || (elem->count == max->peel_info.count
1512 1.1 mrg && max->peel_info.npeel > elem->npeel))
1513 1.1 mrg {
1514 1.1 mrg max->peel_info.npeel = elem->npeel;
1515 1.1 mrg max->peel_info.count = elem->count;
1516 1.1 mrg max->peel_info.dr_info = elem->dr_info;
1517 1.1 mrg }
1518 1.1 mrg
1519 1.1 mrg return 1;
1520 1.1 mrg }
1521 1.1 mrg
/* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
   data access costs for all data refs.  DR0_INFO, if not NULL, is the
   data reference the peel is chosen to align; it and all data refs with
   the same alignment are costed as aligned after the peel, even when
   NPEEL itself is only known at runtime.  */
1526 1.1 mrg
1527 1.1 mrg static void
1528 1.1 mrg vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1529 1.1 mrg dr_vec_info *dr0_info,
1530 1.1 mrg unsigned int *inside_cost,
1531 1.1 mrg unsigned int *outside_cost,
1532 1.1 mrg stmt_vector_for_cost *body_cost_vec,
1533 1.1 mrg stmt_vector_for_cost *prologue_cost_vec,
1534 1.1 mrg unsigned int npeel)
1535 1.1 mrg {
1536 1.1 mrg vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1537 1.1 mrg
1538 1.1 mrg bool dr0_alignment_known_p
1539 1.1 mrg = (dr0_info
1540 1.1 mrg && known_alignment_for_access_p (dr0_info,
1541 1.1 mrg STMT_VINFO_VECTYPE (dr0_info->stmt)));
1542 1.1 mrg
1543 1.1 mrg for (data_reference *dr : datarefs)
1544 1.1 mrg {
1545 1.1 mrg dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1546 1.1 mrg if (!vect_relevant_for_alignment_p (dr_info))
1547 1.1 mrg continue;
1548 1.1 mrg
1549 1.1 mrg tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1550 1.1 mrg dr_alignment_support alignment_support_scheme;
1551 1.1 mrg int misalignment;
1552 1.1 mrg unsigned HOST_WIDE_INT alignment;
1553 1.1 mrg
1554 1.1 mrg bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1555 1.1 mrg size_zero_node) < 0;
1556 1.1 mrg poly_int64 off = 0;
1557 1.1 mrg if (negative)
1558 1.1 mrg off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1559 1.1 mrg * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1560 1.1 mrg
1561 1.1 mrg if (npeel == 0)
1562 1.1 mrg misalignment = dr_misalignment (dr_info, vectype, off);
1563 1.1 mrg else if (dr_info == dr0_info
1564 1.1 mrg || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1565 1.1 mrg misalignment = 0;
1566 1.1 mrg else if (!dr0_alignment_known_p
1567 1.1 mrg || !known_alignment_for_access_p (dr_info, vectype)
1568 1.1 mrg || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1569 1.1 mrg misalignment = DR_MISALIGNMENT_UNKNOWN;
1570 1.1 mrg else
1571 1.1 mrg {
1572 1.1 mrg misalignment = dr_misalignment (dr_info, vectype, off);
1573 1.1 mrg misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1574 1.1 mrg misalignment &= alignment - 1;
1575 1.1 mrg }
1576 1.1 mrg alignment_support_scheme
1577 1.1 mrg = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1578 1.1 mrg misalignment);
1579 1.1 mrg
1580 1.1 mrg vect_get_data_access_cost (loop_vinfo, dr_info,
1581 1.1 mrg alignment_support_scheme, misalignment,
1582 1.1 mrg inside_cost, outside_cost,
1583 1.1 mrg body_cost_vec, prologue_cost_vec);
1584 1.1 mrg }
1585 1.1 mrg }
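
/* Hypothetical stand-alone sketch of the OFF adjustment above: with a
   negative step the vector access covers the NUNITS elements at and below
   the current scalar address, so the misalignment that matters is the one
   of the lowest-addressed element, NUNITS - 1 elements further down.  */

static long
example_negative_step_offset (unsigned nunits, unsigned elem_size_bytes)
{
  return -(long) ((nunits - 1) * elem_size_bytes);
}

/* E.g. for a V4SI access (4 elements of 4 bytes) the query point is moved
   by -12 bytes: example_negative_step_offset (4, 4) == -12.  */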
1586 1.1 mrg
1587 1.1 mrg /* Traverse peeling hash table and calculate cost for each peeling option.
1588 1.1 mrg Find the one with the lowest cost. */
1589 1.1 mrg
1590 1.1 mrg int
1591 1.1 mrg vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1592 1.1 mrg _vect_peel_extended_info *min)
1593 1.1 mrg {
1594 1.1 mrg vect_peel_info elem = *slot;
1595 1.1 mrg int dummy;
1596 1.1 mrg unsigned int inside_cost = 0, outside_cost = 0;
1597 1.1 mrg loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1598 1.1 mrg stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1599 1.1 mrg epilogue_cost_vec;
1600 1.1 mrg
1601 1.1 mrg prologue_cost_vec.create (2);
1602 1.1 mrg body_cost_vec.create (2);
1603 1.1 mrg epilogue_cost_vec.create (2);
1604 1.1 mrg
1605 1.1 mrg vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1606 1.1 mrg &outside_cost, &body_cost_vec,
1607 1.1 mrg &prologue_cost_vec, elem->npeel);
1608 1.1 mrg
1609 1.1 mrg body_cost_vec.release ();
1610 1.1 mrg
1611 1.1 mrg outside_cost += vect_get_known_peeling_cost
1612 1.1 mrg (loop_vinfo, elem->npeel, &dummy,
1613 1.1 mrg &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1614 1.1 mrg &prologue_cost_vec, &epilogue_cost_vec);
1615 1.1 mrg
1616 1.1 mrg /* Prologue and epilogue costs are added to the target model later.
1617 1.1 mrg These costs depend only on the scalar iteration cost, the
1618 1.1 mrg number of peeling iterations finally chosen, and the number of
1619 1.1 mrg misaligned statements. So discard the information found here. */
1620 1.1 mrg prologue_cost_vec.release ();
1621 1.1 mrg epilogue_cost_vec.release ();
1622 1.1 mrg
1623 1.1 mrg if (inside_cost < min->inside_cost
1624 1.1 mrg || (inside_cost == min->inside_cost
1625 1.1 mrg && outside_cost < min->outside_cost))
1626 1.1 mrg {
1627 1.1 mrg min->inside_cost = inside_cost;
1628 1.1 mrg min->outside_cost = outside_cost;
1629 1.1 mrg min->peel_info.dr_info = elem->dr_info;
1630 1.1 mrg min->peel_info.npeel = elem->npeel;
1631 1.1 mrg min->peel_info.count = elem->count;
1632 1.1 mrg }
1633 1.1 mrg
1634 1.1 mrg return 1;
1635 1.1 mrg }
1636 1.1 mrg
1637 1.1 mrg
1638 1.1 mrg /* Choose best peeling option by traversing peeling hash table and either
1639 1.1 mrg choosing an option with the lowest cost (if cost model is enabled) or the
1640 1.1 mrg option that aligns as many accesses as possible. */
1641 1.1 mrg
1642 1.1 mrg static struct _vect_peel_extended_info
1643 1.1 mrg vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1644 1.1 mrg loop_vec_info loop_vinfo)
1645 1.1 mrg {
1646 1.1 mrg struct _vect_peel_extended_info res;
1647 1.1 mrg
1648 1.1 mrg res.peel_info.dr_info = NULL;
1649 1.1 mrg res.vinfo = loop_vinfo;
1650 1.1 mrg
1651 1.1 mrg if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1652 1.1 mrg {
1653 1.1 mrg res.inside_cost = INT_MAX;
1654 1.1 mrg res.outside_cost = INT_MAX;
1655 1.1 mrg peeling_htab->traverse <_vect_peel_extended_info *,
1656 1.1 mrg vect_peeling_hash_get_lowest_cost> (&res);
1657 1.1 mrg }
1658 1.1 mrg else
1659 1.1 mrg {
1660 1.1 mrg res.peel_info.count = 0;
1661 1.1 mrg peeling_htab->traverse <_vect_peel_extended_info *,
1662 1.1 mrg vect_peeling_hash_get_most_frequent> (&res);
1663 1.1 mrg res.inside_cost = 0;
1664 1.1 mrg res.outside_cost = 0;
1665 1.1 mrg }
1666 1.1 mrg
1667 1.1 mrg return res;
1668 1.1 mrg }
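
/* Hypothetical stand-alone sketch of the two selection policies above.
   With the cost model enabled an option wins if it has a strictly lower
   inside cost, or the same inside cost and a lower outside cost; with the
   cost model disabled the option aligning the most accesses wins, smaller
   peel amounts breaking ties.  */

struct example_peel_option
{
  unsigned inside_cost, outside_cost, count;
  int npeel;
};

static bool
example_better_by_cost (const example_peel_option &a,
			const example_peel_option &b)
{
  return (a.inside_cost < b.inside_cost
	  || (a.inside_cost == b.inside_cost
	      && a.outside_cost < b.outside_cost));
}

static bool
example_better_by_count (const example_peel_option &a,
			 const example_peel_option &b)
{
  return (a.count > b.count
	  || (a.count == b.count && a.npeel < b.npeel));
}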
1669 1.1 mrg
/* Return true if peeling by NPEEL iterations, which makes DR0_INFO aligned,
   leaves all data references supportable.  */
1671 1.1 mrg
1672 1.1 mrg static bool
1673 1.1 mrg vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1674 1.1 mrg unsigned npeel)
1675 1.1 mrg {
1676 1.1 mrg vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1677 1.1 mrg enum dr_alignment_support supportable_dr_alignment;
1678 1.1 mrg
1679 1.1 mrg bool dr0_alignment_known_p
1680 1.1 mrg = known_alignment_for_access_p (dr0_info,
1681 1.1 mrg STMT_VINFO_VECTYPE (dr0_info->stmt));
1682 1.1 mrg
1683 1.1 mrg /* Ensure that all data refs can be vectorized after the peel. */
1684 1.1 mrg for (data_reference *dr : datarefs)
1685 1.1 mrg {
1686 1.1 mrg if (dr == dr0_info->dr)
1687 1.1 mrg continue;
1688 1.1 mrg
1689 1.1 mrg dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1690 1.1 mrg if (!vect_relevant_for_alignment_p (dr_info)
1691 1.1 mrg || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1692 1.1 mrg continue;
1693 1.1 mrg
1694 1.1 mrg tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1695 1.1 mrg int misalignment;
1696 1.1 mrg unsigned HOST_WIDE_INT alignment;
1697 1.1 mrg if (!dr0_alignment_known_p
1698 1.1 mrg || !known_alignment_for_access_p (dr_info, vectype)
1699 1.1 mrg || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1700 1.1 mrg misalignment = DR_MISALIGNMENT_UNKNOWN;
1701 1.1 mrg else
1702 1.1 mrg {
1703 1.1 mrg misalignment = dr_misalignment (dr_info, vectype);
1704 1.1 mrg misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1705 1.1 mrg misalignment &= alignment - 1;
1706 1.1 mrg }
1707 1.1 mrg supportable_dr_alignment
1708 1.1 mrg = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1709 1.1 mrg misalignment);
1710 1.1 mrg if (supportable_dr_alignment == dr_unaligned_unsupported)
1711 1.1 mrg return false;
1712 1.1 mrg }
1713 1.1 mrg
1714 1.1 mrg return true;
1715 1.1 mrg }
1716 1.1 mrg
1717 1.1 mrg /* Compare two data-references DRA and DRB to group them into chunks
1718 1.1 mrg with related alignment. */
1719 1.1 mrg
1720 1.1 mrg static int
1721 1.1 mrg dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1722 1.1 mrg {
1723 1.1 mrg data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1724 1.1 mrg data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1725 1.1 mrg int cmp;
1726 1.1 mrg
1727 1.1 mrg /* Stabilize sort. */
1728 1.1 mrg if (dra == drb)
1729 1.1 mrg return 0;
1730 1.1 mrg
1731 1.1 mrg /* Ordering of DRs according to base. */
1732 1.1 mrg cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
1733 1.1 mrg DR_BASE_ADDRESS (drb));
1734 1.1 mrg if (cmp != 0)
1735 1.1 mrg return cmp;
1736 1.1 mrg
1737 1.1 mrg /* And according to DR_OFFSET. */
1738 1.1 mrg cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
1739 1.1 mrg if (cmp != 0)
1740 1.1 mrg return cmp;
1741 1.1 mrg
  /* And according to DR_STEP.  */
1743 1.1 mrg cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
1744 1.1 mrg if (cmp != 0)
1745 1.1 mrg return cmp;
1746 1.1 mrg
  /* Then sort according to DR_INIT.  In case of identical DRs sort by stmt UID.  */
1748 1.1 mrg cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
1749 1.1 mrg if (cmp == 0)
1750 1.1 mrg return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
1751 1.1 mrg return cmp;
1752 1.1 mrg }
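
/* Hypothetical stand-alone model of the grouping sort above, with the tree
   comparisons replaced by plain integer fields: refs are ordered by base,
   then offset, then step, then init, with the statement UID as the final
   tie-breaker so the order is stable across runs.  Passing
   example_dr_align_group_less to std::sort (assuming <algorithm>) puts refs
   that differ only in DR_INIT next to each other, which is what
   vect_enhance_data_refs_alignment relies on when forming subgroups.  */

struct example_dr
{
  int base, offset, step, init, stmt_uid;
};

static bool
example_dr_align_group_less (const example_dr &a, const example_dr &b)
{
  if (a.base != b.base)
    return a.base < b.base;
  if (a.offset != b.offset)
    return a.offset < b.offset;
  if (a.step != b.step)
    return a.step < b.step;
  if (a.init != b.init)
    return a.init < b.init;
  /* Identical DRs: fall back to the statement UID to stabilize the sort.  */
  return a.stmt_uid < b.stmt_uid;
}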
1753 1.1 mrg
1754 1.1 mrg /* Function vect_enhance_data_refs_alignment
1755 1.1 mrg
1756 1.1 mrg This pass will use loop versioning and loop peeling in order to enhance
1757 1.1 mrg the alignment of data references in the loop.
1758 1.1 mrg
   FOR NOW: we assume that whatever versioning/peeling takes place, only the
   original loop is to be vectorized.  Any other loops that are created by
   the transformations performed in this pass are not supposed to be
   vectorized.  This restriction will be relaxed.
1763 1.1 mrg
   This pass will require a cost model to guide it on whether to apply peeling
   or versioning or a combination of the two.  For example, the scheme that
   Intel uses when given a loop with several memory accesses is as follows:
   choose one memory access ('p') whose alignment you want to force by
   peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
   other accesses are not necessarily aligned, or (2) use loop versioning to
   generate one loop in which all accesses are aligned, and another loop in
   which only 'p' is necessarily aligned.
1772 1.1 mrg
1773 1.1 mrg ("Automatic Intra-Register Vectorization for the Intel Architecture",
   Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1775 1.1 mrg Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1776 1.1 mrg
1777 1.1 mrg Devising a cost model is the most critical aspect of this work. It will
1778 1.1 mrg guide us on which access to peel for, whether to use loop versioning, how
1779 1.1 mrg many versions to create, etc. The cost model will probably consist of
1780 1.1 mrg generic considerations as well as target specific considerations (on
1781 1.1 mrg powerpc for example, misaligned stores are more painful than misaligned
1782 1.1 mrg loads).
1783 1.1 mrg
1784 1.1 mrg Here are the general steps involved in alignment enhancements:
1785 1.1 mrg
1786 1.1 mrg -- original loop, before alignment analysis:
1787 1.1 mrg for (i=0; i<N; i++){
1788 1.1 mrg x = q[i]; # DR_MISALIGNMENT(q) = unknown
1789 1.1 mrg p[i] = y; # DR_MISALIGNMENT(p) = unknown
1790 1.1 mrg }
1791 1.1 mrg
1792 1.1 mrg -- After vect_compute_data_refs_alignment:
1793 1.1 mrg for (i=0; i<N; i++){
1794 1.1 mrg x = q[i]; # DR_MISALIGNMENT(q) = 3
1795 1.1 mrg p[i] = y; # DR_MISALIGNMENT(p) = unknown
1796 1.1 mrg }
1797 1.1 mrg
1798 1.1 mrg -- Possibility 1: we do loop versioning:
1799 1.1 mrg if (p is aligned) {
1800 1.1 mrg for (i=0; i<N; i++){ # loop 1A
1801 1.1 mrg x = q[i]; # DR_MISALIGNMENT(q) = 3
1802 1.1 mrg p[i] = y; # DR_MISALIGNMENT(p) = 0
1803 1.1 mrg }
1804 1.1 mrg }
1805 1.1 mrg else {
1806 1.1 mrg for (i=0; i<N; i++){ # loop 1B
1807 1.1 mrg x = q[i]; # DR_MISALIGNMENT(q) = 3
1808 1.1 mrg p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1809 1.1 mrg }
1810 1.1 mrg }
1811 1.1 mrg
1812 1.1 mrg -- Possibility 2: we do loop peeling:
1813 1.1 mrg for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1814 1.1 mrg x = q[i];
1815 1.1 mrg p[i] = y;
1816 1.1 mrg }
1817 1.1 mrg for (i = 3; i < N; i++){ # loop 2A
1818 1.1 mrg x = q[i]; # DR_MISALIGNMENT(q) = 0
1819 1.1 mrg p[i] = y; # DR_MISALIGNMENT(p) = unknown
1820 1.1 mrg }
1821 1.1 mrg
1822 1.1 mrg -- Possibility 3: combination of loop peeling and versioning:
1823 1.1 mrg for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1824 1.1 mrg x = q[i];
1825 1.1 mrg p[i] = y;
1826 1.1 mrg }
1827 1.1 mrg if (p is aligned) {
1828 1.1 mrg for (i = 3; i<N; i++){ # loop 3A
1829 1.1 mrg x = q[i]; # DR_MISALIGNMENT(q) = 0
1830 1.1 mrg p[i] = y; # DR_MISALIGNMENT(p) = 0
1831 1.1 mrg }
1832 1.1 mrg }
1833 1.1 mrg else {
1834 1.1 mrg for (i = 3; i<N; i++){ # loop 3B
1835 1.1 mrg x = q[i]; # DR_MISALIGNMENT(q) = 0
1836 1.1 mrg p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1837 1.1 mrg }
1838 1.1 mrg }
1839 1.1 mrg
1840 1.1 mrg These loops are later passed to loop_transform to be vectorized. The
1841 1.1 mrg vectorizer will use the alignment information to guide the transformation
1842 1.1 mrg (whether to generate regular loads/stores, or with special handling for
1843 1.1 mrg misalignment). */
1844 1.1 mrg
1845 1.1 mrg opt_result
1846 1.1 mrg vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1847 1.1 mrg {
1848 1.1 mrg class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1849 1.1 mrg dr_vec_info *first_store = NULL;
1850 1.1 mrg dr_vec_info *dr0_info = NULL;
1851 1.1 mrg struct data_reference *dr;
1852 1.1 mrg unsigned int i;
1853 1.1 mrg bool do_peeling = false;
1854 1.1 mrg bool do_versioning = false;
1855 1.1 mrg unsigned int npeel = 0;
1856 1.1 mrg bool one_misalignment_known = false;
1857 1.1 mrg bool one_misalignment_unknown = false;
1858 1.1 mrg bool one_dr_unsupportable = false;
1859 1.1 mrg dr_vec_info *unsupportable_dr_info = NULL;
1860 1.1 mrg unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
1861 1.1 mrg hash_table<peel_info_hasher> peeling_htab (1);
1862 1.1 mrg
1863 1.1 mrg DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1864 1.1 mrg
1865 1.1 mrg /* Reset data so we can safely be called multiple times. */
1866 1.1 mrg LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1867 1.1 mrg LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1868 1.1 mrg
1869 1.1 mrg if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
1870 1.1 mrg return opt_result::success ();
1871 1.1 mrg
1872 1.1 mrg /* Sort the vector of datarefs so DRs that have the same or dependent
1873 1.1 mrg alignment are next to each other. */
1874 1.1 mrg auto_vec<data_reference_p> datarefs
1875 1.1 mrg = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
1876 1.1 mrg datarefs.qsort (dr_align_group_sort_cmp);
1877 1.1 mrg
  /* For each dataref, compute the number of other DRs that would become
     aligned if we peeled to align that dataref.  */
1880 1.1 mrg auto_vec<unsigned> n_same_align_refs (datarefs.length ());
1881 1.1 mrg n_same_align_refs.quick_grow_cleared (datarefs.length ());
1882 1.1 mrg unsigned i0;
1883 1.1 mrg for (i0 = 0; i0 < datarefs.length (); ++i0)
1884 1.1 mrg if (DR_BASE_ADDRESS (datarefs[i0]))
1885 1.1 mrg break;
1886 1.1 mrg for (i = i0 + 1; i <= datarefs.length (); ++i)
1887 1.1 mrg {
1888 1.1 mrg if (i == datarefs.length ()
1889 1.1 mrg || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
1890 1.1 mrg DR_BASE_ADDRESS (datarefs[i]), 0)
1891 1.1 mrg || !operand_equal_p (DR_OFFSET (datarefs[i0]),
1892 1.1 mrg DR_OFFSET (datarefs[i]), 0)
1893 1.1 mrg || !operand_equal_p (DR_STEP (datarefs[i0]),
1894 1.1 mrg DR_STEP (datarefs[i]), 0))
1895 1.1 mrg {
1896 1.1 mrg /* The subgroup [i0, i-1] now only differs in DR_INIT and
1897 1.1 mrg possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
1898 1.1 mrg will get known misalignment if we align one of the refs
1899 1.1 mrg with the largest DR_TARGET_ALIGNMENT. */
1900 1.1 mrg for (unsigned j = i0; j < i; ++j)
1901 1.1 mrg {
1902 1.1 mrg dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
1903 1.1 mrg for (unsigned k = i0; k < i; ++k)
1904 1.1 mrg {
1905 1.1 mrg if (k == j)
1906 1.1 mrg continue;
1907 1.1 mrg dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
1908 1.1 mrg if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
1909 1.1 mrg dr_infoj))
1910 1.1 mrg n_same_align_refs[j]++;
1911 1.1 mrg }
1912 1.1 mrg }
1913 1.1 mrg i0 = i;
1914 1.1 mrg }
1915 1.1 mrg }
1916 1.1 mrg
1917 1.1 mrg /* While cost model enhancements are expected in the future, the high level
1918 1.1 mrg view of the code at this time is as follows:
1919 1.1 mrg
1920 1.1 mrg A) If there is a misaligned access then see if peeling to align
1921 1.1 mrg this access can make all data references satisfy
1922 1.1 mrg vect_supportable_dr_alignment. If so, update data structures
1923 1.1 mrg as needed and return true.
1924 1.1 mrg
1925 1.1 mrg B) If peeling wasn't possible and there is a data reference with an
1926 1.1 mrg unknown misalignment that does not satisfy vect_supportable_dr_alignment
1927 1.1 mrg then see if loop versioning checks can be used to make all data
1928 1.1 mrg references satisfy vect_supportable_dr_alignment. If so, update
1929 1.1 mrg data structures as needed and return true.
1930 1.1 mrg
1931 1.1 mrg C) If neither peeling nor versioning were successful then return false if
1932 1.1 mrg any data reference does not satisfy vect_supportable_dr_alignment.
1933 1.1 mrg
1934 1.1 mrg D) Return true (all data references satisfy vect_supportable_dr_alignment).
1935 1.1 mrg
1936 1.1 mrg Note, Possibility 3 above (which is peeling and versioning together) is not
1937 1.1 mrg being done at this time. */
1938 1.1 mrg
1939 1.1 mrg /* (1) Peeling to force alignment. */
1940 1.1 mrg
1941 1.1 mrg /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1942 1.1 mrg Considerations:
1943 1.1 mrg + How many accesses will become aligned due to the peeling
1944 1.1 mrg - How many accesses will become unaligned due to the peeling,
1945 1.1 mrg and the cost of misaligned accesses.
1946 1.1 mrg - The cost of peeling (the extra runtime checks, the increase
1947 1.1 mrg in code size). */
1948 1.1 mrg
1949 1.1 mrg FOR_EACH_VEC_ELT (datarefs, i, dr)
1950 1.1 mrg {
1951 1.1 mrg dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1952 1.1 mrg if (!vect_relevant_for_alignment_p (dr_info))
1953 1.1 mrg continue;
1954 1.1 mrg
1955 1.1 mrg stmt_vec_info stmt_info = dr_info->stmt;
1956 1.1 mrg tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1957 1.1 mrg do_peeling = vector_alignment_reachable_p (dr_info);
1958 1.1 mrg if (do_peeling)
1959 1.1 mrg {
1960 1.1 mrg if (known_alignment_for_access_p (dr_info, vectype))
1961 1.1 mrg {
1962 1.1 mrg unsigned int npeel_tmp = 0;
1963 1.1 mrg bool negative = tree_int_cst_compare (DR_STEP (dr),
1964 1.1 mrg size_zero_node) < 0;
1965 1.1 mrg
1966 1.1 mrg /* If known_alignment_for_access_p then we have set
		 DR_MISALIGNMENT which is only done if we know it at compile
1968 1.1 mrg time, so it is safe to assume target alignment is constant.
1969 1.1 mrg */
1970 1.1 mrg unsigned int target_align =
1971 1.1 mrg DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1972 1.1 mrg unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
1973 1.1 mrg poly_int64 off = 0;
1974 1.1 mrg if (negative)
1975 1.1 mrg off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
1976 1.1 mrg unsigned int mis = dr_misalignment (dr_info, vectype, off);
1977 1.1 mrg mis = negative ? mis : -mis;
1978 1.1 mrg if (mis != 0)
1979 1.1 mrg npeel_tmp = (mis & (target_align - 1)) / dr_size;
1980 1.1 mrg
1981 1.1 mrg /* For multiple types, it is possible that the bigger type access
1982 1.1 mrg will have more than one peeling option. E.g., a loop with two
1983 1.1 mrg types: one of size (vector size / 4), and the other one of
		 size (vector size / 8).  The vectorization factor will be 8.  If both
1985 1.1 mrg accesses are misaligned by 3, the first one needs one scalar
1986 1.1 mrg iteration to be aligned, and the second one needs 5. But the
1987 1.1 mrg first one will be aligned also by peeling 5 scalar
1988 1.1 mrg iterations, and in that case both accesses will be aligned.
1989 1.1 mrg Hence, except for the immediate peeling amount, we also want
1990 1.1 mrg to try to add full vector size, while we don't exceed
1991 1.1 mrg vectorization factor.
1992 1.1 mrg We do this automatically for cost model, since we calculate
1993 1.1 mrg cost for every peeling option. */
1994 1.1 mrg poly_uint64 nscalars = npeel_tmp;
1995 1.1 mrg if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1996 1.1 mrg {
1997 1.1 mrg poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1998 1.1 mrg nscalars = (STMT_SLP_TYPE (stmt_info)
1999 1.1 mrg ? vf * DR_GROUP_SIZE (stmt_info) : vf);
2000 1.1 mrg }
2001 1.1 mrg
2002 1.1 mrg /* Save info about DR in the hash table. Also include peeling
2003 1.1 mrg amounts according to the explanation above. Indicate
2004 1.1 mrg the alignment status when the ref is not aligned.
2005 1.1 mrg ??? Rather than using unknown alignment here we should
2006 1.1 mrg prune all entries from the peeling hashtable which cause
2007 1.1 mrg DRs to be not supported. */
2008 1.1 mrg bool supportable_if_not_aligned
2009 1.1 mrg = vect_supportable_dr_alignment
2010 1.1 mrg (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2011 1.1 mrg while (known_le (npeel_tmp, nscalars))
2012 1.1 mrg {
2013 1.1 mrg vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2014 1.1 mrg dr_info, npeel_tmp,
2015 1.1 mrg supportable_if_not_aligned);
2016 1.1 mrg npeel_tmp += MAX (1, target_align / dr_size);
2017 1.1 mrg }
2018 1.1 mrg
2019 1.1 mrg one_misalignment_known = true;
2020 1.1 mrg }
2021 1.1 mrg else
2022 1.1 mrg {
	      /* If we don't know any misalignment values, we prefer
		 peeling for the data-ref that has the maximum number of
		 data-refs with the same alignment, unless the target prefers
		 to align stores over loads.  */
2027 1.1 mrg unsigned same_align_drs = n_same_align_refs[i];
2028 1.1 mrg if (!dr0_info
2029 1.1 mrg || dr0_same_align_drs < same_align_drs)
2030 1.1 mrg {
2031 1.1 mrg dr0_same_align_drs = same_align_drs;
2032 1.1 mrg dr0_info = dr_info;
2033 1.1 mrg }
2034 1.1 mrg /* For data-refs with the same number of related
2035 1.1 mrg accesses prefer the one where the misalign
2036 1.1 mrg computation will be invariant in the outermost loop. */
2037 1.1 mrg else if (dr0_same_align_drs == same_align_drs)
2038 1.1 mrg {
2039 1.1 mrg class loop *ivloop0, *ivloop;
2040 1.1 mrg ivloop0 = outermost_invariant_loop_for_expr
2041 1.1 mrg (loop, DR_BASE_ADDRESS (dr0_info->dr));
2042 1.1 mrg ivloop = outermost_invariant_loop_for_expr
2043 1.1 mrg (loop, DR_BASE_ADDRESS (dr));
2044 1.1 mrg if ((ivloop && !ivloop0)
2045 1.1 mrg || (ivloop && ivloop0
2046 1.1 mrg && flow_loop_nested_p (ivloop, ivloop0)))
2047 1.1 mrg dr0_info = dr_info;
2048 1.1 mrg }
2049 1.1 mrg
2050 1.1 mrg one_misalignment_unknown = true;
2051 1.1 mrg
2052 1.1 mrg /* Check for data refs with unsupportable alignment that
2053 1.1 mrg can be peeled. */
2054 1.1 mrg enum dr_alignment_support supportable_dr_alignment
2055 1.1 mrg = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2056 1.1 mrg DR_MISALIGNMENT_UNKNOWN);
2057 1.1 mrg if (supportable_dr_alignment == dr_unaligned_unsupported)
2058 1.1 mrg {
2059 1.1 mrg one_dr_unsupportable = true;
2060 1.1 mrg unsupportable_dr_info = dr_info;
2061 1.1 mrg }
2062 1.1 mrg
2063 1.1 mrg if (!first_store && DR_IS_WRITE (dr))
2064 1.1 mrg {
2065 1.1 mrg first_store = dr_info;
2066 1.1 mrg first_store_same_align_drs = same_align_drs;
2067 1.1 mrg }
2068 1.1 mrg }
2069 1.1 mrg }
2070 1.1 mrg else
2071 1.1 mrg {
2072 1.1 mrg if (!aligned_access_p (dr_info, vectype))
2073 1.1 mrg {
2074 1.1 mrg if (dump_enabled_p ())
2075 1.1 mrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2076 1.1 mrg "vector alignment may not be reachable\n");
2077 1.1 mrg break;
2078 1.1 mrg }
2079 1.1 mrg }
2080 1.1 mrg }
2081 1.1 mrg
2082 1.1 mrg /* Check if we can possibly peel the loop. */
2083 1.1 mrg if (!vect_can_advance_ivs_p (loop_vinfo)
2084 1.1 mrg || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
2085 1.1 mrg || loop->inner)
2086 1.1 mrg do_peeling = false;
2087 1.1 mrg
2088 1.1 mrg struct _vect_peel_extended_info peel_for_known_alignment;
2089 1.1 mrg struct _vect_peel_extended_info peel_for_unknown_alignment;
2090 1.1 mrg struct _vect_peel_extended_info best_peel;
2091 1.1 mrg
2092 1.1 mrg peel_for_unknown_alignment.inside_cost = INT_MAX;
2093 1.1 mrg peel_for_unknown_alignment.outside_cost = INT_MAX;
2094 1.1 mrg peel_for_unknown_alignment.peel_info.count = 0;
2095 1.1 mrg
2096 1.1 mrg if (do_peeling
2097 1.1 mrg && one_misalignment_unknown)
2098 1.1 mrg {
      /* Check if the target prefers aligned stores over aligned loads, i.e.,
	 if misaligned stores are more expensive than misaligned loads (taking
	 DRs with the same alignment into account).  */
2102 1.1 mrg unsigned int load_inside_cost = 0;
2103 1.1 mrg unsigned int load_outside_cost = 0;
2104 1.1 mrg unsigned int store_inside_cost = 0;
2105 1.1 mrg unsigned int store_outside_cost = 0;
2106 1.1 mrg unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2107 1.1 mrg
2108 1.1 mrg stmt_vector_for_cost dummy;
2109 1.1 mrg dummy.create (2);
2110 1.1 mrg vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2111 1.1 mrg &load_inside_cost,
2112 1.1 mrg &load_outside_cost,
2113 1.1 mrg &dummy, &dummy, estimated_npeels);
2114 1.1 mrg dummy.release ();
2115 1.1 mrg
2116 1.1 mrg if (first_store)
2117 1.1 mrg {
2118 1.1 mrg dummy.create (2);
2119 1.1 mrg vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2120 1.1 mrg &store_inside_cost,
2121 1.1 mrg &store_outside_cost,
2122 1.1 mrg &dummy, &dummy,
2123 1.1 mrg estimated_npeels);
2124 1.1 mrg dummy.release ();
2125 1.1 mrg }
2126 1.1 mrg else
2127 1.1 mrg {
2128 1.1 mrg store_inside_cost = INT_MAX;
2129 1.1 mrg store_outside_cost = INT_MAX;
2130 1.1 mrg }
2131 1.1 mrg
2132 1.1 mrg if (load_inside_cost > store_inside_cost
2133 1.1 mrg || (load_inside_cost == store_inside_cost
2134 1.1 mrg && load_outside_cost > store_outside_cost))
2135 1.1 mrg {
2136 1.1 mrg dr0_info = first_store;
2137 1.1 mrg dr0_same_align_drs = first_store_same_align_drs;
2138 1.1 mrg peel_for_unknown_alignment.inside_cost = store_inside_cost;
2139 1.1 mrg peel_for_unknown_alignment.outside_cost = store_outside_cost;
2140 1.1 mrg }
2141 1.1 mrg else
2142 1.1 mrg {
2143 1.1 mrg peel_for_unknown_alignment.inside_cost = load_inside_cost;
2144 1.1 mrg peel_for_unknown_alignment.outside_cost = load_outside_cost;
2145 1.1 mrg }
2146 1.1 mrg
2147 1.1 mrg stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2148 1.1 mrg prologue_cost_vec.create (2);
2149 1.1 mrg epilogue_cost_vec.create (2);
2150 1.1 mrg
2151 1.1 mrg int dummy2;
2152 1.1 mrg peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2153 1.1 mrg (loop_vinfo, estimated_npeels, &dummy2,
2154 1.1 mrg &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2155 1.1 mrg &prologue_cost_vec, &epilogue_cost_vec);
2156 1.1 mrg
2157 1.1 mrg prologue_cost_vec.release ();
2158 1.1 mrg epilogue_cost_vec.release ();
2159 1.1 mrg
2160 1.1 mrg peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2161 1.1 mrg }
2162 1.1 mrg
2163 1.1 mrg peel_for_unknown_alignment.peel_info.npeel = 0;
2164 1.1 mrg peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2165 1.1 mrg
2166 1.1 mrg best_peel = peel_for_unknown_alignment;
2167 1.1 mrg
2168 1.1 mrg peel_for_known_alignment.inside_cost = INT_MAX;
2169 1.1 mrg peel_for_known_alignment.outside_cost = INT_MAX;
2170 1.1 mrg peel_for_known_alignment.peel_info.count = 0;
2171 1.1 mrg peel_for_known_alignment.peel_info.dr_info = NULL;
2172 1.1 mrg
2173 1.1 mrg if (do_peeling && one_misalignment_known)
2174 1.1 mrg {
      /* Peeling is possible, and every data access is supported even when
	 unaligned.  So we try to choose the best possible peeling from
	 the hash table.  */
2178 1.1 mrg peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2179 1.1 mrg (&peeling_htab, loop_vinfo);
2180 1.1 mrg }
2181 1.1 mrg
2182 1.1 mrg /* Compare costs of peeling for known and unknown alignment. */
2183 1.1 mrg if (peel_for_known_alignment.peel_info.dr_info != NULL
2184 1.1 mrg && peel_for_unknown_alignment.inside_cost
2185 1.1 mrg >= peel_for_known_alignment.inside_cost)
2186 1.1 mrg {
2187 1.1 mrg best_peel = peel_for_known_alignment;
2188 1.1 mrg
2189 1.1 mrg /* If the best peeling for known alignment has NPEEL == 0, perform no
2190 1.1 mrg peeling at all except if there is an unsupportable dr that we can
2191 1.1 mrg align. */
2192 1.1 mrg if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2193 1.1 mrg do_peeling = false;
2194 1.1 mrg }
2195 1.1 mrg
2196 1.1 mrg /* If there is an unsupportable data ref, prefer this over all choices so far
2197 1.1 mrg since we'd have to discard a chosen peeling except when it accidentally
2198 1.1 mrg aligned the unsupportable data ref. */
2199 1.1 mrg if (one_dr_unsupportable)
2200 1.1 mrg dr0_info = unsupportable_dr_info;
2201 1.1 mrg else if (do_peeling)
2202 1.1 mrg {
2203 1.1 mrg /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2204 1.1 mrg TODO: Use nopeel_outside_cost or get rid of it? */
2205 1.1 mrg unsigned nopeel_inside_cost = 0;
2206 1.1 mrg unsigned nopeel_outside_cost = 0;
2207 1.1 mrg
2208 1.1 mrg stmt_vector_for_cost dummy;
2209 1.1 mrg dummy.create (2);
2210 1.1 mrg vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2211 1.1 mrg &nopeel_outside_cost, &dummy, &dummy, 0);
2212 1.1 mrg dummy.release ();
2213 1.1 mrg
2214 1.1 mrg /* Add epilogue costs. As we do not peel for alignment here, no prologue
2215 1.1 mrg costs will be recorded. */
2216 1.1 mrg stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2217 1.1 mrg prologue_cost_vec.create (2);
2218 1.1 mrg epilogue_cost_vec.create (2);
2219 1.1 mrg
2220 1.1 mrg int dummy2;
2221 1.1 mrg nopeel_outside_cost += vect_get_known_peeling_cost
2222 1.1 mrg (loop_vinfo, 0, &dummy2,
2223 1.1 mrg &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2224 1.1 mrg &prologue_cost_vec, &epilogue_cost_vec);
2225 1.1 mrg
2226 1.1 mrg prologue_cost_vec.release ();
2227 1.1 mrg epilogue_cost_vec.release ();
2228 1.1 mrg
2229 1.1 mrg npeel = best_peel.peel_info.npeel;
2230 1.1 mrg dr0_info = best_peel.peel_info.dr_info;
2231 1.1 mrg
      /* If not peeling is no more expensive than the best peeling we
	 have found so far, don't perform any peeling.  */
2234 1.1 mrg if (nopeel_inside_cost <= best_peel.inside_cost)
2235 1.1 mrg do_peeling = false;
2236 1.1 mrg }
2237 1.1 mrg
2238 1.1 mrg if (do_peeling)
2239 1.1 mrg {
2240 1.1 mrg stmt_vec_info stmt_info = dr0_info->stmt;
2241 1.1 mrg if (known_alignment_for_access_p (dr0_info,
2242 1.1 mrg STMT_VINFO_VECTYPE (stmt_info)))
2243 1.1 mrg {
2244 1.1 mrg bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2245 1.1 mrg size_zero_node) < 0;
2246 1.1 mrg if (!npeel)
2247 1.1 mrg {
2248 1.1 mrg /* Since it's known at compile time, compute the number of
2249 1.1 mrg iterations in the peeled loop (the peeling factor) for use in
2250 1.1 mrg updating DR_MISALIGNMENT values. The peeling factor is the
2251 1.1 mrg vectorization factor minus the misalignment as an element
2252 1.1 mrg count. */
2253 1.1 mrg tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2254 1.1 mrg poly_int64 off = 0;
2255 1.1 mrg if (negative)
2256 1.1 mrg off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2257 1.1 mrg * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2258 1.1 mrg unsigned int mis
2259 1.1 mrg = dr_misalignment (dr0_info, vectype, off);
2260 1.1 mrg mis = negative ? mis : -mis;
2261 1.1 mrg /* If known_alignment_for_access_p then we have set
		 DR_MISALIGNMENT which is only done if we know it at compile
2263 1.1 mrg time, so it is safe to assume target alignment is constant.
2264 1.1 mrg */
2265 1.1 mrg unsigned int target_align =
2266 1.1 mrg DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2267 1.1 mrg npeel = ((mis & (target_align - 1))
2268 1.1 mrg / vect_get_scalar_dr_size (dr0_info));
2269 1.1 mrg }
2270 1.1 mrg
2271 1.1 mrg /* For interleaved data access every iteration accesses all the
2272 1.1 mrg members of the group, therefore we divide the number of iterations
2273 1.1 mrg by the group size. */
2274 1.1 mrg if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2275 1.1 mrg npeel /= DR_GROUP_SIZE (stmt_info);
2276 1.1 mrg
2277 1.1 mrg if (dump_enabled_p ())
2278 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
2279 1.1 mrg "Try peeling by %d\n", npeel);
2280 1.1 mrg }
2281 1.1 mrg
2282 1.1 mrg /* Ensure that all datarefs can be vectorized after the peel. */
2283 1.1 mrg if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2284 1.1 mrg do_peeling = false;
2285 1.1 mrg
2286 1.1 mrg /* Check if all datarefs are supportable and log. */
2287 1.1 mrg if (do_peeling
2288 1.1 mrg && npeel == 0
2289 1.1 mrg && known_alignment_for_access_p (dr0_info,
2290 1.1 mrg STMT_VINFO_VECTYPE (stmt_info)))
2291 1.1 mrg return opt_result::success ();
2292 1.1 mrg
2293 1.1 mrg /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2294 1.1 mrg if (do_peeling)
2295 1.1 mrg {
2296 1.1 mrg unsigned max_allowed_peel
2297 1.1 mrg = param_vect_max_peeling_for_alignment;
2298 1.1 mrg if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2299 1.1 mrg max_allowed_peel = 0;
2300 1.1 mrg if (max_allowed_peel != (unsigned)-1)
2301 1.1 mrg {
2302 1.1 mrg unsigned max_peel = npeel;
2303 1.1 mrg if (max_peel == 0)
2304 1.1 mrg {
2305 1.1 mrg poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2306 1.1 mrg unsigned HOST_WIDE_INT target_align_c;
2307 1.1 mrg if (target_align.is_constant (&target_align_c))
2308 1.1 mrg max_peel =
2309 1.1 mrg target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2310 1.1 mrg else
2311 1.1 mrg {
2312 1.1 mrg do_peeling = false;
2313 1.1 mrg if (dump_enabled_p ())
2314 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
2315 1.1 mrg "Disable peeling, max peels set and vector"
2316 1.1 mrg " alignment unknown\n");
2317 1.1 mrg }
2318 1.1 mrg }
2319 1.1 mrg if (max_peel > max_allowed_peel)
2320 1.1 mrg {
2321 1.1 mrg do_peeling = false;
2322 1.1 mrg if (dump_enabled_p ())
2323 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
2324 1.1 mrg "Disable peeling, max peels reached: %d\n", max_peel);
2325 1.1 mrg }
2326 1.1 mrg }
2327 1.1 mrg }
2328 1.1 mrg
2329 1.1 mrg /* Cost model #2 - if peeling may result in a remaining loop not
2330 1.1 mrg iterating enough to be vectorized then do not peel. Since this
2331 1.1 mrg is a cost heuristic rather than a correctness decision, use the
2332 1.1 mrg most likely runtime value for variable vectorization factors. */
2333 1.1 mrg if (do_peeling
2334 1.1 mrg && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2335 1.1 mrg {
2336 1.1 mrg unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2337 1.1 mrg unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2338 1.1 mrg if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2339 1.1 mrg < assumed_vf + max_peel)
2340 1.1 mrg do_peeling = false;
2341 1.1 mrg }
2342 1.1 mrg
2343 1.1 mrg if (do_peeling)
2344 1.1 mrg {
2345 1.1 mrg /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2346 1.1 mrg If the misalignment of DR_i is identical to that of dr0 then set
2347 1.1 mrg DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2348 1.1 mrg dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2349 1.1 mrg by the peeling factor times the element size of DR_i (MOD the
2350 1.1 mrg vectorization factor times the size). Otherwise, the
2351 1.1 mrg misalignment of DR_i must be set to unknown. */
2352 1.1 mrg FOR_EACH_VEC_ELT (datarefs, i, dr)
2353 1.1 mrg if (dr != dr0_info->dr)
2354 1.1 mrg {
2355 1.1 mrg dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2356 1.1 mrg if (!vect_relevant_for_alignment_p (dr_info))
2357 1.1 mrg continue;
2358 1.1 mrg
2359 1.1 mrg vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2360 1.1 mrg }
2361 1.1 mrg
2362 1.1 mrg LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2363 1.1 mrg if (npeel)
2364 1.1 mrg LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2365 1.1 mrg else
2366 1.1 mrg LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2367 1.1 mrg SET_DR_MISALIGNMENT (dr0_info,
2368 1.1 mrg vect_dr_misalign_for_aligned_access (dr0_info));
2369 1.1 mrg if (dump_enabled_p ())
2370 1.1 mrg {
2371 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
2372 1.1 mrg "Alignment of access forced using peeling.\n");
2373 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
2374 1.1 mrg "Peeling for alignment will be applied.\n");
2375 1.1 mrg }
2376 1.1 mrg
2377 1.1 mrg /* The inside-loop cost will be accounted for in vectorizable_load
2378 1.1 mrg and vectorizable_store correctly with adjusted alignments.
2379 1.1 mrg Drop the body_cst_vec on the floor here. */
2380 1.1 mrg return opt_result::success ();
2381 1.1 mrg }
2382 1.1 mrg }
2383 1.1 mrg
2384 1.1 mrg /* (2) Versioning to force alignment. */
2385 1.1 mrg
2386 1.1 mrg /* Try versioning if:
     1) the loop is optimized for speed and the cost model is not cheap,
2388 1.1 mrg 2) there is at least one unsupported misaligned data ref with an unknown
2389 1.1 mrg misalignment, and
2390 1.1 mrg 3) all misaligned data refs with a known misalignment are supported, and
2391 1.1 mrg 4) the number of runtime alignment checks is within reason. */
2392 1.1 mrg
2393 1.1 mrg do_versioning
2394 1.1 mrg = (optimize_loop_nest_for_speed_p (loop)
2395 1.1 mrg && !loop->inner /* FORNOW */
2396 1.1 mrg && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2397 1.1 mrg
2398 1.1 mrg if (do_versioning)
2399 1.1 mrg {
2400 1.1 mrg FOR_EACH_VEC_ELT (datarefs, i, dr)
2401 1.1 mrg {
2402 1.1 mrg dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2403 1.1 mrg if (!vect_relevant_for_alignment_p (dr_info))
2404 1.1 mrg continue;
2405 1.1 mrg
2406 1.1 mrg stmt_vec_info stmt_info = dr_info->stmt;
2407 1.1 mrg if (STMT_VINFO_STRIDED_P (stmt_info))
2408 1.1 mrg {
2409 1.1 mrg do_versioning = false;
2410 1.1 mrg break;
2411 1.1 mrg }
2412 1.1 mrg
2413 1.1 mrg tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2414 1.1 mrg bool negative = tree_int_cst_compare (DR_STEP (dr),
2415 1.1 mrg size_zero_node) < 0;
2416 1.1 mrg poly_int64 off = 0;
2417 1.1 mrg if (negative)
2418 1.1 mrg off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2419 1.1 mrg * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2420 1.1 mrg int misalignment;
2421 1.1 mrg if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2422 1.1 mrg continue;
2423 1.1 mrg
2424 1.1 mrg enum dr_alignment_support supportable_dr_alignment
2425 1.1 mrg = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2426 1.1 mrg misalignment);
2427 1.1 mrg if (supportable_dr_alignment == dr_unaligned_unsupported)
2428 1.1 mrg {
2429 1.1 mrg if (misalignment != DR_MISALIGNMENT_UNKNOWN
2430 1.1 mrg || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2431 1.1 mrg >= (unsigned) param_vect_max_version_for_alignment_checks))
2432 1.1 mrg {
2433 1.1 mrg do_versioning = false;
2434 1.1 mrg break;
2435 1.1 mrg }
2436 1.1 mrg
2437 1.1 mrg /* At present we don't support versioning for alignment
2438 1.1 mrg with variable VF, since there's no guarantee that the
2439 1.1 mrg VF is a power of two. We could relax this if we added
2440 1.1 mrg a way of enforcing a power-of-two size. */
2441 1.1 mrg unsigned HOST_WIDE_INT size;
2442 1.1 mrg if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2443 1.1 mrg {
2444 1.1 mrg do_versioning = false;
2445 1.1 mrg break;
2446 1.1 mrg }
2447 1.1 mrg
2448 1.1 mrg /* Forcing alignment in the first iteration is no good if
2449 1.1 mrg we don't keep it across iterations. For now, just disable
2450 1.1 mrg versioning in this case.
2451 1.1 mrg ?? We could actually unroll the loop to achieve the required
2452 1.1 mrg overall step alignment, and forcing the alignment could be
2453 1.1 mrg done by doing some iterations of the non-vectorized loop. */
2454 1.1 mrg if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2455 1.1 mrg * DR_STEP_ALIGNMENT (dr),
2456 1.1 mrg DR_TARGET_ALIGNMENT (dr_info)))
2457 1.1 mrg {
2458 1.1 mrg do_versioning = false;
2459 1.1 mrg break;
2460 1.1 mrg }
2461 1.1 mrg
2462 1.1 mrg /* The rightmost bits of an aligned address must be zeros.
2463 1.1 mrg Construct the mask needed for this test. For example,
2464 1.1 mrg GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2465 1.1 mrg mask must be 15 = 0xf. */
2466 1.1 mrg int mask = size - 1;
2467 1.1 mrg
2468 1.1 mrg /* FORNOW: use the same mask to test all potentially unaligned
2469 1.1 mrg references in the loop. */
2470 1.1 mrg if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2471 1.1 mrg && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2472 1.1 mrg {
2473 1.1 mrg do_versioning = false;
2474 1.1 mrg break;
2475 1.1 mrg }
2476 1.1 mrg
2477 1.1 mrg LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2478 1.1 mrg LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2479 1.1 mrg }
2480 1.1 mrg }
2481 1.1 mrg
2482 1.1 mrg /* Versioning requires at least one misaligned data reference. */
2483 1.1 mrg if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2484 1.1 mrg do_versioning = false;
2485 1.1 mrg else if (!do_versioning)
2486 1.1 mrg LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2487 1.1 mrg }
2488 1.1 mrg
2489 1.1 mrg if (do_versioning)
2490 1.1 mrg {
2491 1.1 mrg const vec<stmt_vec_info> &may_misalign_stmts
2492 1.1 mrg = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2493 1.1 mrg stmt_vec_info stmt_info;
2494 1.1 mrg
2495 1.1 mrg /* It can now be assumed that the data references in the statements
2496 1.1 mrg in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2497 1.1 mrg of the loop being vectorized. */
2498 1.1 mrg FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2499 1.1 mrg {
2500 1.1 mrg dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2501 1.1 mrg SET_DR_MISALIGNMENT (dr_info,
2502 1.1 mrg vect_dr_misalign_for_aligned_access (dr_info));
2503 1.1 mrg if (dump_enabled_p ())
2504 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
2505 1.1 mrg "Alignment of access forced using versioning.\n");
2506 1.1 mrg }
2507 1.1 mrg
2508 1.1 mrg if (dump_enabled_p ())
2509 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
2510 1.1 mrg "Versioning for alignment will be applied.\n");
2511 1.1 mrg
2512 1.1 mrg /* Peeling and versioning can't be done together at this time. */
2513 1.1 mrg gcc_assert (! (do_peeling && do_versioning));
2514 1.1 mrg
2515 1.1 mrg return opt_result::success ();
2516 1.1 mrg }
2517 1.1 mrg
2518 1.1 mrg /* This point is reached if neither peeling nor versioning is being done. */
2519 1.1 mrg gcc_assert (! (do_peeling || do_versioning));
2520 1.1 mrg
2521 1.1 mrg return opt_result::success ();
2522 1.1 mrg }
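
/* Hypothetical stand-alone sketch of the two mechanisms decided above,
   with every quantity a compile-time constant.  TARGET_ALIGN and
   VECTOR_SIZE must be powers of two; the identifiers are illustrative and
   not part of the vectorizer.  */

/* Number of scalar iterations to peel so that a reference with byte
   misalignment MIS (for a positive step) becomes aligned; as in the code
   above, the caller divides the result by the group size for a grouped
   access.  */

static unsigned
example_peel_count (unsigned mis, unsigned target_align, unsigned dr_size)
{
  /* Unsigned negation is modular, so (-mis) & (target_align - 1) is the
     number of bytes still needed to reach the next aligned address.  */
  return ((-mis) & (target_align - 1)) / dr_size;
}

/* Runtime versioning instead checks that the low bits of every potentially
   misaligned address are zero, using a single shared mask.  */

static bool
example_versioning_check (unsigned long addr, unsigned vector_size)
{
  unsigned long mask = vector_size - 1;	/* e.g. 15 for a 16-byte V4SI.  */
  return (addr & mask) == 0;
}

/* E.g. example_peel_count (12, 16, 4) == 1: one peeled iteration moves a
   4-byte access from misalignment 12 to the next 16-byte boundary.  */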
2523 1.1 mrg
2524 1.1 mrg
2525 1.1 mrg /* Function vect_analyze_data_refs_alignment
2526 1.1 mrg
2527 1.1 mrg Analyze the alignment of the data-references in the loop.
2528 1.1 mrg Return FALSE if a data reference is found that cannot be vectorized. */
2529 1.1 mrg
2530 1.1 mrg opt_result
2531 1.1 mrg vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2532 1.1 mrg {
2533 1.1 mrg DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2534 1.1 mrg
2535 1.1 mrg vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2536 1.1 mrg struct data_reference *dr;
2537 1.1 mrg unsigned int i;
2538 1.1 mrg
2539 1.1 mrg vect_record_base_alignments (loop_vinfo);
2540 1.1 mrg FOR_EACH_VEC_ELT (datarefs, i, dr)
2541 1.1 mrg {
2542 1.1 mrg dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2543 1.1 mrg if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2544 1.1 mrg {
2545 1.1 mrg if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2546 1.1 mrg && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2547 1.1 mrg continue;
2548 1.1 mrg vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2549 1.1 mrg STMT_VINFO_VECTYPE (dr_info->stmt));
2550 1.1 mrg }
2551 1.1 mrg }
2552 1.1 mrg
2553 1.1 mrg return opt_result::success ();
2554 1.1 mrg }
2555 1.1 mrg
2556 1.1 mrg
2557 1.1 mrg /* Analyze alignment of DRs of stmts in NODE. */
2558 1.1 mrg
2559 1.1 mrg static bool
2560 1.1 mrg vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2561 1.1 mrg {
2562 1.1 mrg /* Alignment is maintained in the first element of the group. */
2563 1.1 mrg stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2564 1.1 mrg first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2565 1.1 mrg dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2566 1.1 mrg tree vectype = SLP_TREE_VECTYPE (node);
2567 1.1 mrg poly_uint64 vector_alignment
2568 1.1 mrg = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2569 1.1 mrg BITS_PER_UNIT);
2570 1.1 mrg if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2571 1.1 mrg vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2572 1.1 mrg /* Re-analyze alignment when we're facing a vectorization with a bigger
2573 1.1 mrg alignment requirement. */
2574 1.1 mrg else if (known_lt (dr_info->target_alignment, vector_alignment))
2575 1.1 mrg {
2576 1.1 mrg poly_uint64 old_target_alignment = dr_info->target_alignment;
2577 1.1 mrg int old_misalignment = dr_info->misalignment;
2578 1.1 mrg vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2579 1.1 mrg /* But keep knowledge about a smaller alignment. */
2580 1.1 mrg if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2581 1.1 mrg && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2582 1.1 mrg {
2583 1.1 mrg dr_info->target_alignment = old_target_alignment;
2584 1.1 mrg dr_info->misalignment = old_misalignment;
2585 1.1 mrg }
2586 1.1 mrg }
  /* If we ever face unordered target alignments, the first one analyzed wins
     and the others will appear as unknown in dr_misalignment.  */
2589 1.1 mrg return true;
2590 1.1 mrg }
2591 1.1 mrg
2592 1.1 mrg /* Function vect_slp_analyze_instance_alignment
2593 1.1 mrg
2594 1.1 mrg Analyze the alignment of the data-references in the SLP instance.
2595 1.1 mrg Return FALSE if a data reference is found that cannot be vectorized. */
2596 1.1 mrg
2597 1.1 mrg bool
2598 1.1 mrg vect_slp_analyze_instance_alignment (vec_info *vinfo,
2599 1.1 mrg slp_instance instance)
2600 1.1 mrg {
2601 1.1 mrg DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2602 1.1 mrg
2603 1.1 mrg slp_tree node;
2604 1.1 mrg unsigned i;
2605 1.1 mrg FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2606 1.1 mrg if (! vect_slp_analyze_node_alignment (vinfo, node))
2607 1.1 mrg return false;
2608 1.1 mrg
2609 1.1 mrg if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2610 1.1 mrg && ! vect_slp_analyze_node_alignment
2611 1.1 mrg (vinfo, SLP_INSTANCE_TREE (instance)))
2612 1.1 mrg return false;
2613 1.1 mrg
2614 1.1 mrg return true;
2615 1.1 mrg }
2616 1.1 mrg
2617 1.1 mrg
2618 1.1 mrg /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2619 1.1 mrg accesses of legal size, step, etc. Detect gaps, single element
2620 1.1 mrg interleaving, and other special cases. Set grouped access info.
2621 1.1 mrg Collect groups of strided stores for further use in SLP analysis.
2622 1.1 mrg Worker for vect_analyze_group_access. */
2623 1.1 mrg
2624 1.1 mrg static bool
2625 1.1 mrg vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2626 1.1 mrg {
2627 1.1 mrg data_reference *dr = dr_info->dr;
2628 1.1 mrg tree step = DR_STEP (dr);
2629 1.1 mrg tree scalar_type = TREE_TYPE (DR_REF (dr));
2630 1.1 mrg HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2631 1.1 mrg stmt_vec_info stmt_info = dr_info->stmt;
2632 1.1 mrg loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2633 1.1 mrg bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2634 1.1 mrg HOST_WIDE_INT dr_step = -1;
2635 1.1 mrg HOST_WIDE_INT groupsize, last_accessed_element = 1;
2636 1.1 mrg bool slp_impossible = false;
2637 1.1 mrg
2638 1.1 mrg /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2639 1.1 mrg size of the interleaving group (including gaps). */
2640 1.1 mrg if (tree_fits_shwi_p (step))
2641 1.1 mrg {
2642 1.1 mrg dr_step = tree_to_shwi (step);
2643 1.1 mrg /* Check that STEP is a multiple of type size. Otherwise there is
2644 1.1 mrg a non-element-sized gap at the end of the group which we
2645 1.1 mrg cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2646 1.1 mrg ??? As we can handle non-constant step fine here we should
2647 1.1 mrg simply remove uses of DR_GROUP_GAP between the last and first
2648 1.1 mrg element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2649 1.1 mrg simply not include that gap. */
2650 1.1 mrg if ((dr_step % type_size) != 0)
2651 1.1 mrg {
2652 1.1 mrg if (dump_enabled_p ())
2653 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
2654 1.1 mrg "Step %T is not a multiple of the element size"
2655 1.1 mrg " for %T\n",
2656 1.1 mrg step, DR_REF (dr));
2657 1.1 mrg return false;
2658 1.1 mrg }
2659 1.1 mrg groupsize = absu_hwi (dr_step) / type_size;
2660 1.1 mrg }
2661 1.1 mrg else
2662 1.1 mrg groupsize = 0;
2663 1.1 mrg
2664 1.1 mrg /* A non-consecutive access is possible only as part of an interleaving group. */
2665 1.1 mrg if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2666 1.1 mrg {
2667 1.1 mrg /* Check whether this DR is part of an interleaving group of which only a
2668 1.1 mrg single element is accessed in the loop. */
2669 1.1 mrg
2670 1.1 mrg /* Gaps are supported only for loads. STEP must be a multiple of the type
2671 1.1 mrg size. */
2672 1.1 mrg if (DR_IS_READ (dr)
2673 1.1 mrg && (dr_step % type_size) == 0
2674 1.1 mrg && groupsize > 0
2675 1.1 mrg /* This could be UINT_MAX but as we are generating code in a very
2676 1.1 mrg inefficient way we have to cap earlier.
2677 1.1 mrg See PR91403 for example. */
2678 1.1 mrg && groupsize <= 4096)
2679 1.1 mrg {
2680 1.1 mrg DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2681 1.1 mrg DR_GROUP_SIZE (stmt_info) = groupsize;
2682 1.1 mrg DR_GROUP_GAP (stmt_info) = groupsize - 1;
2683 1.1 mrg if (dump_enabled_p ())
2684 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
2685 1.1 mrg "Detected single element interleaving %T"
2686 1.1 mrg " step %T\n",
2687 1.1 mrg DR_REF (dr), step);
2688 1.1 mrg
2689 1.1 mrg return true;
2690 1.1 mrg }
2691 1.1 mrg
2692 1.1 mrg if (dump_enabled_p ())
2693 1.1 mrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2694 1.1 mrg "not consecutive access %G", stmt_info->stmt);
2695 1.1 mrg
2696 1.1 mrg if (bb_vinfo)
2697 1.1 mrg {
2698 1.1 mrg /* Mark the statement as unvectorizable. */
2699 1.1 mrg STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2700 1.1 mrg return true;
2701 1.1 mrg }
2702 1.1 mrg
2703 1.1 mrg if (dump_enabled_p ())
2704 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2705 1.1 mrg STMT_VINFO_STRIDED_P (stmt_info) = true;
2706 1.1 mrg return true;
2707 1.1 mrg }
2708 1.1 mrg
2709 1.1 mrg if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2710 1.1 mrg {
2711 1.1 mrg /* First stmt in the interleaving chain. Check the chain. */
2712 1.1 mrg stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2713 1.1 mrg struct data_reference *data_ref = dr;
2714 1.1 mrg unsigned int count = 1;
2715 1.1 mrg tree prev_init = DR_INIT (data_ref);
2716 1.1 mrg HOST_WIDE_INT diff, gaps = 0;
2717 1.1 mrg
2718 1.1 mrg /* By construction, all group members have INTEGER_CST DR_INITs. */
2719 1.1 mrg while (next)
2720 1.1 mrg {
2721 1.1 mrg /* We never have the same DR multiple times. */
2722 1.1 mrg gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2723 1.1 mrg DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2724 1.1 mrg
2725 1.1 mrg data_ref = STMT_VINFO_DATA_REF (next);
2726 1.1 mrg
2727 1.1 mrg /* All group members have the same STEP by construction. */
2728 1.1 mrg gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2729 1.1 mrg
2730 1.1 mrg /* Check that the distance between two accesses is equal to the type
2731 1.1 mrg size. Otherwise, we have gaps. */
2732 1.1 mrg diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2733 1.1 mrg - TREE_INT_CST_LOW (prev_init)) / type_size;
2734 1.1 mrg if (diff < 1 || diff > UINT_MAX)
2735 1.1 mrg {
2736 1.1 mrg /* For artificial testcases with array accesses with large
2737 1.1 mrg constant indices we can run into overflow issues which
2738 1.1 mrg can end up fooling the groupsize constraint below, so
2739 1.1 mrg check the individual gaps (which are represented as
2740 1.1 mrg unsigned int) as well. */
2741 1.1 mrg if (dump_enabled_p ())
2742 1.1 mrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2743 1.1 mrg "interleaved access with gap larger "
2744 1.1 mrg "than representable\n");
2745 1.1 mrg return false;
2746 1.1 mrg }
2747 1.1 mrg if (diff != 1)
2748 1.1 mrg {
2749 1.1 mrg /* FORNOW: SLP of accesses with gaps is not supported. */
2750 1.1 mrg slp_impossible = true;
2751 1.1 mrg if (DR_IS_WRITE (data_ref))
2752 1.1 mrg {
2753 1.1 mrg if (dump_enabled_p ())
2754 1.1 mrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2755 1.1 mrg "interleaved store with gaps\n");
2756 1.1 mrg return false;
2757 1.1 mrg }
2758 1.1 mrg
2759 1.1 mrg gaps += diff - 1;
2760 1.1 mrg }
2761 1.1 mrg
2762 1.1 mrg last_accessed_element += diff;
2763 1.1 mrg
2764 1.1 mrg /* Store the gap from the previous member of the group. If there is no
2765 1.1 mrg gap in the access, DR_GROUP_GAP is always 1. */
2766 1.1 mrg DR_GROUP_GAP (next) = diff;
2767 1.1 mrg
2768 1.1 mrg prev_init = DR_INIT (data_ref);
2769 1.1 mrg next = DR_GROUP_NEXT_ELEMENT (next);
2770 1.1 mrg /* Count the number of data-refs in the chain. */
2771 1.1 mrg count++;
2772 1.1 mrg }
2773 1.1 mrg
2774 1.1 mrg if (groupsize == 0)
2775 1.1 mrg groupsize = count + gaps;
2776 1.1 mrg
2777 1.1 mrg /* This could be UINT_MAX but as we are generating code in a very
2778 1.1 mrg inefficient way we have to cap earlier. See PR78699 for example. */
2779 1.1 mrg if (groupsize > 4096)
2780 1.1 mrg {
2781 1.1 mrg if (dump_enabled_p ())
2782 1.1 mrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2783 1.1 mrg "group is too large\n");
2784 1.1 mrg return false;
2785 1.1 mrg }
2786 1.1 mrg
2787 1.1 mrg /* Check that the size of the interleaving is equal to count for stores,
2788 1.1 mrg i.e., that there are no gaps. */
2789 1.1 mrg if (groupsize != count
2790 1.1 mrg && !DR_IS_READ (dr))
2791 1.1 mrg {
2792 1.1 mrg groupsize = count;
2793 1.1 mrg STMT_VINFO_STRIDED_P (stmt_info) = true;
2794 1.1 mrg }
2795 1.1 mrg
2796 1.1 mrg /* If there is a gap after the last load in the group it is the
2797 1.1 mrg difference between the groupsize and the last accessed
2798 1.1 mrg element.
2799 1.1 mrg When there is no gap, this difference should be 0. */
2800 1.1 mrg DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2801 1.1 mrg
2802 1.1 mrg DR_GROUP_SIZE (stmt_info) = groupsize;
2803 1.1 mrg if (dump_enabled_p ())
2804 1.1 mrg {
2805 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
2806 1.1 mrg "Detected interleaving ");
2807 1.1 mrg if (DR_IS_READ (dr))
2808 1.1 mrg dump_printf (MSG_NOTE, "load ");
2809 1.1 mrg else if (STMT_VINFO_STRIDED_P (stmt_info))
2810 1.1 mrg dump_printf (MSG_NOTE, "strided store ");
2811 1.1 mrg else
2812 1.1 mrg dump_printf (MSG_NOTE, "store ");
2813 1.1 mrg dump_printf (MSG_NOTE, "of size %u\n",
2814 1.1 mrg (unsigned)groupsize);
2815 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2816 1.1 mrg next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2817 1.1 mrg while (next)
2818 1.1 mrg {
2819 1.1 mrg if (DR_GROUP_GAP (next) != 1)
2820 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
2821 1.1 mrg "\t<gap of %d elements>\n",
2822 1.1 mrg DR_GROUP_GAP (next) - 1);
2823 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2824 1.1 mrg next = DR_GROUP_NEXT_ELEMENT (next);
2825 1.1 mrg }
2826 1.1 mrg if (DR_GROUP_GAP (stmt_info) != 0)
2827 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
2828 1.1 mrg "\t<gap of %d elements>\n",
2829 1.1 mrg DR_GROUP_GAP (stmt_info));
2830 1.1 mrg }
2831 1.1 mrg
2832 1.1 mrg /* SLP: create an SLP data structure for every interleaving group of
2833 1.1 mrg stores for further analysis in vect_analyze_slp. */
2834 1.1 mrg if (DR_IS_WRITE (dr) && !slp_impossible)
2835 1.1 mrg {
2836 1.1 mrg if (loop_vinfo)
2837 1.1 mrg LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2838 1.1 mrg if (bb_vinfo)
2839 1.1 mrg BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2840 1.1 mrg }
2841 1.1 mrg }
2842 1.1 mrg
2843 1.1 mrg return true;
2844 1.1 mrg }
2845 1.1 mrg
2846 1.1 mrg /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2847 1.1 mrg accesses of legal size, step, etc. Detect gaps, single element
2848 1.1 mrg interleaving, and other special cases. Set grouped access info.
2849 1.1 mrg Collect groups of strided stores for further use in SLP analysis. */
2850 1.1 mrg
2851 1.1 mrg static bool
2852 1.1 mrg vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2853 1.1 mrg {
2854 1.1 mrg if (!vect_analyze_group_access_1 (vinfo, dr_info))
2855 1.1 mrg {
2856 1.1 mrg /* Dissolve the group if present. */
2857 1.1 mrg stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2858 1.1 mrg while (stmt_info)
2859 1.1 mrg {
2860 1.1 mrg stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2861 1.1 mrg DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2862 1.1 mrg DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2863 1.1 mrg stmt_info = next;
2864 1.1 mrg }
2865 1.1 mrg return false;
2866 1.1 mrg }
2867 1.1 mrg return true;
2868 1.1 mrg }
2869 1.1 mrg
2870 1.1 mrg /* Analyze the access pattern of the data-reference DR_INFO.
2871 1.1 mrg In case of non-consecutive accesses call vect_analyze_group_access() to
2872 1.1 mrg analyze groups of accesses. */
2873 1.1 mrg
2874 1.1 mrg static bool
2875 1.1 mrg vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2876 1.1 mrg {
2877 1.1 mrg data_reference *dr = dr_info->dr;
2878 1.1 mrg tree step = DR_STEP (dr);
2879 1.1 mrg tree scalar_type = TREE_TYPE (DR_REF (dr));
2880 1.1 mrg stmt_vec_info stmt_info = dr_info->stmt;
2881 1.1 mrg loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2882 1.1 mrg class loop *loop = NULL;
2883 1.1 mrg
2884 1.1 mrg if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2885 1.1 mrg return true;
2886 1.1 mrg
2887 1.1 mrg if (loop_vinfo)
2888 1.1 mrg loop = LOOP_VINFO_LOOP (loop_vinfo);
2889 1.1 mrg
2890 1.1 mrg if (loop_vinfo && !step)
2891 1.1 mrg {
2892 1.1 mrg if (dump_enabled_p ())
2893 1.1 mrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2894 1.1 mrg "bad data-ref access in loop\n");
2895 1.1 mrg return false;
2896 1.1 mrg }
2897 1.1 mrg
2898 1.1 mrg /* Allow loads with zero step in inner-loop vectorization. */
2899 1.1 mrg if (loop_vinfo && integer_zerop (step))
2900 1.1 mrg {
2901 1.1 mrg DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2902 1.1 mrg if (!nested_in_vect_loop_p (loop, stmt_info))
2903 1.1 mrg return DR_IS_READ (dr);
2904 1.1 mrg /* Allow references with zero step for outer loops marked
2905 1.1 mrg with pragma omp simd only - it guarantees absence of
2906 1.1 mrg loop-carried dependencies between inner loop iterations. */
2907 1.1 mrg if (loop->safelen < 2)
2908 1.1 mrg {
2909 1.1 mrg if (dump_enabled_p ())
2910 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
2911 1.1 mrg "zero step in inner loop of nest\n");
2912 1.1 mrg return false;
2913 1.1 mrg }
2914 1.1 mrg }
2915 1.1 mrg
2916 1.1 mrg if (loop && nested_in_vect_loop_p (loop, stmt_info))
2917 1.1 mrg {
2918 1.1 mrg /* Interleaved accesses are not yet supported within outer-loop
2919 1.1 mrg vectorization for references in the inner-loop. */
2920 1.1 mrg DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2921 1.1 mrg
2922 1.1 mrg /* For the rest of the analysis we use the outer-loop step. */
2923 1.1 mrg step = STMT_VINFO_DR_STEP (stmt_info);
2924 1.1 mrg if (integer_zerop (step))
2925 1.1 mrg {
2926 1.1 mrg if (dump_enabled_p ())
2927 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
2928 1.1 mrg "zero step in outer loop.\n");
2929 1.1 mrg return DR_IS_READ (dr);
2930 1.1 mrg }
2931 1.1 mrg }
2932 1.1 mrg
2933 1.1 mrg /* Consecutive? */
2934 1.1 mrg if (TREE_CODE (step) == INTEGER_CST)
2935 1.1 mrg {
2936 1.1 mrg HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2937 1.1 mrg if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2938 1.1 mrg || (dr_step < 0
2939 1.1 mrg && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2940 1.1 mrg {
2941 1.1 mrg /* Mark that it is not interleaving. */
2942 1.1 mrg DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2943 1.1 mrg return true;
2944 1.1 mrg }
2945 1.1 mrg }
2946 1.1 mrg
2947 1.1 mrg if (loop && nested_in_vect_loop_p (loop, stmt_info))
2948 1.1 mrg {
2949 1.1 mrg if (dump_enabled_p ())
2950 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
2951 1.1 mrg "grouped access in outer loop.\n");
2952 1.1 mrg return false;
2953 1.1 mrg }
2954 1.1 mrg
2955 1.1 mrg
2956 1.1 mrg /* Assume this is a DR handled by the non-constant strided load case. */
2957 1.1 mrg if (TREE_CODE (step) != INTEGER_CST)
2958 1.1 mrg return (STMT_VINFO_STRIDED_P (stmt_info)
2959 1.1 mrg && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2960 1.1 mrg || vect_analyze_group_access (vinfo, dr_info)));
2961 1.1 mrg
2962 1.1 mrg /* Non-consecutive access - check whether it's part of an interleaving group. */
2963 1.1 mrg return vect_analyze_group_access (vinfo, dr_info);
2964 1.1 mrg }
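
/* Illustrative sketch (not part of the original source) of how accesses are
   classified above, assuming 4-byte ints:

     a[i]      step == TYPE_SIZE_UNIT      -> consecutive, not grouped
     a[2*i]    constant step != elt size   -> handed to vect_analyze_group_access
     a[i*n]    non-constant step           -> only OK when STMT_VINFO_STRIDED_P
     invariant load (step 0)               -> allowed for reads, and for inner
                                              refs of outer loops with safelen > 1

   The exact outcome also depends on the loop nesting handled above.  */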
2965 1.1 mrg
2966 1.1 mrg /* Comparison function to sort data-references DRA and DRB into chunks from
2967 1.1 mrg which interleaving groups can be built. */
2968 1.1 mrg
2969 1.1 mrg static int
2970 1.1 mrg dr_group_sort_cmp (const void *dra_, const void *drb_)
2971 1.1 mrg {
2972 1.1 mrg dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
2973 1.1 mrg dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
2974 1.1 mrg data_reference_p dra = dra_info->dr;
2975 1.1 mrg data_reference_p drb = drb_info->dr;
2976 1.1 mrg int cmp;
2977 1.1 mrg
2978 1.1 mrg /* Stabilize sort. */
2979 1.1 mrg if (dra == drb)
2980 1.1 mrg return 0;
2981 1.1 mrg
2982 1.1 mrg /* DRs with different group IDs never belong to the same group. */
2983 1.1 mrg if (dra_info->group != drb_info->group)
2984 1.1 mrg return dra_info->group < drb_info->group ? -1 : 1;
2985 1.1 mrg
2986 1.1 mrg /* Ordering of DRs according to base. */
2987 1.1 mrg cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2988 1.1 mrg DR_BASE_ADDRESS (drb));
2989 1.1 mrg if (cmp != 0)
2990 1.1 mrg return cmp;
2991 1.1 mrg
2992 1.1 mrg /* And according to DR_OFFSET. */
2993 1.1 mrg cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2994 1.1 mrg if (cmp != 0)
2995 1.1 mrg return cmp;
2996 1.1 mrg
2997 1.1 mrg /* Put reads before writes. */
2998 1.1 mrg if (DR_IS_READ (dra) != DR_IS_READ (drb))
2999 1.1 mrg return DR_IS_READ (dra) ? -1 : 1;
3000 1.1 mrg
3001 1.1 mrg /* Then sort by access size. */
3002 1.1 mrg cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
3003 1.1 mrg TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
3004 1.1 mrg if (cmp != 0)
3005 1.1 mrg return cmp;
3006 1.1 mrg
3007 1.1 mrg /* And by step. */
3008 1.1 mrg cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
3009 1.1 mrg if (cmp != 0)
3010 1.1 mrg return cmp;
3011 1.1 mrg
3012 1.1 mrg /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
3013 1.1 mrg cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3014 1.1 mrg if (cmp == 0)
3015 1.1 mrg return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3016 1.1 mrg return cmp;
3017 1.1 mrg }
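
/* Illustrative sketch (not part of the original source): the comparison
   above effectively sorts by the key

     (group, DR_BASE_ADDRESS, DR_OFFSET, is-write, access size, DR_STEP,
      DR_INIT, stmt UID)

   so that, for example, the references a[i] and a[i + 1] from the same
   basic block end up adjacent in the sorted copy, which lets the chain
   building in vect_analyze_data_ref_accesses scan the datarefs linearly.  */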
3018 1.1 mrg
3019 1.1 mrg /* If OP is the result of a conversion, return the unconverted value,
3020 1.1 mrg otherwise return null. */
3021 1.1 mrg
3022 1.1 mrg static tree
3023 1.1 mrg strip_conversion (tree op)
3024 1.1 mrg {
3025 1.1 mrg if (TREE_CODE (op) != SSA_NAME)
3026 1.1 mrg return NULL_TREE;
3027 1.1 mrg gimple *stmt = SSA_NAME_DEF_STMT (op);
3028 1.1 mrg if (!is_gimple_assign (stmt)
3029 1.1 mrg || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3030 1.1 mrg return NULL_TREE;
3031 1.1 mrg return gimple_assign_rhs1 (stmt);
3032 1.1 mrg }
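
/* Illustrative sketch (not part of the original source): for GIMPLE like

     mask.1_2 = (other_mask_type) mask_3;

   strip_conversion (mask.1_2) returns mask_3, while for a non-SSA operand
   or an SSA name whose defining statement is not a conversion it returns
   NULL_TREE.  */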
3033 1.1 mrg
3034 1.1 mrg /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3035 1.1 mrg and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3036 1.1 mrg be grouped in SLP mode. */
3037 1.1 mrg
3038 1.1 mrg static bool
3039 1.1 mrg can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3040 1.1 mrg bool allow_slp_p)
3041 1.1 mrg {
3042 1.1 mrg if (gimple_assign_single_p (stmt1_info->stmt))
3043 1.1 mrg return gimple_assign_single_p (stmt2_info->stmt);
3044 1.1 mrg
3045 1.1 mrg gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3046 1.1 mrg if (call1 && gimple_call_internal_p (call1))
3047 1.1 mrg {
3048 1.1 mrg /* Check for two masked loads or two masked stores. */
3049 1.1 mrg gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3050 1.1 mrg if (!call2 || !gimple_call_internal_p (call2))
3051 1.1 mrg return false;
3052 1.1 mrg internal_fn ifn = gimple_call_internal_fn (call1);
3053 1.1 mrg if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3054 1.1 mrg return false;
3055 1.1 mrg if (ifn != gimple_call_internal_fn (call2))
3056 1.1 mrg return false;
3057 1.1 mrg
3058 1.1 mrg /* Check that the masks are the same. Cope with casts of masks,
3059 1.1 mrg like those created by build_mask_conversion. */
3060 1.1 mrg tree mask1 = gimple_call_arg (call1, 2);
3061 1.1 mrg tree mask2 = gimple_call_arg (call2, 2);
3062 1.1 mrg if (!operand_equal_p (mask1, mask2, 0)
3063 1.1 mrg && (ifn == IFN_MASK_STORE || !allow_slp_p))
3064 1.1 mrg {
3065 1.1 mrg mask1 = strip_conversion (mask1);
3066 1.1 mrg if (!mask1)
3067 1.1 mrg return false;
3068 1.1 mrg mask2 = strip_conversion (mask2);
3069 1.1 mrg if (!mask2)
3070 1.1 mrg return false;
3071 1.1 mrg if (!operand_equal_p (mask1, mask2, 0))
3072 1.1 mrg return false;
3073 1.1 mrg }
3074 1.1 mrg return true;
3075 1.1 mrg }
3076 1.1 mrg
3077 1.1 mrg return false;
3078 1.1 mrg }
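
/* Illustrative sketch (not part of the original source): two masked loads

     _5 = .MASK_LOAD (&a[i], align, mask_1);
     _6 = .MASK_LOAD (&a[i + 1], align, mask_2);

   can be grouped when mask_1 and mask_2 are the same value (possibly hidden
   behind conversions created by build_mask_conversion), or, with ALLOW_SLP_P,
   even with different masks.  Plain (non-call) loads and stores are
   recognized via gimple_assign_single_p and can always be grouped with each
   other.  The argument positions shown match those queried by the code
   above.  */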
3079 1.1 mrg
3080 1.1 mrg /* Function vect_analyze_data_ref_accesses.
3081 1.1 mrg
3082 1.1 mrg Analyze the access pattern of all the data references in the loop.
3083 1.1 mrg
3084 1.1 mrg FORNOW: the only access pattern that is considered vectorizable is a
3085 1.1 mrg simple step 1 (consecutive) access.
3086 1.1 mrg
3087 1.1 mrg FORNOW: handle only arrays and pointer accesses. */
3088 1.1 mrg
3089 1.1 mrg opt_result
3090 1.1 mrg vect_analyze_data_ref_accesses (vec_info *vinfo,
3091 1.1 mrg vec<int> *dataref_groups)
3092 1.1 mrg {
3093 1.1 mrg unsigned int i;
3094 1.1 mrg vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3095 1.1 mrg
3096 1.1 mrg DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3097 1.1 mrg
3098 1.1 mrg if (datarefs.is_empty ())
3099 1.1 mrg return opt_result::success ();
3100 1.1 mrg
3101 1.1 mrg /* Sort the array of datarefs to make building the interleaving chains
3102 1.1 mrg linear. Don't modify the original vector's order; it is needed for
3103 1.1 mrg determining which dependences are reversed. */
3104 1.1 mrg vec<dr_vec_info *> datarefs_copy;
3105 1.1 mrg datarefs_copy.create (datarefs.length ());
3106 1.1 mrg for (unsigned i = 0; i < datarefs.length (); i++)
3107 1.1 mrg {
3108 1.1 mrg dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3109 1.1 mrg /* If the caller computed a DR grouping, use that; otherwise group by
3110 1.1 mrg basic blocks. */
3111 1.1 mrg if (dataref_groups)
3112 1.1 mrg dr_info->group = (*dataref_groups)[i];
3113 1.1 mrg else
3114 1.1 mrg dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3115 1.1 mrg datarefs_copy.quick_push (dr_info);
3116 1.1 mrg }
3117 1.1 mrg datarefs_copy.qsort (dr_group_sort_cmp);
3118 1.1 mrg hash_set<stmt_vec_info> to_fixup;
3119 1.1 mrg
3120 1.1 mrg /* Build the interleaving chains. */
3121 1.1 mrg for (i = 0; i < datarefs_copy.length () - 1;)
3122 1.1 mrg {
3123 1.1 mrg dr_vec_info *dr_info_a = datarefs_copy[i];
3124 1.1 mrg data_reference_p dra = dr_info_a->dr;
3125 1.1 mrg int dra_group_id = dr_info_a->group;
3126 1.1 mrg stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3127 1.1 mrg stmt_vec_info lastinfo = NULL;
3128 1.1 mrg if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3129 1.1 mrg || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3130 1.1 mrg {
3131 1.1 mrg ++i;
3132 1.1 mrg continue;
3133 1.1 mrg }
3134 1.1 mrg for (i = i + 1; i < datarefs_copy.length (); ++i)
3135 1.1 mrg {
3136 1.1 mrg dr_vec_info *dr_info_b = datarefs_copy[i];
3137 1.1 mrg data_reference_p drb = dr_info_b->dr;
3138 1.1 mrg int drb_group_id = dr_info_b->group;
3139 1.1 mrg stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3140 1.1 mrg if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3141 1.1 mrg || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3142 1.1 mrg break;
3143 1.1 mrg
3144 1.1 mrg /* ??? Imperfect sorting (non-compatible types, non-modulo
3145 1.1 mrg accesses, same accesses) can cause a group to be artificially
3146 1.1 mrg split here as we don't just skip over those. If it really
3147 1.1 mrg matters we can push those to a worklist and re-iterate
3148 1.1 mrg over them. Then we can just skip ahead to the next DR here. */
3149 1.1 mrg
3150 1.1 mrg /* DRs in a different DR group should not be put into the same
3151 1.1 mrg interleaving group. */
3152 1.1 mrg if (dra_group_id != drb_group_id)
3153 1.1 mrg break;
3154 1.1 mrg
3155 1.1 mrg /* Check that the data-refs have the same first location (except init)
3156 1.1 mrg and that they are both either stores or loads (not one load and one
3157 1.1 mrg store, nor non-groupable masked loads or stores). */
3158 1.1 mrg if (DR_IS_READ (dra) != DR_IS_READ (drb)
3159 1.1 mrg || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3160 1.1 mrg DR_BASE_ADDRESS (drb)) != 0
3161 1.1 mrg || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3162 1.1 mrg || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3163 1.1 mrg break;
3164 1.1 mrg
3165 1.1 mrg /* Check that the data-refs have the same constant size. */
3166 1.1 mrg tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3167 1.1 mrg tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3168 1.1 mrg if (!tree_fits_uhwi_p (sza)
3169 1.1 mrg || !tree_fits_uhwi_p (szb)
3170 1.1 mrg || !tree_int_cst_equal (sza, szb))
3171 1.1 mrg break;
3172 1.1 mrg
3173 1.1 mrg /* Check that the data-refs have the same step. */
3174 1.1 mrg if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3175 1.1 mrg break;
3176 1.1 mrg
3177 1.1 mrg /* Check the types are compatible.
3178 1.1 mrg ??? We don't distinguish this during sorting. */
3179 1.1 mrg if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3180 1.1 mrg TREE_TYPE (DR_REF (drb))))
3181 1.1 mrg break;
3182 1.1 mrg
3183 1.1 mrg /* Check that the DR_INITs are compile-time constants. */
3184 1.1 mrg if (!tree_fits_shwi_p (DR_INIT (dra))
3185 1.1 mrg || !tree_fits_shwi_p (DR_INIT (drb)))
3186 1.1 mrg break;
3187 1.1 mrg
3188 1.1 mrg /* Different .GOMP_SIMD_LANE calls still give the same lane,
3189 1.1 mrg just hold extra information. */
3190 1.1 mrg if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3191 1.1 mrg && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3192 1.1 mrg && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3193 1.1 mrg break;
3194 1.1 mrg
3195 1.1 mrg /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3196 1.1 mrg HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3197 1.1 mrg HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3198 1.1 mrg HOST_WIDE_INT init_prev
3199 1.1 mrg = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3200 1.1 mrg gcc_assert (init_a <= init_b
3201 1.1 mrg && init_a <= init_prev
3202 1.1 mrg && init_prev <= init_b);
3203 1.1 mrg
3204 1.1 mrg /* Do not place the same access in the interleaving chain twice. */
3205 1.1 mrg if (init_b == init_prev)
3206 1.1 mrg {
3207 1.1 mrg gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3208 1.1 mrg < gimple_uid (DR_STMT (drb)));
3209 1.1 mrg /* Simply link in duplicates and fix up the chain below. */
3210 1.1 mrg }
3211 1.1 mrg else
3212 1.1 mrg {
3213 1.1 mrg /* If init_b == init_a + the size of the type * k, we have an
3214 1.1 mrg interleaving, and DRA is accessed before DRB. */
3215 1.1 mrg unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3216 1.1 mrg if (type_size_a == 0
3217 1.1 mrg || (((unsigned HOST_WIDE_INT)init_b - init_a)
3218 1.1 mrg % type_size_a != 0))
3219 1.1 mrg break;
3220 1.1 mrg
3221 1.1 mrg /* If we have a store, the accesses are adjacent. This splits
3222 1.1 mrg groups into chunks we support (we don't support vectorization
3223 1.1 mrg of stores with gaps). */
3224 1.1 mrg if (!DR_IS_READ (dra)
3225 1.1 mrg && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3226 1.1 mrg != type_size_a))
3227 1.1 mrg break;
3228 1.1 mrg
3229 1.1 mrg /* If the step (when constant and nonzero) is smaller than the
3230 1.1 mrg difference between the data-refs' inits, this splits groups into
3231 1.1 mrg suitable sizes. */
3232 1.1 mrg if (tree_fits_shwi_p (DR_STEP (dra)))
3233 1.1 mrg {
3234 1.1 mrg unsigned HOST_WIDE_INT step
3235 1.1 mrg = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3236 1.1 mrg if (step != 0
3237 1.1 mrg && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3238 1.1 mrg break;
3239 1.1 mrg }
3240 1.1 mrg }
3241 1.1 mrg
3242 1.1 mrg if (dump_enabled_p ())
3243 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
3244 1.1 mrg DR_IS_READ (dra)
3245 1.1 mrg ? "Detected interleaving load %T and %T\n"
3246 1.1 mrg : "Detected interleaving store %T and %T\n",
3247 1.1 mrg DR_REF (dra), DR_REF (drb));
3248 1.1 mrg
3249 1.1 mrg /* Link the found element into the group list. */
3250 1.1 mrg if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3251 1.1 mrg {
3252 1.1 mrg DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3253 1.1 mrg lastinfo = stmtinfo_a;
3254 1.1 mrg }
3255 1.1 mrg DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3256 1.1 mrg DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3257 1.1 mrg lastinfo = stmtinfo_b;
3258 1.1 mrg
3259 1.1 mrg if (! STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3260 1.1 mrg {
3261 1.1 mrg STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3262 1.1 mrg = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3263 1.1 mrg
3264 1.1 mrg if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3265 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
3266 1.1 mrg "Load suitable for SLP vectorization only.\n");
3267 1.1 mrg }
3268 1.1 mrg
3269 1.1 mrg if (init_b == init_prev
3270 1.1 mrg && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3271 1.1 mrg && dump_enabled_p ())
3272 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
3273 1.1 mrg "Queuing group with duplicate access for fixup\n");
3274 1.1 mrg }
3275 1.1 mrg }
3276 1.1 mrg
3277 1.1 mrg /* Fix up groups with duplicate entries by splitting them. */
3278 1.1 mrg while (1)
3279 1.1 mrg {
3280 1.1 mrg hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3281 1.1 mrg if (!(it != to_fixup.end ()))
3282 1.1 mrg break;
3283 1.1 mrg stmt_vec_info grp = *it;
3284 1.1 mrg to_fixup.remove (grp);
3285 1.1 mrg
3286 1.1 mrg /* Find the earliest duplicate group member. */
3287 1.1 mrg unsigned first_duplicate = -1u;
3288 1.1 mrg stmt_vec_info next, g = grp;
3289 1.1 mrg while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3290 1.1 mrg {
3291 1.1 mrg if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3292 1.1 mrg DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3293 1.1 mrg && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3294 1.1 mrg first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3295 1.1 mrg g = next;
3296 1.1 mrg }
3297 1.1 mrg if (first_duplicate == -1U)
3298 1.1 mrg continue;
3299 1.1 mrg
3300 1.1 mrg /* Then move all stmts after the first duplicate to a new group.
3301 1.1 mrg Note this is a heuristic but one with the property that *it
3302 1.1 mrg is fixed up completely. */
3303 1.1 mrg g = grp;
3304 1.1 mrg stmt_vec_info newgroup = NULL, ng = grp;
3305 1.1 mrg while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3306 1.1 mrg {
3307 1.1 mrg if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3308 1.1 mrg {
3309 1.1 mrg DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3310 1.1 mrg if (!newgroup)
3311 1.1 mrg {
3312 1.1 mrg newgroup = next;
3313 1.1 mrg STMT_VINFO_SLP_VECT_ONLY (newgroup)
3314 1.1 mrg = STMT_VINFO_SLP_VECT_ONLY (grp);
3315 1.1 mrg }
3316 1.1 mrg else
3317 1.1 mrg DR_GROUP_NEXT_ELEMENT (ng) = next;
3318 1.1 mrg ng = next;
3319 1.1 mrg DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3320 1.1 mrg }
3321 1.1 mrg else
3322 1.1 mrg g = DR_GROUP_NEXT_ELEMENT (g);
3323 1.1 mrg }
3324 1.1 mrg DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3325 1.1 mrg
3326 1.1 mrg /* Fixup the new group which still may contain duplicates. */
3327 1.1 mrg to_fixup.add (newgroup);
3328 1.1 mrg }
3329 1.1 mrg
3330 1.1 mrg dr_vec_info *dr_info;
3331 1.1 mrg FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3332 1.1 mrg {
3333 1.1 mrg if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3334 1.1 mrg && !vect_analyze_data_ref_access (vinfo, dr_info))
3335 1.1 mrg {
3336 1.1 mrg if (dump_enabled_p ())
3337 1.1 mrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3338 1.1 mrg "not vectorized: complicated access pattern.\n");
3339 1.1 mrg
3340 1.1 mrg if (is_a <bb_vec_info> (vinfo))
3341 1.1 mrg {
3342 1.1 mrg /* Mark the statement as not vectorizable. */
3343 1.1 mrg STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3344 1.1 mrg continue;
3345 1.1 mrg }
3346 1.1 mrg else
3347 1.1 mrg {
3348 1.1 mrg datarefs_copy.release ();
3349 1.1 mrg return opt_result::failure_at (dr_info->stmt->stmt,
3350 1.1 mrg "not vectorized:"
3351 1.1 mrg " complicated access pattern.\n");
3352 1.1 mrg }
3353 1.1 mrg }
3354 1.1 mrg }
3355 1.1 mrg
3356 1.1 mrg datarefs_copy.release ();
3357 1.1 mrg return opt_result::success ();
3358 1.1 mrg }
3359 1.1 mrg
3360 1.1 mrg /* Function vect_vfa_segment_size.
3361 1.1 mrg
3362 1.1 mrg Input:
3363 1.1 mrg DR_INFO: The data reference.
3364 1.1 mrg LENGTH_FACTOR: segment length to consider.
3365 1.1 mrg
3366 1.1 mrg Return a value suitable for the dr_with_seg_len::seg_len field.
3367 1.1 mrg This is the "distance travelled" by the pointer from the first
3368 1.1 mrg iteration in the segment to the last. Note that it does not include
3369 1.1 mrg the size of the access; in effect it only describes the first byte. */
3370 1.1 mrg
3371 1.1 mrg static tree
3372 1.1 mrg vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3373 1.1 mrg {
3374 1.1 mrg length_factor = size_binop (MINUS_EXPR,
3375 1.1 mrg fold_convert (sizetype, length_factor),
3376 1.1 mrg size_one_node);
3377 1.1 mrg return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3378 1.1 mrg length_factor);
3379 1.1 mrg }
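
/* Illustrative sketch (not part of the original source): for a data-ref
   with DR_STEP 4 (bytes) and LENGTH_FACTOR 8 (e.g. a vectorization factor
   of 8), the segment length computed above is 4 * (8 - 1) = 28 bytes: the
   distance the pointer travels from the first to the last iteration of the
   segment, excluding the size of the access itself.  */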
3380 1.1 mrg
3381 1.1 mrg /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3382 1.1 mrg gives the worst-case number of bytes covered by the segment. */
3383 1.1 mrg
3384 1.1 mrg static unsigned HOST_WIDE_INT
3385 1.1 mrg vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3386 1.1 mrg {
3387 1.1 mrg stmt_vec_info stmt_vinfo = dr_info->stmt;
3388 1.1 mrg tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3389 1.1 mrg unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3390 1.1 mrg unsigned HOST_WIDE_INT access_size = ref_size;
3391 1.1 mrg if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3392 1.1 mrg {
3393 1.1 mrg gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3394 1.1 mrg access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3395 1.1 mrg }
3396 1.1 mrg tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3397 1.1 mrg int misalignment;
3398 1.1 mrg if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3399 1.1 mrg && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3400 1.1 mrg && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3401 1.1 mrg == dr_explicit_realign_optimized))
3402 1.1 mrg {
3403 1.1 mrg /* We might access a full vector's worth. */
3404 1.1 mrg access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3405 1.1 mrg }
3406 1.1 mrg return access_size;
3407 1.1 mrg }
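
/* Illustrative sketch (not part of the original source): for the first
   statement of a group of 4-byte loads with DR_GROUP_SIZE 4 and
   DR_GROUP_GAP 1, the access size computed above is 4 * (4 - 1) = 12 bytes.
   If the access would use dr_explicit_realign_optimized, a full vector's
   worth is added on top (minus the scalar reference size).  */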
3408 1.1 mrg
3409 1.1 mrg /* Get the minimum alignment for all the scalar accesses that DR_INFO
3410 1.1 mrg describes. */
3411 1.1 mrg
3412 1.1 mrg static unsigned int
3413 1.1 mrg vect_vfa_align (dr_vec_info *dr_info)
3414 1.1 mrg {
3415 1.1 mrg return dr_alignment (dr_info->dr);
3416 1.1 mrg }
3417 1.1 mrg
3418 1.1 mrg /* Function vect_no_alias_p.
3419 1.1 mrg
3420 1.1 mrg Given data references A and B with equal base and offset, see whether
3421 1.1 mrg the alias relation can be decided at compilation time. Return 1 if
3422 1.1 mrg it can and the references alias, 0 if it can and the references do
3423 1.1 mrg not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3424 1.1 mrg SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3425 1.1 mrg of dr_with_seg_len::{seg_len,access_size} for A and B. */
3426 1.1 mrg
3427 1.1 mrg static int
3428 1.1 mrg vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3429 1.1 mrg tree segment_length_a, tree segment_length_b,
3430 1.1 mrg unsigned HOST_WIDE_INT access_size_a,
3431 1.1 mrg unsigned HOST_WIDE_INT access_size_b)
3432 1.1 mrg {
3433 1.1 mrg poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3434 1.1 mrg poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3435 1.1 mrg poly_uint64 const_length_a;
3436 1.1 mrg poly_uint64 const_length_b;
3437 1.1 mrg
3438 1.1 mrg /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3439 1.1 mrg bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3440 1.1 mrg [a, a+12) */
3441 1.1 mrg if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3442 1.1 mrg {
3443 1.1 mrg const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3444 1.1 mrg offset_a -= const_length_a;
3445 1.1 mrg }
3446 1.1 mrg else
3447 1.1 mrg const_length_a = tree_to_poly_uint64 (segment_length_a);
3448 1.1 mrg if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3449 1.1 mrg {
3450 1.1 mrg const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3451 1.1 mrg offset_b -= const_length_b;
3452 1.1 mrg }
3453 1.1 mrg else
3454 1.1 mrg const_length_b = tree_to_poly_uint64 (segment_length_b);
3455 1.1 mrg
3456 1.1 mrg const_length_a += access_size_a;
3457 1.1 mrg const_length_b += access_size_b;
3458 1.1 mrg
3459 1.1 mrg if (ranges_known_overlap_p (offset_a, const_length_a,
3460 1.1 mrg offset_b, const_length_b))
3461 1.1 mrg return 1;
3462 1.1 mrg
3463 1.1 mrg if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3464 1.1 mrg offset_b, const_length_b))
3465 1.1 mrg return 0;
3466 1.1 mrg
3467 1.1 mrg return -1;
3468 1.1 mrg }
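
/* Illustrative sketch (not part of the original source): with equal base
   and offset, positive steps, DR_INITs 0 and 32, segment lengths 16 and 16
   and access sizes 4 and 4, the ranges checked above are [0, 20) and
   [32, 52), which can never overlap, so the function returns 0 (no alias).
   With DR_INITs 0 and 8 instead, the ranges [0, 20) and [8, 28) are known
   to overlap and the function returns 1.  */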
3469 1.1 mrg
3470 1.1 mrg /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3471 1.1 mrg in DDR is >= VF. */
3472 1.1 mrg
3473 1.1 mrg static bool
3474 1.1 mrg dependence_distance_ge_vf (data_dependence_relation *ddr,
3475 1.1 mrg unsigned int loop_depth, poly_uint64 vf)
3476 1.1 mrg {
3477 1.1 mrg if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3478 1.1 mrg || DDR_NUM_DIST_VECTS (ddr) == 0)
3479 1.1 mrg return false;
3480 1.1 mrg
3481 1.1 mrg /* If the dependence is exact, we should have limited the VF instead. */
3482 1.1 mrg gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3483 1.1 mrg
3484 1.1 mrg unsigned int i;
3485 1.1 mrg lambda_vector dist_v;
3486 1.1 mrg FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3487 1.1 mrg {
3488 1.1 mrg HOST_WIDE_INT dist = dist_v[loop_depth];
3489 1.1 mrg if (dist != 0
3490 1.1 mrg && !(dist > 0 && DDR_REVERSED_P (ddr))
3491 1.1 mrg && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3492 1.1 mrg return false;
3493 1.1 mrg }
3494 1.1 mrg
3495 1.1 mrg if (dump_enabled_p ())
3496 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
3497 1.1 mrg "dependence distance between %T and %T is >= VF\n",
3498 1.1 mrg DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3499 1.1 mrg
3500 1.1 mrg return true;
3501 1.1 mrg }
3502 1.1 mrg
3503 1.1 mrg /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3504 1.1 mrg
3505 1.1 mrg static void
3506 1.1 mrg dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3507 1.1 mrg {
3508 1.1 mrg dump_printf (dump_kind, "%s (%T) >= ",
3509 1.1 mrg lower_bound.unsigned_p ? "unsigned" : "abs",
3510 1.1 mrg lower_bound.expr);
3511 1.1 mrg dump_dec (dump_kind, lower_bound.min_value);
3512 1.1 mrg }
3513 1.1 mrg
3514 1.1 mrg /* Record that the vectorized loop requires the vec_lower_bound described
3515 1.1 mrg by EXPR, UNSIGNED_P and MIN_VALUE. */
3516 1.1 mrg
3517 1.1 mrg static void
3518 1.1 mrg vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3519 1.1 mrg poly_uint64 min_value)
3520 1.1 mrg {
3521 1.1 mrg vec<vec_lower_bound> &lower_bounds
3522 1.1 mrg = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3523 1.1 mrg for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3524 1.1 mrg if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3525 1.1 mrg {
3526 1.1 mrg unsigned_p &= lower_bounds[i].unsigned_p;
3527 1.1 mrg min_value = upper_bound (lower_bounds[i].min_value, min_value);
3528 1.1 mrg if (lower_bounds[i].unsigned_p != unsigned_p
3529 1.1 mrg || maybe_lt (lower_bounds[i].min_value, min_value))
3530 1.1 mrg {
3531 1.1 mrg lower_bounds[i].unsigned_p = unsigned_p;
3532 1.1 mrg lower_bounds[i].min_value = min_value;
3533 1.1 mrg if (dump_enabled_p ())
3534 1.1 mrg {
3535 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
3536 1.1 mrg "updating run-time check to ");
3537 1.1 mrg dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3538 1.1 mrg dump_printf (MSG_NOTE, "\n");
3539 1.1 mrg }
3540 1.1 mrg }
3541 1.1 mrg return;
3542 1.1 mrg }
3543 1.1 mrg
3544 1.1 mrg vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3545 1.1 mrg if (dump_enabled_p ())
3546 1.1 mrg {
3547 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3548 1.1 mrg dump_lower_bound (MSG_NOTE, lower_bound);
3549 1.1 mrg dump_printf (MSG_NOTE, "\n");
3550 1.1 mrg }
3551 1.1 mrg LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3552 1.1 mrg }
3553 1.1 mrg
3554 1.1 mrg /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3555 1.1 mrg will span fewer than GAP bytes. */
3556 1.1 mrg
3557 1.1 mrg static bool
3558 1.1 mrg vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3559 1.1 mrg poly_int64 gap)
3560 1.1 mrg {
3561 1.1 mrg stmt_vec_info stmt_info = dr_info->stmt;
3562 1.1 mrg HOST_WIDE_INT count
3563 1.1 mrg = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3564 1.1 mrg if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3565 1.1 mrg count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3566 1.1 mrg return (estimated_poly_value (gap)
3567 1.1 mrg <= count * vect_get_scalar_dr_size (dr_info));
3568 1.1 mrg }
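
/* Illustrative sketch (not part of the original source): with an estimated
   vectorization factor of 4, a group of size 2 and a scalar access size of
   4 bytes, COUNT * size above is 4 * 2 * 4 = 32, so a GAP of up to 32 bytes
   counts as "small", which lets vect_prune_runtime_alias_test_list below
   prefer a cheap step lower-bound check over a full runtime alias check.  */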
3569 1.1 mrg
3570 1.1 mrg /* Return true if we know that there is no alias between DR_INFO_A and
3571 1.1 mrg DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3572 1.1 mrg When returning true, set *LOWER_BOUND_OUT to this N. */
3573 1.1 mrg
3574 1.1 mrg static bool
3575 1.1 mrg vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3576 1.1 mrg poly_uint64 *lower_bound_out)
3577 1.1 mrg {
3578 1.1 mrg /* Check that there is a constant gap of known sign between DR_A
3579 1.1 mrg and DR_B. */
3580 1.1 mrg data_reference *dr_a = dr_info_a->dr;
3581 1.1 mrg data_reference *dr_b = dr_info_b->dr;
3582 1.1 mrg poly_int64 init_a, init_b;
3583 1.1 mrg if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3584 1.1 mrg || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3585 1.1 mrg || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3586 1.1 mrg || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3587 1.1 mrg || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3588 1.1 mrg || !ordered_p (init_a, init_b))
3589 1.1 mrg return false;
3590 1.1 mrg
3591 1.1 mrg /* Sort DR_A and DR_B by the address they access. */
3592 1.1 mrg if (maybe_lt (init_b, init_a))
3593 1.1 mrg {
3594 1.1 mrg std::swap (init_a, init_b);
3595 1.1 mrg std::swap (dr_info_a, dr_info_b);
3596 1.1 mrg std::swap (dr_a, dr_b);
3597 1.1 mrg }
3598 1.1 mrg
3599 1.1 mrg /* If the two accesses could be dependent within a scalar iteration,
3600 1.1 mrg make sure that we'd retain their order. */
3601 1.1 mrg if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3602 1.1 mrg && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3603 1.1 mrg return false;
3604 1.1 mrg
3605 1.1 mrg /* There is no alias if abs (DR_STEP) is greater than or equal to
3606 1.1 mrg the bytes spanned by the combination of the two accesses. */
3607 1.1 mrg *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3608 1.1 mrg return true;
3609 1.1 mrg }
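
/* Illustrative sketch (not part of the original source): for 4-byte
   accesses a[i] and a[i + 3] with identical DR_STEP, the gap between the
   DR_INITs is 12 bytes, so *LOWER_BOUND_OUT becomes 12 + 4 = 16: the two
   references cannot alias in the vector loop if abs (DR_STEP) >= 16.  */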
3610 1.1 mrg
3611 1.1 mrg /* Function vect_prune_runtime_alias_test_list.
3612 1.1 mrg
3613 1.1 mrg Prune a list of ddrs to be tested at run-time by versioning for alias.
3614 1.1 mrg Merge several alias checks into one if possible.
3615 1.1 mrg Return FALSE if the resulting list of ddrs is longer than allowed by
3616 1.1 mrg PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3617 1.1 mrg
3618 1.1 mrg opt_result
3619 1.1 mrg vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3620 1.1 mrg {
3621 1.1 mrg typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3622 1.1 mrg hash_set <tree_pair_hash> compared_objects;
3623 1.1 mrg
3624 1.1 mrg const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3625 1.1 mrg vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3626 1.1 mrg = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3627 1.1 mrg const vec<vec_object_pair> &check_unequal_addrs
3628 1.1 mrg = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3629 1.1 mrg poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3630 1.1 mrg tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3631 1.1 mrg
3632 1.1 mrg ddr_p ddr;
3633 1.1 mrg unsigned int i;
3634 1.1 mrg tree length_factor;
3635 1.1 mrg
3636 1.1 mrg DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3637 1.1 mrg
3638 1.1 mrg /* Step values are irrelevant for aliasing if the number of vector
3639 1.1 mrg iterations is equal to the number of scalar iterations (which can
3640 1.1 mrg happen for fully-SLP loops). */
3641 1.1 mrg bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3642 1.1 mrg
3643 1.1 mrg if (!vf_one_p)
3644 1.1 mrg {
3645 1.1 mrg /* Convert the checks for nonzero steps into bound tests. */
3646 1.1 mrg tree value;
3647 1.1 mrg FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3648 1.1 mrg vect_check_lower_bound (loop_vinfo, value, true, 1);
3649 1.1 mrg }
3650 1.1 mrg
3651 1.1 mrg if (may_alias_ddrs.is_empty ())
3652 1.1 mrg return opt_result::success ();
3653 1.1 mrg
3654 1.1 mrg comp_alias_ddrs.create (may_alias_ddrs.length ());
3655 1.1 mrg
3656 1.1 mrg unsigned int loop_depth
3657 1.1 mrg = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3658 1.1 mrg LOOP_VINFO_LOOP_NEST (loop_vinfo));
3659 1.1 mrg
3660 1.1 mrg /* First, we collect all data ref pairs for aliasing checks. */
3661 1.1 mrg FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3662 1.1 mrg {
3663 1.1 mrg poly_uint64 lower_bound;
3664 1.1 mrg tree segment_length_a, segment_length_b;
3665 1.1 mrg unsigned HOST_WIDE_INT access_size_a, access_size_b;
3666 1.1 mrg unsigned HOST_WIDE_INT align_a, align_b;
3667 1.1 mrg
3668 1.1 mrg /* Ignore the alias if the VF we chose ended up being no greater
3669 1.1 mrg than the dependence distance. */
3670 1.1 mrg if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3671 1.1 mrg continue;
3672 1.1 mrg
3673 1.1 mrg if (DDR_OBJECT_A (ddr))
3674 1.1 mrg {
3675 1.1 mrg vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3676 1.1 mrg if (!compared_objects.add (new_pair))
3677 1.1 mrg {
3678 1.1 mrg if (dump_enabled_p ())
3679 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
3680 1.1 mrg "checking that %T and %T"
3681 1.1 mrg " have different addresses\n",
3682 1.1 mrg new_pair.first, new_pair.second);
3683 1.1 mrg LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3684 1.1 mrg }
3685 1.1 mrg continue;
3686 1.1 mrg }
3687 1.1 mrg
3688 1.1 mrg dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3689 1.1 mrg stmt_vec_info stmt_info_a = dr_info_a->stmt;
3690 1.1 mrg
3691 1.1 mrg dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3692 1.1 mrg stmt_vec_info stmt_info_b = dr_info_b->stmt;
3693 1.1 mrg
3694 1.1 mrg bool preserves_scalar_order_p
3695 1.1 mrg = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3696 1.1 mrg bool ignore_step_p
3697 1.1 mrg = (vf_one_p
3698 1.1 mrg && (preserves_scalar_order_p
3699 1.1 mrg || operand_equal_p (DR_STEP (dr_info_a->dr),
3700 1.1 mrg DR_STEP (dr_info_b->dr))));
3701 1.1 mrg
3702 1.1 mrg /* Skip the pair if inter-iteration dependencies are irrelevant
3703 1.1 mrg and intra-iteration dependencies are guaranteed to be honored. */
3704 1.1 mrg if (ignore_step_p
3705 1.1 mrg && (preserves_scalar_order_p
3706 1.1 mrg || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3707 1.1 mrg &lower_bound)))
3708 1.1 mrg {
3709 1.1 mrg if (dump_enabled_p ())
3710 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
3711 1.1 mrg "no need for alias check between "
3712 1.1 mrg "%T and %T when VF is 1\n",
3713 1.1 mrg DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3714 1.1 mrg continue;
3715 1.1 mrg }
3716 1.1 mrg
3717 1.1 mrg /* See whether we can handle the alias using a bounds check on
3718 1.1 mrg the step, and whether that's likely to be the best approach.
3719 1.1 mrg (It might not be, for example, if the minimum step is much larger
3720 1.1 mrg than the number of bytes handled by one vector iteration.) */
3721 1.1 mrg if (!ignore_step_p
3722 1.1 mrg && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3723 1.1 mrg && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3724 1.1 mrg &lower_bound)
3725 1.1 mrg && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3726 1.1 mrg || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3727 1.1 mrg {
3728 1.1 mrg bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3729 1.1 mrg if (dump_enabled_p ())
3730 1.1 mrg {
3731 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3732 1.1 mrg "%T and %T when the step %T is outside ",
3733 1.1 mrg DR_REF (dr_info_a->dr),
3734 1.1 mrg DR_REF (dr_info_b->dr),
3735 1.1 mrg DR_STEP (dr_info_a->dr));
3736 1.1 mrg if (unsigned_p)
3737 1.1 mrg dump_printf (MSG_NOTE, "[0");
3738 1.1 mrg else
3739 1.1 mrg {
3740 1.1 mrg dump_printf (MSG_NOTE, "(");
3741 1.1 mrg dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3742 1.1 mrg }
3743 1.1 mrg dump_printf (MSG_NOTE, ", ");
3744 1.1 mrg dump_dec (MSG_NOTE, lower_bound);
3745 1.1 mrg dump_printf (MSG_NOTE, ")\n");
3746 1.1 mrg }
3747 1.1 mrg vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3748 1.1 mrg unsigned_p, lower_bound);
3749 1.1 mrg continue;
3750 1.1 mrg }
3751 1.1 mrg
3752 1.1 mrg stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3753 1.1 mrg if (dr_group_first_a)
3754 1.1 mrg {
3755 1.1 mrg stmt_info_a = dr_group_first_a;
3756 1.1 mrg dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3757 1.1 mrg }
3758 1.1 mrg
3759 1.1 mrg stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3760 1.1 mrg if (dr_group_first_b)
3761 1.1 mrg {
3762 1.1 mrg stmt_info_b = dr_group_first_b;
3763 1.1 mrg dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3764 1.1 mrg }
3765 1.1 mrg
3766 1.1 mrg if (ignore_step_p)
3767 1.1 mrg {
3768 1.1 mrg segment_length_a = size_zero_node;
3769 1.1 mrg segment_length_b = size_zero_node;
3770 1.1 mrg }
3771 1.1 mrg else
3772 1.1 mrg {
3773 1.1 mrg if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3774 1.1 mrg DR_STEP (dr_info_b->dr), 0))
3775 1.1 mrg length_factor = scalar_loop_iters;
3776 1.1 mrg else
3777 1.1 mrg length_factor = size_int (vect_factor);
3778 1.1 mrg segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3779 1.1 mrg segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3780 1.1 mrg }
3781 1.1 mrg access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3782 1.1 mrg access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3783 1.1 mrg align_a = vect_vfa_align (dr_info_a);
3784 1.1 mrg align_b = vect_vfa_align (dr_info_b);
3785 1.1 mrg
3786 1.1 mrg /* See whether the alias is known at compilation time. */
3787 1.1 mrg if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3788 1.1 mrg DR_BASE_ADDRESS (dr_info_b->dr), 0)
3789 1.1 mrg && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3790 1.1 mrg DR_OFFSET (dr_info_b->dr), 0)
3791 1.1 mrg && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3792 1.1 mrg && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3793 1.1 mrg && poly_int_tree_p (segment_length_a)
3794 1.1 mrg && poly_int_tree_p (segment_length_b))
3795 1.1 mrg {
3796 1.1 mrg int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3797 1.1 mrg segment_length_a,
3798 1.1 mrg segment_length_b,
3799 1.1 mrg access_size_a,
3800 1.1 mrg access_size_b);
3801 1.1 mrg if (res >= 0 && dump_enabled_p ())
3802 1.1 mrg {
3803 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
3804 1.1 mrg "can tell at compile time that %T and %T",
3805 1.1 mrg DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3806 1.1 mrg if (res == 0)
3807 1.1 mrg dump_printf (MSG_NOTE, " do not alias\n");
3808 1.1 mrg else
3809 1.1 mrg dump_printf (MSG_NOTE, " alias\n");
3810 1.1 mrg }
3811 1.1 mrg
3812 1.1 mrg if (res == 0)
3813 1.1 mrg continue;
3814 1.1 mrg
3815 1.1 mrg if (res == 1)
3816 1.1 mrg return opt_result::failure_at (stmt_info_b->stmt,
3817 1.1 mrg "not vectorized:"
3818 1.1 mrg " compilation time alias: %G%G",
3819 1.1 mrg stmt_info_a->stmt,
3820 1.1 mrg stmt_info_b->stmt);
3821 1.1 mrg }
3822 1.1 mrg
3823 1.1 mrg /* dr_with_seg_len requires the alignment to apply to the segment length
3824 1.1 mrg and access size, not just the start address. The access size can be
3825 1.1 mrg smaller than the pointer alignment for grouped accesses and bitfield
3826 1.1 mrg references; see PR115192 and PR116125 respectively. */
3827 1.1 mrg align_a = std::min (align_a, least_bit_hwi (access_size_a));
3828 1.1 mrg align_b = std::min (align_b, least_bit_hwi (access_size_b));
3829 1.1 mrg
3830 1.1 mrg dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3831 1.1 mrg access_size_a, align_a);
3832 1.1 mrg dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3833 1.1 mrg access_size_b, align_b);
3834 1.1 mrg /* Canonicalize the order to be the one that's needed for accurate
3835 1.1 mrg RAW, WAR and WAW flags, in cases where the data references are
3836 1.1 mrg well-ordered. The order doesn't really matter otherwise,
3837 1.1 mrg but we might as well be consistent. */
3838 1.1 mrg if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3839 1.1 mrg std::swap (dr_a, dr_b);
3840 1.1 mrg
3841 1.1 mrg dr_with_seg_len_pair_t dr_with_seg_len_pair
3842 1.1 mrg (dr_a, dr_b, (preserves_scalar_order_p
3843 1.1 mrg ? dr_with_seg_len_pair_t::WELL_ORDERED
3844 1.1 mrg : dr_with_seg_len_pair_t::REORDERED));
3845 1.1 mrg
3846 1.1 mrg comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3847 1.1 mrg }
3848 1.1 mrg
3849 1.1 mrg prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3850 1.1 mrg
3851 1.1 mrg unsigned int count = (comp_alias_ddrs.length ()
3852 1.1 mrg + check_unequal_addrs.length ());
3853 1.1 mrg
3854 1.1 mrg if (count
3855 1.1 mrg && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
3856 1.1 mrg == VECT_COST_MODEL_VERY_CHEAP))
3857 1.1 mrg return opt_result::failure_at
3858 1.1 mrg (vect_location, "would need a runtime alias check\n");
3859 1.1 mrg
3860 1.1 mrg if (dump_enabled_p ())
3861 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
3862 1.1 mrg "improved number of alias checks from %d to %d\n",
3863 1.1 mrg may_alias_ddrs.length (), count);
3864 1.1 mrg unsigned limit = param_vect_max_version_for_alias_checks;
3865 1.1 mrg if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
3866 1.1 mrg limit = param_vect_max_version_for_alias_checks * 6 / 10;
3867 1.1 mrg if (count > limit)
3868 1.1 mrg return opt_result::failure_at
3869 1.1 mrg (vect_location,
3870 1.1 mrg "number of versioning for alias run-time tests exceeds %d "
3871 1.1 mrg "(--param vect-max-version-for-alias-checks)\n", limit);
3872 1.1 mrg
3873 1.1 mrg return opt_result::success ();
3874 1.1 mrg }
3875 1.1 mrg
3876 1.1 mrg /* Check whether we can use an internal function for a gather load
3877 1.1 mrg or scatter store. READ_P is true for loads and false for stores.
3878 1.1 mrg MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3879 1.1 mrg the type of the memory elements being loaded or stored. OFFSET_TYPE
3880 1.1 mrg is the type of the offset that is being applied to the invariant
3881 1.1 mrg base address. SCALE is the amount by which the offset should
3882 1.1 mrg be multiplied *after* it has been converted to address width.
3883 1.1 mrg
3884 1.1 mrg Return true if the function is supported, storing the function id in
3885 1.1 mrg *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
3886 1.1 mrg
3887 1.1 mrg bool
3888 1.1 mrg vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3889 1.1 mrg tree vectype, tree memory_type, tree offset_type,
3890 1.1 mrg int scale, internal_fn *ifn_out,
3891 1.1 mrg tree *offset_vectype_out)
3892 1.1 mrg {
3893 1.1 mrg unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3894 1.1 mrg unsigned int element_bits = vector_element_bits (vectype);
3895 1.1 mrg if (element_bits != memory_bits)
3896 1.1 mrg /* For now the vector elements must be the same width as the
3897 1.1 mrg memory elements. */
3898 1.1 mrg return false;
3899 1.1 mrg
3900 1.1 mrg /* Work out which function we need. */
3901 1.1 mrg internal_fn ifn, alt_ifn;
3902 1.1 mrg if (read_p)
3903 1.1 mrg {
3904 1.1 mrg ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3905 1.1 mrg alt_ifn = IFN_MASK_GATHER_LOAD;
3906 1.1 mrg }
3907 1.1 mrg else
3908 1.1 mrg {
3909 1.1 mrg ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3910 1.1 mrg alt_ifn = IFN_MASK_SCATTER_STORE;
3911 1.1 mrg }
3912 1.1 mrg
3913 1.1 mrg for (;;)
3914 1.1 mrg {
3915 1.1 mrg tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3916 1.1 mrg if (!offset_vectype)
3917 1.1 mrg return false;
3918 1.1 mrg
3919 1.1 mrg /* Test whether the target supports this combination. */
3920 1.1 mrg if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3921 1.1 mrg offset_vectype, scale))
3922 1.1 mrg {
3923 1.1 mrg *ifn_out = ifn;
3924 1.1 mrg *offset_vectype_out = offset_vectype;
3925 1.1 mrg return true;
3926 1.1 mrg }
3927 1.1 mrg else if (!masked_p
3928 1.1 mrg && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
3929 1.1 mrg memory_type,
3930 1.1 mrg offset_vectype,
3931 1.1 mrg scale))
3932 1.1 mrg {
3933 1.1 mrg *ifn_out = alt_ifn;
3934 1.1 mrg *offset_vectype_out = offset_vectype;
3935 1.1 mrg return true;
3936 1.1 mrg }
3937 1.1 mrg
3938 1.1 mrg if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3939 1.1 mrg && TYPE_PRECISION (offset_type) >= element_bits)
3940 1.1 mrg return false;
3941 1.1 mrg
3942 1.1 mrg offset_type = build_nonstandard_integer_type
3943 1.1 mrg (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3944 1.1 mrg }
3945 1.1 mrg }
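
/* Illustrative sketch (not part of the original source): when asked for an
   unmasked gather of 32-bit elements with a 32-bit offset type that the
   target does not support, the loop above first tries the masked variant of
   the internal function as a fallback and then retries with successively
   wider offset types (here 64 bits), until either a supported combination
   is found or the offset type is already at least pointer-sized and as wide
   as the elements, in which case the function fails.  */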
3946 1.1 mrg
3947 1.1 mrg /* STMT_INFO is a call to an internal gather load or scatter store function.
3948 1.1 mrg Describe the operation in INFO. */
3949 1.1 mrg
3950 1.1 mrg static void
3951 1.1 mrg vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3952 1.1 mrg gather_scatter_info *info)
3953 1.1 mrg {
3954 1.1 mrg gcall *call = as_a <gcall *> (stmt_info->stmt);
3955 1.1 mrg tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3956 1.1 mrg data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3957 1.1 mrg
3958 1.1 mrg info->ifn = gimple_call_internal_fn (call);
3959 1.1 mrg info->decl = NULL_TREE;
3960 1.1 mrg info->base = gimple_call_arg (call, 0);
3961 1.1 mrg info->offset = gimple_call_arg (call, 1);
3962 1.1 mrg info->offset_dt = vect_unknown_def_type;
3963 1.1 mrg info->offset_vectype = NULL_TREE;
3964 1.1 mrg info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3965 1.1 mrg info->element_type = TREE_TYPE (vectype);
3966 1.1 mrg info->memory_type = TREE_TYPE (DR_REF (dr));
3967 1.1 mrg }
3968 1.1 mrg
3969 1.1 mrg /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3970 1.1 mrg gather load or scatter store. Describe the operation in *INFO if so. */
3971 1.1 mrg
3972 1.1 mrg bool
3973 1.1 mrg vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3974 1.1 mrg gather_scatter_info *info)
3975 1.1 mrg {
3976 1.1 mrg HOST_WIDE_INT scale = 1;
3977 1.1 mrg poly_int64 pbitpos, pbitsize;
3978 1.1 mrg class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3979 1.1 mrg struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3980 1.1 mrg tree offtype = NULL_TREE;
3981 1.1 mrg tree decl = NULL_TREE, base, off;
3982 1.1 mrg tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3983 1.1 mrg tree memory_type = TREE_TYPE (DR_REF (dr));
3984 1.1 mrg machine_mode pmode;
3985 1.1 mrg int punsignedp, reversep, pvolatilep = 0;
3986 1.1 mrg internal_fn ifn;
3987 1.1 mrg tree offset_vectype;
3988 1.1 mrg bool masked_p = false;
3989 1.1 mrg
3990 1.1 mrg /* See whether this is already a call to a gather/scatter internal function.
3991 1.1 mrg If not, see whether it's a masked load or store. */
3992 1.1 mrg gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3993 1.1 mrg if (call && gimple_call_internal_p (call))
3994 1.1 mrg {
3995 1.1 mrg ifn = gimple_call_internal_fn (call);
3996 1.1 mrg if (internal_gather_scatter_fn_p (ifn))
3997 1.1 mrg {
3998 1.1 mrg vect_describe_gather_scatter_call (stmt_info, info);
3999 1.1 mrg return true;
4000 1.1 mrg }
4001 1.1 mrg masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
4002 1.1 mrg }
4003 1.1 mrg
4004 1.1 mrg /* True if we should aim to use internal functions rather than
4005 1.1 mrg built-in functions. */
4006 1.1 mrg bool use_ifn_p = (DR_IS_READ (dr)
4007 1.1 mrg ? supports_vec_gather_load_p (TYPE_MODE (vectype))
4008 1.1 mrg : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
4009 1.1 mrg
4010 1.1 mrg base = DR_REF (dr);
4011 1.1 mrg /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
4012 1.1 mrg see if we can use the def stmt of the address. */
4013 1.1 mrg if (masked_p
4014 1.1 mrg && TREE_CODE (base) == MEM_REF
4015 1.1 mrg && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
4016 1.1 mrg && integer_zerop (TREE_OPERAND (base, 1))
4017 1.1 mrg && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
4018 1.1 mrg {
4019 1.1 mrg gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
4020 1.1 mrg if (is_gimple_assign (def_stmt)
4021 1.1 mrg && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
4022 1.1 mrg base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4023 1.1 mrg }
4024 1.1 mrg
4025 1.1 mrg /* The gather and scatter builtins need address of the form
4026 1.1 mrg loop_invariant + vector * {1, 2, 4, 8}
4027 1.1 mrg or
4028 1.1 mrg loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4029 1.1 mrg Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4030 1.1 mrg of loop invariants/SSA_NAMEs defined in the loop, with casts,
4031 1.1 mrg multiplications and additions in it. To get a vector, we need
4032 1.1 mrg a single SSA_NAME that will be defined in the loop and will
4033 1.1 mrg contain everything that is not loop invariant and that can be
4034 1.1 mrg      vectorized.  The following code attempts to find such a preexisting
4035 1.1 mrg SSA_NAME OFF and put the loop invariants into a tree BASE
4036 1.1 mrg that can be gimplified before the loop. */
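  /* For instance (a sketch), if the reference leaves us with
       OFF = (sizetype) i_10 * 4 + 16
     where i_10 is defined inside the loop, the walk below moves the
     invariant "+ 16" into BASE, records the multiplication as SCALE == 4
     (on the IFN path only if the target supports that scale for some
     offset type) and continues with OFF = (sizetype) i_10, possibly
     looking through the widening conversion as well.  */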
4037 1.1 mrg base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4038 1.1 mrg &punsignedp, &reversep, &pvolatilep);
4039 1.1 mrg if (reversep)
4040 1.1 mrg return false;
4041 1.1 mrg
4042 1.1 mrg poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4043 1.1 mrg
4044 1.1 mrg if (TREE_CODE (base) == MEM_REF)
4045 1.1 mrg {
4046 1.1 mrg if (!integer_zerop (TREE_OPERAND (base, 1)))
4047 1.1 mrg {
4048 1.1 mrg if (off == NULL_TREE)
4049 1.1 mrg off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4050 1.1 mrg else
4051 1.1 mrg off = size_binop (PLUS_EXPR, off,
4052 1.1 mrg fold_convert (sizetype, TREE_OPERAND (base, 1)));
4053 1.1 mrg }
4054 1.1 mrg base = TREE_OPERAND (base, 0);
4055 1.1 mrg }
4056 1.1 mrg else
4057 1.1 mrg base = build_fold_addr_expr (base);
4058 1.1 mrg
4059 1.1 mrg if (off == NULL_TREE)
4060 1.1 mrg off = size_zero_node;
4061 1.1 mrg
4062 1.1 mrg   /* If BASE is not loop invariant, then either OFF is 0 and we start with
4063 1.1 mrg      just the constant offset in the loop-invariant BASE and continue with
4064 1.1 mrg      BASE as OFF, or we give up.
4065 1.1 mrg      We could handle the latter case by gimplifying the addition of
4066 1.1 mrg      BASE + OFF into some SSA_NAME and using that as OFF, but for now punt.  */
4067 1.1 mrg if (!expr_invariant_in_loop_p (loop, base))
4068 1.1 mrg {
4069 1.1 mrg if (!integer_zerop (off))
4070 1.1 mrg return false;
4071 1.1 mrg off = base;
4072 1.1 mrg base = size_int (pbytepos);
4073 1.1 mrg }
4074 1.1 mrg /* Otherwise put base + constant offset into the loop invariant BASE
4075 1.1 mrg and continue with OFF. */
4076 1.1 mrg else
4077 1.1 mrg {
4078 1.1 mrg base = fold_convert (sizetype, base);
4079 1.1 mrg base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4080 1.1 mrg }
4081 1.1 mrg
4082 1.1 mrg /* OFF at this point may be either a SSA_NAME or some tree expression
4083 1.1 mrg from get_inner_reference. Try to peel off loop invariants from it
4084 1.1 mrg into BASE as long as possible. */
4085 1.1 mrg STRIP_NOPS (off);
4086 1.1 mrg while (offtype == NULL_TREE)
4087 1.1 mrg {
4088 1.1 mrg enum tree_code code;
4089 1.1 mrg tree op0, op1, add = NULL_TREE;
4090 1.1 mrg
4091 1.1 mrg if (TREE_CODE (off) == SSA_NAME)
4092 1.1 mrg {
4093 1.1 mrg gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4094 1.1 mrg
4095 1.1 mrg if (expr_invariant_in_loop_p (loop, off))
4096 1.1 mrg return false;
4097 1.1 mrg
4098 1.1 mrg if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4099 1.1 mrg break;
4100 1.1 mrg
4101 1.1 mrg op0 = gimple_assign_rhs1 (def_stmt);
4102 1.1 mrg code = gimple_assign_rhs_code (def_stmt);
4103 1.1 mrg op1 = gimple_assign_rhs2 (def_stmt);
4104 1.1 mrg }
4105 1.1 mrg else
4106 1.1 mrg {
4107 1.1 mrg if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4108 1.1 mrg return false;
4109 1.1 mrg code = TREE_CODE (off);
4110 1.1 mrg extract_ops_from_tree (off, &code, &op0, &op1);
4111 1.1 mrg }
4112 1.1 mrg switch (code)
4113 1.1 mrg {
4114 1.1 mrg case POINTER_PLUS_EXPR:
4115 1.1 mrg case PLUS_EXPR:
4116 1.1 mrg if (expr_invariant_in_loop_p (loop, op0))
4117 1.1 mrg {
4118 1.1 mrg add = op0;
4119 1.1 mrg off = op1;
4120 1.1 mrg do_add:
4121 1.1 mrg add = fold_convert (sizetype, add);
4122 1.1 mrg if (scale != 1)
4123 1.1 mrg add = size_binop (MULT_EXPR, add, size_int (scale));
4124 1.1 mrg base = size_binop (PLUS_EXPR, base, add);
4125 1.1 mrg continue;
4126 1.1 mrg }
4127 1.1 mrg if (expr_invariant_in_loop_p (loop, op1))
4128 1.1 mrg {
4129 1.1 mrg add = op1;
4130 1.1 mrg off = op0;
4131 1.1 mrg goto do_add;
4132 1.1 mrg }
4133 1.1 mrg break;
4134 1.1 mrg case MINUS_EXPR:
4135 1.1 mrg if (expr_invariant_in_loop_p (loop, op1))
4136 1.1 mrg {
4137 1.1 mrg add = fold_convert (sizetype, op1);
4138 1.1 mrg add = size_binop (MINUS_EXPR, size_zero_node, add);
4139 1.1 mrg off = op0;
4140 1.1 mrg goto do_add;
4141 1.1 mrg }
4142 1.1 mrg break;
4143 1.1 mrg case MULT_EXPR:
4144 1.1 mrg if (scale == 1 && tree_fits_shwi_p (op1))
4145 1.1 mrg {
4146 1.1 mrg int new_scale = tree_to_shwi (op1);
4147 1.1 mrg /* Only treat this as a scaling operation if the target
4148 1.1 mrg supports it for at least some offset type. */
4149 1.1 mrg if (use_ifn_p
4150 1.1 mrg && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4151 1.1 mrg masked_p, vectype, memory_type,
4152 1.1 mrg signed_char_type_node,
4153 1.1 mrg new_scale, &ifn,
4154 1.1 mrg &offset_vectype)
4155 1.1 mrg && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4156 1.1 mrg masked_p, vectype, memory_type,
4157 1.1 mrg unsigned_char_type_node,
4158 1.1 mrg new_scale, &ifn,
4159 1.1 mrg &offset_vectype))
4160 1.1 mrg break;
4161 1.1 mrg scale = new_scale;
4162 1.1 mrg off = op0;
4163 1.1 mrg continue;
4164 1.1 mrg }
4165 1.1 mrg break;
4166 1.1 mrg case SSA_NAME:
4167 1.1 mrg off = op0;
4168 1.1 mrg continue;
4169 1.1 mrg CASE_CONVERT:
4170 1.1 mrg if (!POINTER_TYPE_P (TREE_TYPE (op0))
4171 1.1 mrg && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4172 1.1 mrg break;
4173 1.1 mrg
4174 1.1 mrg /* Don't include the conversion if the target is happy with
4175 1.1 mrg the current offset type. */
4176 1.1 mrg if (use_ifn_p
4177 1.1 mrg && !POINTER_TYPE_P (TREE_TYPE (off))
4178 1.1 mrg && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4179 1.1 mrg masked_p, vectype, memory_type,
4180 1.1 mrg TREE_TYPE (off), scale, &ifn,
4181 1.1 mrg &offset_vectype))
4182 1.1 mrg break;
4183 1.1 mrg
4184 1.1 mrg if (TYPE_PRECISION (TREE_TYPE (op0))
4185 1.1 mrg == TYPE_PRECISION (TREE_TYPE (off)))
4186 1.1 mrg {
4187 1.1 mrg off = op0;
4188 1.1 mrg continue;
4189 1.1 mrg }
4190 1.1 mrg
4191 1.1 mrg 	  /* Include the conversion if it is widening and we're using
4192 1.1 mrg 	     the IFN path, or the target can handle the converted-from
4193 1.1 mrg 	     offset, or the current offset size is not already the same
4194 1.1 mrg 	     as the data vector element size.  */
4195 1.1 mrg if ((TYPE_PRECISION (TREE_TYPE (op0))
4196 1.1 mrg < TYPE_PRECISION (TREE_TYPE (off)))
4197 1.1 mrg && (use_ifn_p
4198 1.1 mrg || (DR_IS_READ (dr)
4199 1.1 mrg ? (targetm.vectorize.builtin_gather
4200 1.1 mrg && targetm.vectorize.builtin_gather (vectype,
4201 1.1 mrg TREE_TYPE (op0),
4202 1.1 mrg scale))
4203 1.1 mrg : (targetm.vectorize.builtin_scatter
4204 1.1 mrg && targetm.vectorize.builtin_scatter (vectype,
4205 1.1 mrg TREE_TYPE (op0),
4206 1.1 mrg scale)))
4207 1.1 mrg || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4208 1.1 mrg TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4209 1.1 mrg {
4210 1.1 mrg off = op0;
4211 1.1 mrg offtype = TREE_TYPE (off);
4212 1.1 mrg STRIP_NOPS (off);
4213 1.1 mrg continue;
4214 1.1 mrg }
4215 1.1 mrg break;
4216 1.1 mrg default:
4217 1.1 mrg break;
4218 1.1 mrg }
4219 1.1 mrg break;
4220 1.1 mrg }
4221 1.1 mrg
4222 1.1 mrg /* If at the end OFF still isn't a SSA_NAME or isn't
4223 1.1 mrg defined in the loop, punt. */
4224 1.1 mrg if (TREE_CODE (off) != SSA_NAME
4225 1.1 mrg || expr_invariant_in_loop_p (loop, off))
4226 1.1 mrg return false;
4227 1.1 mrg
4228 1.1 mrg if (offtype == NULL_TREE)
4229 1.1 mrg offtype = TREE_TYPE (off);
4230 1.1 mrg
4231 1.1 mrg if (use_ifn_p)
4232 1.1 mrg {
4233 1.1 mrg if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4234 1.1 mrg vectype, memory_type, offtype, scale,
4235 1.1 mrg &ifn, &offset_vectype))
4236 1.1 mrg ifn = IFN_LAST;
4237 1.1 mrg decl = NULL_TREE;
4238 1.1 mrg }
4239 1.1 mrg else
4240 1.1 mrg {
4241 1.1 mrg if (DR_IS_READ (dr))
4242 1.1 mrg {
4243 1.1 mrg if (targetm.vectorize.builtin_gather)
4244 1.1 mrg decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4245 1.1 mrg }
4246 1.1 mrg else
4247 1.1 mrg {
4248 1.1 mrg if (targetm.vectorize.builtin_scatter)
4249 1.1 mrg decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4250 1.1 mrg }
4251 1.1 mrg ifn = IFN_LAST;
4252 1.1 mrg /* The offset vector type will be read from DECL when needed. */
4253 1.1 mrg offset_vectype = NULL_TREE;
4254 1.1 mrg }
4255 1.1 mrg
4256 1.1 mrg info->ifn = ifn;
4257 1.1 mrg info->decl = decl;
4258 1.1 mrg info->base = base;
4259 1.1 mrg info->offset = off;
4260 1.1 mrg info->offset_dt = vect_unknown_def_type;
4261 1.1 mrg info->offset_vectype = offset_vectype;
4262 1.1 mrg info->scale = scale;
4263 1.1 mrg info->element_type = TREE_TYPE (vectype);
4264 1.1 mrg info->memory_type = memory_type;
4265 1.1 mrg return true;
4266 1.1 mrg }
4267 1.1 mrg
4268 1.1 mrg /* Find the data references in STMT, analyze them with respect to LOOP and
4269 1.1 mrg append them to DATAREFS. Return false if datarefs in this stmt cannot
4270 1.1 mrg be handled. */
4271 1.1 mrg
4272 1.1 mrg opt_result
4273 1.1 mrg vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4274 1.1 mrg vec<data_reference_p> *datarefs,
4275 1.1 mrg vec<int> *dataref_groups, int group_id)
4276 1.1 mrg {
4277 1.1 mrg /* We can ignore clobbers for dataref analysis - they are removed during
4278 1.1 mrg loop vectorization and BB vectorization checks dependences with a
4279 1.1 mrg stmt walk. */
4280 1.1 mrg if (gimple_clobber_p (stmt))
4281 1.1 mrg return opt_result::success ();
4282 1.1 mrg
4283 1.1 mrg if (gimple_has_volatile_ops (stmt))
4284 1.1 mrg return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4285 1.1 mrg stmt);
4286 1.1 mrg
4287 1.1 mrg if (stmt_can_throw_internal (cfun, stmt))
4288 1.1 mrg return opt_result::failure_at (stmt,
4289 1.1 mrg "not vectorized:"
4290 1.1 mrg " statement can throw an exception: %G",
4291 1.1 mrg stmt);
4292 1.1 mrg
4293 1.1 mrg auto_vec<data_reference_p, 2> refs;
4294 1.1 mrg opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4295 1.1 mrg if (!res)
4296 1.1 mrg return res;
4297 1.1 mrg
4298 1.1 mrg if (refs.is_empty ())
4299 1.1 mrg return opt_result::success ();
4300 1.1 mrg
4301 1.1 mrg if (refs.length () > 1)
4302 1.1 mrg {
4303 1.1 mrg while (!refs.is_empty ())
4304 1.1 mrg free_data_ref (refs.pop ());
4305 1.1 mrg return opt_result::failure_at (stmt,
4306 1.1 mrg "not vectorized: more than one "
4307 1.1 mrg "data ref in stmt: %G", stmt);
4308 1.1 mrg }
4309 1.1 mrg
4310 1.1 mrg data_reference_p dr = refs.pop ();
4311 1.1 mrg if (gcall *call = dyn_cast <gcall *> (stmt))
4312 1.1 mrg if (!gimple_call_internal_p (call)
4313 1.1 mrg || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4314 1.1 mrg && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4315 1.1 mrg {
4316 1.1 mrg free_data_ref (dr);
4317 1.1 mrg return opt_result::failure_at (stmt,
4318 1.1 mrg "not vectorized: dr in a call %G", stmt);
4319 1.1 mrg }
4320 1.1 mrg
4321 1.1 mrg if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4322 1.1 mrg && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4323 1.1 mrg {
4324 1.1 mrg free_data_ref (dr);
4325 1.1 mrg return opt_result::failure_at (stmt,
4326 1.1 mrg "not vectorized:"
4327 1.1 mrg " statement is bitfield access %G", stmt);
4328 1.1 mrg }
4329 1.1 mrg
4330 1.1 mrg if (DR_BASE_ADDRESS (dr)
4331 1.1 mrg && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4332 1.1 mrg {
4333 1.1 mrg free_data_ref (dr);
4334 1.1 mrg return opt_result::failure_at (stmt,
4335 1.1 mrg "not vectorized:"
4336 1.1 mrg " base addr of dr is a constant\n");
4337 1.1 mrg }
4338 1.1 mrg
4339 1.1 mrg /* Check whether this may be a SIMD lane access and adjust the
4340 1.1 mrg DR to make it easier for us to handle it. */
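  /* Such accesses typically come from OpenMP SIMD lowering, where a
     privatized scalar is replaced by an array indexed by the lane number,
     e.g. (sketch)

       D.1234[_8] = ...;

     with _8 ultimately defined by a .GOMP_SIMD_LANE call on loop->simduid.
     Each lane then touches a distinct element, so the code below rewrites
     the DR to use the element size as its step and records it as a
     SIMD-lane access.  */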
4341 1.1 mrg if (loop
4342 1.1 mrg && loop->simduid
4343 1.1 mrg && (!DR_BASE_ADDRESS (dr)
4344 1.1 mrg || !DR_OFFSET (dr)
4345 1.1 mrg || !DR_INIT (dr)
4346 1.1 mrg || !DR_STEP (dr)))
4347 1.1 mrg {
4348 1.1 mrg struct data_reference *newdr
4349 1.1 mrg = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4350 1.1 mrg DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4351 1.1 mrg if (DR_BASE_ADDRESS (newdr)
4352 1.1 mrg && DR_OFFSET (newdr)
4353 1.1 mrg && DR_INIT (newdr)
4354 1.1 mrg && DR_STEP (newdr)
4355 1.1 mrg && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4356 1.1 mrg && integer_zerop (DR_STEP (newdr)))
4357 1.1 mrg {
4358 1.1 mrg tree base_address = DR_BASE_ADDRESS (newdr);
4359 1.1 mrg tree off = DR_OFFSET (newdr);
4360 1.1 mrg tree step = ssize_int (1);
4361 1.1 mrg if (integer_zerop (off)
4362 1.1 mrg && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4363 1.1 mrg {
4364 1.1 mrg off = TREE_OPERAND (base_address, 1);
4365 1.1 mrg base_address = TREE_OPERAND (base_address, 0);
4366 1.1 mrg }
4367 1.1 mrg STRIP_NOPS (off);
4368 1.1 mrg if (TREE_CODE (off) == MULT_EXPR
4369 1.1 mrg && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4370 1.1 mrg {
4371 1.1 mrg step = TREE_OPERAND (off, 1);
4372 1.1 mrg off = TREE_OPERAND (off, 0);
4373 1.1 mrg STRIP_NOPS (off);
4374 1.1 mrg }
4375 1.1 mrg if (CONVERT_EXPR_P (off)
4376 1.1 mrg && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4377 1.1 mrg < TYPE_PRECISION (TREE_TYPE (off))))
4378 1.1 mrg off = TREE_OPERAND (off, 0);
4379 1.1 mrg if (TREE_CODE (off) == SSA_NAME)
4380 1.1 mrg {
4381 1.1 mrg gimple *def = SSA_NAME_DEF_STMT (off);
4382 1.1 mrg /* Look through widening conversion. */
4383 1.1 mrg if (is_gimple_assign (def)
4384 1.1 mrg && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4385 1.1 mrg {
4386 1.1 mrg tree rhs1 = gimple_assign_rhs1 (def);
4387 1.1 mrg if (TREE_CODE (rhs1) == SSA_NAME
4388 1.1 mrg && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4389 1.1 mrg && (TYPE_PRECISION (TREE_TYPE (off))
4390 1.1 mrg > TYPE_PRECISION (TREE_TYPE (rhs1))))
4391 1.1 mrg def = SSA_NAME_DEF_STMT (rhs1);
4392 1.1 mrg }
4393 1.1 mrg if (is_gimple_call (def)
4394 1.1 mrg && gimple_call_internal_p (def)
4395 1.1 mrg && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4396 1.1 mrg {
4397 1.1 mrg tree arg = gimple_call_arg (def, 0);
4398 1.1 mrg tree reft = TREE_TYPE (DR_REF (newdr));
4399 1.1 mrg gcc_assert (TREE_CODE (arg) == SSA_NAME);
4400 1.1 mrg arg = SSA_NAME_VAR (arg);
4401 1.1 mrg if (arg == loop->simduid
4402 1.1 mrg /* For now. */
4403 1.1 mrg && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4404 1.1 mrg {
4405 1.1 mrg DR_BASE_ADDRESS (newdr) = base_address;
4406 1.1 mrg DR_OFFSET (newdr) = ssize_int (0);
4407 1.1 mrg DR_STEP (newdr) = step;
4408 1.1 mrg DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4409 1.1 mrg DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4410 1.1 mrg /* Mark as simd-lane access. */
4411 1.1 mrg tree arg2 = gimple_call_arg (def, 1);
4412 1.1 mrg newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4413 1.1 mrg free_data_ref (dr);
4414 1.1 mrg datarefs->safe_push (newdr);
4415 1.1 mrg if (dataref_groups)
4416 1.1 mrg dataref_groups->safe_push (group_id);
4417 1.1 mrg return opt_result::success ();
4418 1.1 mrg }
4419 1.1 mrg }
4420 1.1 mrg }
4421 1.1 mrg }
4422 1.1 mrg free_data_ref (newdr);
4423 1.1 mrg }
4424 1.1 mrg
4425 1.1 mrg datarefs->safe_push (dr);
4426 1.1 mrg if (dataref_groups)
4427 1.1 mrg dataref_groups->safe_push (group_id);
4428 1.1 mrg return opt_result::success ();
4429 1.1 mrg }
4430 1.1 mrg
4431 1.1 mrg /* Function vect_analyze_data_refs.
4432 1.1 mrg
4433 1.1 mrg Find all the data references in the loop or basic block.
4434 1.1 mrg
4435 1.1 mrg The general structure of the analysis of data refs in the vectorizer is as
4436 1.1 mrg follows:
4437 1.1 mrg 1- vect_analyze_data_refs(loop/bb): call
4438 1.1 mrg compute_data_dependences_for_loop/bb to find and analyze all data-refs
4439 1.1 mrg in the loop/bb and their dependences.
4440 1.1 mrg 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4441 1.1 mrg 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4442 1.1 mrg 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4443 1.1 mrg
4444 1.1 mrg */
4445 1.1 mrg
4446 1.1 mrg opt_result
4447 1.1 mrg vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4448 1.1 mrg {
4449 1.1 mrg class loop *loop = NULL;
4450 1.1 mrg unsigned int i;
4451 1.1 mrg struct data_reference *dr;
4452 1.1 mrg tree scalar_type;
4453 1.1 mrg
4454 1.1 mrg DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4455 1.1 mrg
4456 1.1 mrg if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4457 1.1 mrg loop = LOOP_VINFO_LOOP (loop_vinfo);
4458 1.1 mrg
4459 1.1 mrg /* Go through the data-refs, check that the analysis succeeded. Update
4460 1.1 mrg pointer from stmt_vec_info struct to DR and vectype. */
4461 1.1 mrg
4462 1.1 mrg vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4463 1.1 mrg FOR_EACH_VEC_ELT (datarefs, i, dr)
4464 1.1 mrg {
4465 1.1 mrg enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4466 1.1 mrg poly_uint64 vf;
4467 1.1 mrg
4468 1.1 mrg gcc_assert (DR_REF (dr));
4469 1.1 mrg stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4470 1.1 mrg gcc_assert (!stmt_info->dr_aux.dr);
4471 1.1 mrg stmt_info->dr_aux.dr = dr;
4472 1.1 mrg stmt_info->dr_aux.stmt = stmt_info;
4473 1.1 mrg
4474 1.1 mrg /* Check that analysis of the data-ref succeeded. */
4475 1.1 mrg if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4476 1.1 mrg || !DR_STEP (dr))
4477 1.1 mrg {
4478 1.1 mrg bool maybe_gather
4479 1.1 mrg = DR_IS_READ (dr)
4480 1.1 mrg && !TREE_THIS_VOLATILE (DR_REF (dr));
4481 1.1 mrg bool maybe_scatter
4482 1.1 mrg = DR_IS_WRITE (dr)
4483 1.1 mrg && !TREE_THIS_VOLATILE (DR_REF (dr))
4484 1.1 mrg && (targetm.vectorize.builtin_scatter != NULL
4485 1.1 mrg || supports_vec_scatter_store_p ());
4486 1.1 mrg
4487 1.1 mrg 	  /* If the target supports vector gather loads or scatter stores,
4488 1.1 mrg 	     see whether they can be used.  */
4489 1.1 mrg if (is_a <loop_vec_info> (vinfo)
4490 1.1 mrg && !nested_in_vect_loop_p (loop, stmt_info))
4491 1.1 mrg {
4492 1.1 mrg if (maybe_gather || maybe_scatter)
4493 1.1 mrg {
4494 1.1 mrg if (maybe_gather)
4495 1.1 mrg gatherscatter = GATHER;
4496 1.1 mrg else
4497 1.1 mrg gatherscatter = SCATTER;
4498 1.1 mrg }
4499 1.1 mrg }
4500 1.1 mrg
4501 1.1 mrg if (gatherscatter == SG_NONE)
4502 1.1 mrg {
4503 1.1 mrg if (dump_enabled_p ())
4504 1.1 mrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4505 1.1 mrg "not vectorized: data ref analysis "
4506 1.1 mrg "failed %G", stmt_info->stmt);
4507 1.1 mrg if (is_a <bb_vec_info> (vinfo))
4508 1.1 mrg {
4509 1.1 mrg /* In BB vectorization the ref can still participate
4510 1.1 mrg in dependence analysis, we just can't vectorize it. */
4511 1.1 mrg STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4512 1.1 mrg continue;
4513 1.1 mrg }
4514 1.1 mrg return opt_result::failure_at (stmt_info->stmt,
4515 1.1 mrg "not vectorized:"
4516 1.1 mrg " data ref analysis failed: %G",
4517 1.1 mrg stmt_info->stmt);
4518 1.1 mrg }
4519 1.1 mrg }
4520 1.1 mrg
4521 1.1 mrg /* See if this was detected as SIMD lane access. */
4522 1.1 mrg if (dr->aux == (void *)-1
4523 1.1 mrg || dr->aux == (void *)-2
4524 1.1 mrg || dr->aux == (void *)-3
4525 1.1 mrg || dr->aux == (void *)-4)
4526 1.1 mrg {
4527 1.1 mrg if (nested_in_vect_loop_p (loop, stmt_info))
4528 1.1 mrg return opt_result::failure_at (stmt_info->stmt,
4529 1.1 mrg "not vectorized:"
4530 1.1 mrg " data ref analysis failed: %G",
4531 1.1 mrg stmt_info->stmt);
4532 1.1 mrg STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4533 1.1 mrg = -(uintptr_t) dr->aux;
4534 1.1 mrg }
4535 1.1 mrg
4536 1.1 mrg tree base = get_base_address (DR_REF (dr));
4537 1.1 mrg if (base && VAR_P (base) && DECL_NONALIASED (base))
4538 1.1 mrg {
4539 1.1 mrg if (dump_enabled_p ())
4540 1.1 mrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4541 1.1 mrg "not vectorized: base object not addressable "
4542 1.1 mrg "for stmt: %G", stmt_info->stmt);
4543 1.1 mrg if (is_a <bb_vec_info> (vinfo))
4544 1.1 mrg {
4545 1.1 mrg /* In BB vectorization the ref can still participate
4546 1.1 mrg in dependence analysis, we just can't vectorize it. */
4547 1.1 mrg STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4548 1.1 mrg continue;
4549 1.1 mrg }
4550 1.1 mrg return opt_result::failure_at (stmt_info->stmt,
4551 1.1 mrg "not vectorized: base object not"
4552 1.1 mrg " addressable for stmt: %G",
4553 1.1 mrg stmt_info->stmt);
4554 1.1 mrg }
4555 1.1 mrg
4556 1.1 mrg if (is_a <loop_vec_info> (vinfo)
4557 1.1 mrg && DR_STEP (dr)
4558 1.1 mrg && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4559 1.1 mrg {
4560 1.1 mrg if (nested_in_vect_loop_p (loop, stmt_info))
4561 1.1 mrg return opt_result::failure_at (stmt_info->stmt,
4562 1.1 mrg "not vectorized: "
4563 1.1 mrg "not suitable for strided load %G",
4564 1.1 mrg stmt_info->stmt);
4565 1.1 mrg STMT_VINFO_STRIDED_P (stmt_info) = true;
4566 1.1 mrg }
4567 1.1 mrg
4568 1.1 mrg /* Update DR field in stmt_vec_info struct. */
4569 1.1 mrg
4570 1.1 mrg /* If the dataref is in an inner-loop of the loop that is considered for
4571 1.1 mrg 	 vectorization, we also want to analyze the access relative to
4572 1.1 mrg the outer-loop (DR contains information only relative to the
4573 1.1 mrg inner-most enclosing loop). We do that by building a reference to the
4574 1.1 mrg first location accessed by the inner-loop, and analyze it relative to
4575 1.1 mrg the outer-loop. */
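      /* E.g. for the in[i+j] access used as an example in
         vect_create_addr_base_for_vector_ref's comment below, the first
         location accessed by the j-loop is in[i]; its address is invariant
         in the j-loop and, analyzed relative to the i-loop, advances by
         sizeof (short) per outer iteration.  */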
4576 1.1 mrg if (loop && nested_in_vect_loop_p (loop, stmt_info))
4577 1.1 mrg {
4578 1.1 mrg /* Build a reference to the first location accessed by the
4579 1.1 mrg inner loop: *(BASE + INIT + OFFSET). By construction,
4580 1.1 mrg this address must be invariant in the inner loop, so we
4581 1.1 mrg can consider it as being used in the outer loop. */
4582 1.1 mrg tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4583 1.1 mrg tree offset = unshare_expr (DR_OFFSET (dr));
4584 1.1 mrg tree init = unshare_expr (DR_INIT (dr));
4585 1.1 mrg tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4586 1.1 mrg init, offset);
4587 1.1 mrg tree init_addr = fold_build_pointer_plus (base, init_offset);
4588 1.1 mrg tree init_ref = build_fold_indirect_ref (init_addr);
4589 1.1 mrg
4590 1.1 mrg if (dump_enabled_p ())
4591 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
4592 1.1 mrg "analyze in outer loop: %T\n", init_ref);
4593 1.1 mrg
4594 1.1 mrg opt_result res
4595 1.1 mrg = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4596 1.1 mrg init_ref, loop, stmt_info->stmt);
4597 1.1 mrg if (!res)
4598 1.1 mrg /* dr_analyze_innermost already explained the failure. */
4599 1.1 mrg return res;
4600 1.1 mrg
4601 1.1 mrg if (dump_enabled_p ())
4602 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
4603 1.1 mrg "\touter base_address: %T\n"
4604 1.1 mrg "\touter offset from base address: %T\n"
4605 1.1 mrg "\touter constant offset from base address: %T\n"
4606 1.1 mrg "\touter step: %T\n"
4607 1.1 mrg "\touter base alignment: %d\n\n"
4608 1.1 mrg "\touter base misalignment: %d\n"
4609 1.1 mrg "\touter offset alignment: %d\n"
4610 1.1 mrg "\touter step alignment: %d\n",
4611 1.1 mrg STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4612 1.1 mrg STMT_VINFO_DR_OFFSET (stmt_info),
4613 1.1 mrg STMT_VINFO_DR_INIT (stmt_info),
4614 1.1 mrg STMT_VINFO_DR_STEP (stmt_info),
4615 1.1 mrg STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4616 1.1 mrg STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4617 1.1 mrg STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4618 1.1 mrg STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4619 1.1 mrg }
4620 1.1 mrg
4621 1.1 mrg /* Set vectype for STMT. */
4622 1.1 mrg scalar_type = TREE_TYPE (DR_REF (dr));
4623 1.1 mrg tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4624 1.1 mrg if (!vectype)
4625 1.1 mrg {
4626 1.1 mrg if (dump_enabled_p ())
4627 1.1 mrg {
4628 1.1 mrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4629 1.1 mrg "not vectorized: no vectype for stmt: %G",
4630 1.1 mrg stmt_info->stmt);
4631 1.1 mrg dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4632 1.1 mrg dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4633 1.1 mrg scalar_type);
4634 1.1 mrg dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4635 1.1 mrg }
4636 1.1 mrg
4637 1.1 mrg if (is_a <bb_vec_info> (vinfo))
4638 1.1 mrg {
4639 1.1 mrg /* No vector type is fine, the ref can still participate
4640 1.1 mrg in dependence analysis, we just can't vectorize it. */
4641 1.1 mrg STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4642 1.1 mrg continue;
4643 1.1 mrg }
4644 1.1 mrg if (fatal)
4645 1.1 mrg *fatal = false;
4646 1.1 mrg return opt_result::failure_at (stmt_info->stmt,
4647 1.1 mrg "not vectorized:"
4648 1.1 mrg " no vectype for stmt: %G"
4649 1.1 mrg " scalar_type: %T\n",
4650 1.1 mrg stmt_info->stmt, scalar_type);
4651 1.1 mrg }
4652 1.1 mrg else
4653 1.1 mrg {
4654 1.1 mrg if (dump_enabled_p ())
4655 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
4656 1.1 mrg "got vectype for stmt: %G%T\n",
4657 1.1 mrg stmt_info->stmt, vectype);
4658 1.1 mrg }
4659 1.1 mrg
4660 1.1 mrg /* Adjust the minimal vectorization factor according to the
4661 1.1 mrg vector type. */
4662 1.1 mrg vf = TYPE_VECTOR_SUBPARTS (vectype);
4663 1.1 mrg *min_vf = upper_bound (*min_vf, vf);
4664 1.1 mrg
4665 1.1 mrg /* Leave the BB vectorizer to pick the vector type later, based on
4666 1.1 mrg the final dataref group size and SLP node size. */
4667 1.1 mrg if (is_a <loop_vec_info> (vinfo))
4668 1.1 mrg STMT_VINFO_VECTYPE (stmt_info) = vectype;
4669 1.1 mrg
4670 1.1 mrg if (gatherscatter != SG_NONE)
4671 1.1 mrg {
4672 1.1 mrg gather_scatter_info gs_info;
4673 1.1 mrg if (!vect_check_gather_scatter (stmt_info,
4674 1.1 mrg as_a <loop_vec_info> (vinfo),
4675 1.1 mrg &gs_info)
4676 1.1 mrg || !get_vectype_for_scalar_type (vinfo,
4677 1.1 mrg TREE_TYPE (gs_info.offset)))
4678 1.1 mrg {
4679 1.1 mrg if (fatal)
4680 1.1 mrg *fatal = false;
4681 1.1 mrg return opt_result::failure_at
4682 1.1 mrg (stmt_info->stmt,
4683 1.1 mrg (gatherscatter == GATHER)
4684 1.1 mrg ? "not vectorized: not suitable for gather load %G"
4685 1.1 mrg : "not vectorized: not suitable for scatter store %G",
4686 1.1 mrg stmt_info->stmt);
4687 1.1 mrg }
4688 1.1 mrg STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4689 1.1 mrg }
4690 1.1 mrg }
4691 1.1 mrg
4692 1.1 mrg /* We used to stop processing and prune the list here. Verify we no
4693 1.1 mrg longer need to. */
4694 1.1 mrg gcc_assert (i == datarefs.length ());
4695 1.1 mrg
4696 1.1 mrg return opt_result::success ();
4697 1.1 mrg }
4698 1.1 mrg
4699 1.1 mrg
4700 1.1 mrg /* Function vect_get_new_vect_var.
4701 1.1 mrg
4702 1.1 mrg    Returns a name for a new variable.  The current naming scheme prepends
4703 1.1 mrg    a prefix chosen from "vect_", "stmp_", "mask_" or "vectp_" (depending on
4704 1.1 mrg    the value of VAR_KIND) to NAME if provided, and uses the prefix alone
4705 1.1 mrg    otherwise.  */
4706 1.1 mrg
4707 1.1 mrg tree
4708 1.1 mrg vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4709 1.1 mrg {
4710 1.1 mrg const char *prefix;
4711 1.1 mrg tree new_vect_var;
4712 1.1 mrg
4713 1.1 mrg switch (var_kind)
4714 1.1 mrg {
4715 1.1 mrg case vect_simple_var:
4716 1.1 mrg prefix = "vect";
4717 1.1 mrg break;
4718 1.1 mrg case vect_scalar_var:
4719 1.1 mrg prefix = "stmp";
4720 1.1 mrg break;
4721 1.1 mrg case vect_mask_var:
4722 1.1 mrg prefix = "mask";
4723 1.1 mrg break;
4724 1.1 mrg case vect_pointer_var:
4725 1.1 mrg prefix = "vectp";
4726 1.1 mrg break;
4727 1.1 mrg default:
4728 1.1 mrg gcc_unreachable ();
4729 1.1 mrg }
4730 1.1 mrg
4731 1.1 mrg if (name)
4732 1.1 mrg {
4733 1.1 mrg char* tmp = concat (prefix, "_", name, NULL);
4734 1.1 mrg new_vect_var = create_tmp_reg (type, tmp);
4735 1.1 mrg free (tmp);
4736 1.1 mrg }
4737 1.1 mrg else
4738 1.1 mrg new_vect_var = create_tmp_reg (type, prefix);
4739 1.1 mrg
4740 1.1 mrg return new_vect_var;
4741 1.1 mrg }
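
/* For example, vect_get_new_vect_var (ptr_type, vect_pointer_var, "in")
   returns a temporary whose name is based on "vectp_in"; create_tmp_reg
   makes the final name unique.  */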
4742 1.1 mrg
4743 1.1 mrg /* Like vect_get_new_vect_var but return an SSA name. */
4744 1.1 mrg
4745 1.1 mrg tree
4746 1.1 mrg vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4747 1.1 mrg {
4748 1.1 mrg const char *prefix;
4749 1.1 mrg tree new_vect_var;
4750 1.1 mrg
4751 1.1 mrg switch (var_kind)
4752 1.1 mrg {
4753 1.1 mrg case vect_simple_var:
4754 1.1 mrg prefix = "vect";
4755 1.1 mrg break;
4756 1.1 mrg case vect_scalar_var:
4757 1.1 mrg prefix = "stmp";
4758 1.1 mrg break;
4759 1.1 mrg case vect_pointer_var:
4760 1.1 mrg prefix = "vectp";
4761 1.1 mrg break;
4762 1.1 mrg default:
4763 1.1 mrg gcc_unreachable ();
4764 1.1 mrg }
4765 1.1 mrg
4766 1.1 mrg if (name)
4767 1.1 mrg {
4768 1.1 mrg char* tmp = concat (prefix, "_", name, NULL);
4769 1.1 mrg new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4770 1.1 mrg free (tmp);
4771 1.1 mrg }
4772 1.1 mrg else
4773 1.1 mrg new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4774 1.1 mrg
4775 1.1 mrg return new_vect_var;
4776 1.1 mrg }
4777 1.1 mrg
4778 1.1 mrg /* Duplicate points-to info on NAME from DR_INFO. */
4779 1.1 mrg
4780 1.1 mrg static void
4781 1.1 mrg vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4782 1.1 mrg {
4783 1.1 mrg duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4784 1.1 mrg /* DR_PTR_INFO is for a base SSA name, not including constant or
4785 1.1 mrg variable offsets in the ref so its alignment info does not apply. */
4786 1.1 mrg mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4787 1.1 mrg }
4788 1.1 mrg
4789 1.1 mrg /* Function vect_create_addr_base_for_vector_ref.
4790 1.1 mrg
4791 1.1 mrg Create an expression that computes the address of the first memory location
4792 1.1 mrg that will be accessed for a data reference.
4793 1.1 mrg
4794 1.1 mrg Input:
4795 1.1 mrg STMT_INFO: The statement containing the data reference.
4796 1.1 mrg NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4797 1.1 mrg    OFFSET: Optional.  If supplied, it is added to the initial address.
4798 1.1 mrg LOOP: Specify relative to which loop-nest should the address be computed.
4799 1.1 mrg For example, when the dataref is in an inner-loop nested in an
4800 1.1 mrg outer-loop that is now being vectorized, LOOP can be either the
4801 1.1 mrg outer-loop, or the inner-loop. The first memory location accessed
4802 1.1 mrg by the following dataref ('in' points to short):
4803 1.1 mrg
4804 1.1 mrg for (i=0; i<N; i++)
4805 1.1 mrg for (j=0; j<M; j++)
4806 1.1 mrg s += in[i+j]
4807 1.1 mrg
4808 1.1 mrg is as follows:
4809 1.1 mrg if LOOP=i_loop: &in (relative to i_loop)
4810 1.1 mrg if LOOP=j_loop: &in+i*2B (relative to j_loop)
4811 1.1 mrg
4812 1.1 mrg Output:
4813 1.1 mrg 1. Return an SSA_NAME whose value is the address of the memory location of
4814 1.1 mrg the first vector of the data reference.
4815 1.1 mrg 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4816 1.1 mrg these statement(s) which define the returned SSA_NAME.
4817 1.1 mrg
4818 1.1 mrg FORNOW: We are only handling array accesses with step 1. */
4819 1.1 mrg
4820 1.1 mrg tree
4821 1.1 mrg vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4822 1.1 mrg gimple_seq *new_stmt_list,
4823 1.1 mrg tree offset)
4824 1.1 mrg {
4825 1.1 mrg dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4826 1.1 mrg struct data_reference *dr = dr_info->dr;
4827 1.1 mrg const char *base_name;
4828 1.1 mrg tree addr_base;
4829 1.1 mrg tree dest;
4830 1.1 mrg gimple_seq seq = NULL;
4831 1.1 mrg tree vect_ptr_type;
4832 1.1 mrg loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4833 1.1 mrg innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4834 1.1 mrg
4835 1.1 mrg tree data_ref_base = unshare_expr (drb->base_address);
4836 1.1 mrg tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4837 1.1 mrg tree init = unshare_expr (drb->init);
4838 1.1 mrg
4839 1.1 mrg if (loop_vinfo)
4840 1.1 mrg base_name = get_name (data_ref_base);
4841 1.1 mrg else
4842 1.1 mrg {
4843 1.1 mrg base_offset = ssize_int (0);
4844 1.1 mrg init = ssize_int (0);
4845 1.1 mrg base_name = get_name (DR_REF (dr));
4846 1.1 mrg }
4847 1.1 mrg
4848 1.1 mrg /* Create base_offset */
4849 1.1 mrg base_offset = size_binop (PLUS_EXPR,
4850 1.1 mrg fold_convert (sizetype, base_offset),
4851 1.1 mrg fold_convert (sizetype, init));
4852 1.1 mrg
4853 1.1 mrg if (offset)
4854 1.1 mrg {
4855 1.1 mrg offset = fold_convert (sizetype, offset);
4856 1.1 mrg base_offset = fold_build2 (PLUS_EXPR, sizetype,
4857 1.1 mrg base_offset, offset);
4858 1.1 mrg }
4859 1.1 mrg
4860 1.1 mrg /* base + base_offset */
4861 1.1 mrg if (loop_vinfo)
4862 1.1 mrg addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4863 1.1 mrg else
4864 1.1 mrg addr_base = build1 (ADDR_EXPR,
4865 1.1 mrg build_pointer_type (TREE_TYPE (DR_REF (dr))),
4866 1.1 mrg /* Strip zero offset components since we don't need
4867 1.1 mrg them and they can confuse late diagnostics if
4868 1.1 mrg we CSE them wrongly. See PR106904 for example. */
4869 1.1 mrg unshare_expr (strip_zero_offset_components
4870 1.1 mrg (DR_REF (dr))));
4871 1.1 mrg
4872 1.1 mrg vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
4873 1.1 mrg dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4874 1.1 mrg addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4875 1.1 mrg gimple_seq_add_seq (new_stmt_list, seq);
4876 1.1 mrg
4877 1.1 mrg if (DR_PTR_INFO (dr)
4878 1.1 mrg && TREE_CODE (addr_base) == SSA_NAME
4879 1.1 mrg /* We should only duplicate pointer info to newly created SSA names. */
4880 1.1 mrg && SSA_NAME_VAR (addr_base) == dest)
4881 1.1 mrg {
4882 1.1 mrg gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
4883 1.1 mrg vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4884 1.1 mrg }
4885 1.1 mrg
4886 1.1 mrg if (dump_enabled_p ())
4887 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4888 1.1 mrg
4889 1.1 mrg return addr_base;
4890 1.1 mrg }
4891 1.1 mrg
4892 1.1 mrg
4893 1.1 mrg /* Function vect_create_data_ref_ptr.
4894 1.1 mrg
4895 1.1 mrg Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4896 1.1 mrg location accessed in the loop by STMT_INFO, along with the def-use update
4897 1.1 mrg chain to appropriately advance the pointer through the loop iterations.
4898 1.1 mrg Also set aliasing information for the pointer. This pointer is used by
4899 1.1 mrg the callers to this function to create a memory reference expression for
4900 1.1 mrg vector load/store access.
4901 1.1 mrg
4902 1.1 mrg Input:
4903 1.1 mrg 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4904 1.1 mrg GIMPLE_ASSIGN <name, data-ref> or
4905 1.1 mrg GIMPLE_ASSIGN <data-ref, name>.
4906 1.1 mrg 2. AGGR_TYPE: the type of the reference, which should be either a vector
4907 1.1 mrg or an array.
4908 1.1 mrg 3. AT_LOOP: the loop where the vector memref is to be created.
4909 1.1 mrg 4. OFFSET (optional): a byte offset to be added to the initial address
4910 1.1 mrg accessed by the data-ref in STMT_INFO.
4911 1.1 mrg 5. BSI: location where the new stmts are to be placed if there is no loop
4912 1.1 mrg 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4913 1.1 mrg pointing to the initial address.
4914 1.1 mrg    7. IV_STEP (optional, defaults to NULL): the amount that should be added
4915 1.1 mrg to the IV during each iteration of the loop. NULL says to move
4916 1.1 mrg by one copy of AGGR_TYPE up or down, depending on the step of the
4917 1.1 mrg data reference.
4918 1.1 mrg
4919 1.1 mrg Output:
4920 1.1 mrg 1. Declare a new ptr to vector_type, and have it point to the base of the
4921 1.1 mrg       data reference (initial address accessed by the data reference).
4922 1.1 mrg For example, for vector of type V8HI, the following code is generated:
4923 1.1 mrg
4924 1.1 mrg v8hi *ap;
4925 1.1 mrg ap = (v8hi *)initial_address;
4926 1.1 mrg
4927 1.1 mrg if OFFSET is not supplied:
4928 1.1 mrg initial_address = &a[init];
4929 1.1 mrg if OFFSET is supplied:
4930 1.1 mrg initial_address = &a[init] + OFFSET;
4933 1.1 mrg
4934 1.1 mrg Return the initial_address in INITIAL_ADDRESS.
4935 1.1 mrg
4936 1.1 mrg 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4937 1.1 mrg update the pointer in each iteration of the loop.
4938 1.1 mrg
4939 1.1 mrg Return the increment stmt that updates the pointer in PTR_INCR.
4940 1.1 mrg
4941 1.1 mrg 3. Return the pointer. */
4942 1.1 mrg
4943 1.1 mrg tree
4944 1.1 mrg vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4945 1.1 mrg tree aggr_type, class loop *at_loop, tree offset,
4946 1.1 mrg tree *initial_address, gimple_stmt_iterator *gsi,
4947 1.1 mrg gimple **ptr_incr, bool only_init,
4948 1.1 mrg tree iv_step)
4949 1.1 mrg {
4950 1.1 mrg const char *base_name;
4951 1.1 mrg loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4952 1.1 mrg class loop *loop = NULL;
4953 1.1 mrg bool nested_in_vect_loop = false;
4954 1.1 mrg class loop *containing_loop = NULL;
4955 1.1 mrg tree aggr_ptr_type;
4956 1.1 mrg tree aggr_ptr;
4957 1.1 mrg tree new_temp;
4958 1.1 mrg gimple_seq new_stmt_list = NULL;
4959 1.1 mrg edge pe = NULL;
4960 1.1 mrg basic_block new_bb;
4961 1.1 mrg tree aggr_ptr_init;
4962 1.1 mrg dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4963 1.1 mrg struct data_reference *dr = dr_info->dr;
4964 1.1 mrg tree aptr;
4965 1.1 mrg gimple_stmt_iterator incr_gsi;
4966 1.1 mrg bool insert_after;
4967 1.1 mrg tree indx_before_incr, indx_after_incr;
4968 1.1 mrg gimple *incr;
4969 1.1 mrg bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4970 1.1 mrg
4971 1.1 mrg gcc_assert (iv_step != NULL_TREE
4972 1.1 mrg || TREE_CODE (aggr_type) == ARRAY_TYPE
4973 1.1 mrg || TREE_CODE (aggr_type) == VECTOR_TYPE);
4974 1.1 mrg
4975 1.1 mrg if (loop_vinfo)
4976 1.1 mrg {
4977 1.1 mrg loop = LOOP_VINFO_LOOP (loop_vinfo);
4978 1.1 mrg nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4979 1.1 mrg containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4980 1.1 mrg pe = loop_preheader_edge (loop);
4981 1.1 mrg }
4982 1.1 mrg else
4983 1.1 mrg {
4984 1.1 mrg gcc_assert (bb_vinfo);
4985 1.1 mrg only_init = true;
4986 1.1 mrg *ptr_incr = NULL;
4987 1.1 mrg }
4988 1.1 mrg
4989 1.1 mrg /* Create an expression for the first address accessed by this load
4990 1.1 mrg in LOOP. */
4991 1.1 mrg base_name = get_name (DR_BASE_ADDRESS (dr));
4992 1.1 mrg
4993 1.1 mrg if (dump_enabled_p ())
4994 1.1 mrg {
4995 1.1 mrg tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4996 1.1 mrg dump_printf_loc (MSG_NOTE, vect_location,
4997 1.1 mrg "create %s-pointer variable to type: %T",
4998 1.1 mrg get_tree_code_name (TREE_CODE (aggr_type)),
4999 1.1 mrg aggr_type);
5000 1.1 mrg if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
5001 1.1 mrg dump_printf (MSG_NOTE, " vectorizing an array ref: ");
5002 1.1 mrg else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
5003 1.1 mrg dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
5004 1.1 mrg else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
5005 1.1 mrg dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
5006 1.1 mrg else
5007 1.1 mrg dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
5008 1.1 mrg dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
5009 1.1 mrg }
5010 1.1 mrg
5011 1.1 mrg /* (1) Create the new aggregate-pointer variable.
5012 1.1 mrg Vector and array types inherit the alias set of their component
5013 1.1 mrg type by default so we need to use a ref-all pointer if the data
5014 1.1 mrg reference does not conflict with the created aggregated data
5015 1.1 mrg reference because it is not addressable. */
5016 1.1 mrg bool need_ref_all = false;
5017 1.1 mrg if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5018 1.1 mrg get_alias_set (DR_REF (dr))))
5019 1.1 mrg need_ref_all = true;
5020 1.1 mrg /* Likewise for any of the data references in the stmt group. */
5021 1.1 mrg else if (DR_GROUP_SIZE (stmt_info) > 1)
5022 1.1 mrg {
5023 1.1 mrg stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5024 1.1 mrg do
5025 1.1 mrg {
5026 1.1 mrg struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5027 1.1 mrg if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5028 1.1 mrg get_alias_set (DR_REF (sdr))))
5029 1.1 mrg {
5030 1.1 mrg need_ref_all = true;
5031 1.1 mrg break;
5032 1.1 mrg }
5033 1.1 mrg sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5034 1.1 mrg }
5035 1.1 mrg while (sinfo);
5036 1.1 mrg }
5037 1.1 mrg aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
5038 1.1 mrg need_ref_all);
5039 1.1 mrg aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5040 1.1 mrg
5041 1.1 mrg
5042 1.1 mrg /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5043 1.1 mrg vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5044 1.1 mrg def-use update cycles for the pointer: one relative to the outer-loop
5045 1.1 mrg      (LOOP), which is what steps (2) and (3) below do.  The other is relative
5046 1.1 mrg      to the inner-loop (which is the inner-most loop containing the dataref),
5047 1.1 mrg      and this is done by step (4) below.
5048 1.1 mrg
5049 1.1 mrg When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5050 1.1 mrg      inner-most loop, and so steps (2),(3) work the same, and step (4) is
5051 1.1 mrg      redundant.  Steps (2),(3) create the following:
5052 1.1 mrg
5053 1.1 mrg vp0 = &base_addr;
5054 1.1 mrg LOOP: vp1 = phi(vp0,vp2)
5055 1.1 mrg ...
5056 1.1 mrg ...
5057 1.1 mrg vp2 = vp1 + step
5058 1.1 mrg goto LOOP
5059 1.1 mrg
5060 1.1 mrg      If there is an inner-loop nested in loop, then step (4) will also be
5061 1.1 mrg applied, and an additional update in the inner-loop will be created:
5062 1.1 mrg
5063 1.1 mrg vp0 = &base_addr;
5064 1.1 mrg LOOP: vp1 = phi(vp0,vp2)
5065 1.1 mrg ...
5066 1.1 mrg inner: vp3 = phi(vp1,vp4)
5067 1.1 mrg vp4 = vp3 + inner_step
5068 1.1 mrg if () goto inner
5069 1.1 mrg ...
5070 1.1 mrg vp2 = vp1 + step
5071 1.1 mrg if () goto LOOP */
5072 1.1 mrg
5073 1.1 mrg /* (2) Calculate the initial address of the aggregate-pointer, and set
5074 1.1 mrg the aggregate-pointer to point to it before the loop. */
5075 1.1 mrg
5076 1.1 mrg /* Create: (&(base[init_val]+offset) in the loop preheader. */
5077 1.1 mrg
5078 1.1 mrg new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5079 1.1 mrg stmt_info, &new_stmt_list,
5080 1.1 mrg offset);
5081 1.1 mrg if (new_stmt_list)
5082 1.1 mrg {
5083 1.1 mrg if (pe)
5084 1.1 mrg {
5085 1.1 mrg new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5086 1.1 mrg gcc_assert (!new_bb);
5087 1.1 mrg }
5088 1.1 mrg else
5089 1.1 mrg gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5090 1.1 mrg }
5091 1.1 mrg
5092 1.1 mrg *initial_address = new_temp;
5093 1.1 mrg aggr_ptr_init = new_temp;
5094 1.1 mrg
5095 1.1 mrg /* (3) Handle the updating of the aggregate-pointer inside the loop.
5096 1.1 mrg This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5097 1.1 mrg inner-loop nested in LOOP (during outer-loop vectorization). */
5098 1.1 mrg
5099 1.1 mrg /* No update in loop is required. */
5100 1.1 mrg if (only_init && (!loop_vinfo || at_loop == loop))
5101 1.1 mrg aptr = aggr_ptr_init;
5102 1.1 mrg else
5103 1.1 mrg {
5104 1.1 mrg /* Accesses to invariant addresses should be handled specially
5105 1.1 mrg by the caller. */
5106 1.1 mrg tree step = vect_dr_behavior (vinfo, dr_info)->step;
5107 1.1 mrg gcc_assert (!integer_zerop (step));
5108 1.1 mrg
5109 1.1 mrg if (iv_step == NULL_TREE)
5110 1.1 mrg {
5111 1.1 mrg /* The step of the aggregate pointer is the type size,
5112 1.1 mrg negated for downward accesses. */
5113 1.1 mrg iv_step = TYPE_SIZE_UNIT (aggr_type);
5114 1.1 mrg if (tree_int_cst_sgn (step) == -1)
5115 1.1 mrg iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5116 1.1 mrg }
5117 1.1 mrg
5118 1.1 mrg standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5119 1.1 mrg
5120 1.1 mrg create_iv (aggr_ptr_init,
5121 1.1 mrg fold_convert (aggr_ptr_type, iv_step),
5122 1.1 mrg aggr_ptr, loop, &incr_gsi, insert_after,
5123 1.1 mrg &indx_before_incr, &indx_after_incr);
5124 1.1 mrg incr = gsi_stmt (incr_gsi);
5125 1.1 mrg
5126 1.1 mrg /* Copy the points-to information if it exists. */
5127 1.1 mrg if (DR_PTR_INFO (dr))
5128 1.1 mrg {
5129 1.1 mrg vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5130 1.1 mrg vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5131 1.1 mrg }
5132 1.1 mrg if (ptr_incr)
5133 1.1 mrg *ptr_incr = incr;
5134 1.1 mrg
5135 1.1 mrg aptr = indx_before_incr;
5136 1.1 mrg }
5137 1.1 mrg
5138 1.1 mrg if (!nested_in_vect_loop || only_init)
5139 1.1 mrg return aptr;
5140 1.1 mrg
5141 1.1 mrg
5142 1.1 mrg /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5143 1.1 mrg      nested in LOOP, if it exists.  */
5144 1.1 mrg
5145 1.1 mrg gcc_assert (nested_in_vect_loop);
5146 1.1 mrg if (!only_init)
5147 1.1 mrg {
5148 1.1 mrg standard_iv_increment_position (containing_loop, &incr_gsi,
5149 1.1 mrg &insert_after);
5150 1.1 mrg create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
5151 1.1 mrg containing_loop, &incr_gsi, insert_after, &indx_before_incr,
5152 1.1 mrg &indx_after_incr);
5153 1.1 mrg incr = gsi_stmt (incr_gsi);
5154 1.1 mrg
5155 1.1 mrg /* Copy the points-to information if it exists. */
5156 1.1 mrg if (DR_PTR_INFO (dr))
5157 1.1 mrg {
5158 1.1 mrg vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5159 1.1 mrg vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5160 1.1 mrg }
5161 1.1 mrg if (ptr_incr)
5162 1.1 mrg *ptr_incr = incr;
5163 1.1 mrg
5164 1.1 mrg return indx_before_incr;
5165 1.1 mrg }
5166 1.1 mrg else
5167 1.1 mrg gcc_unreachable ();
5168 1.1 mrg }
5169 1.1 mrg
5170 1.1 mrg
5171 1.1 mrg /* Function bump_vector_ptr
5172 1.1 mrg
5173 1.1 mrg Increment a pointer (to a vector type) by vector-size. If requested,
5174 1.1 mrg    i.e. if PTR_INCR is given, then also connect the new increment stmt
5175 1.1 mrg to the existing def-use update-chain of the pointer, by modifying
5176 1.1 mrg the PTR_INCR as illustrated below:
5177 1.1 mrg
5178 1.1 mrg The pointer def-use update-chain before this function:
5179 1.1 mrg DATAREF_PTR = phi (p_0, p_2)
5180 1.1 mrg ....
5181 1.1 mrg PTR_INCR: p_2 = DATAREF_PTR + step
5182 1.1 mrg
5183 1.1 mrg The pointer def-use update-chain after this function:
5184 1.1 mrg DATAREF_PTR = phi (p_0, p_2)
5185 1.1 mrg ....
5186 1.1 mrg NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5187 1.1 mrg ....
5188 1.1 mrg PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5189 1.1 mrg
5190 1.1 mrg Input:
5191 1.1 mrg DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5192 1.1 mrg in the loop.
5193 1.1 mrg PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5194 1.1 mrg the loop. The increment amount across iterations is expected
5195 1.1 mrg to be vector_size.
5196 1.1 mrg BSI - location where the new update stmt is to be placed.
5197 1.1 mrg STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5198 1.1 mrg BUMP - optional. The offset by which to bump the pointer. If not given,
5199 1.1 mrg the offset is assumed to be vector_size.
5200 1.1 mrg
5201 1.1 mrg Output: Return NEW_DATAREF_PTR as illustrated above.
5202 1.1 mrg
5203 1.1 mrg */
5204 1.1 mrg
5205 1.1 mrg tree
5206 1.1 mrg bump_vector_ptr (vec_info *vinfo,
5207 1.1 mrg tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5208 1.1 mrg stmt_vec_info stmt_info, tree bump)
5209 1.1 mrg {
5210 1.1 mrg struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5211 1.1 mrg tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5212 1.1 mrg tree update = TYPE_SIZE_UNIT (vectype);
5213 1.1 mrg gimple *incr_stmt;
5214 1.1 mrg ssa_op_iter iter;
5215 1.1 mrg use_operand_p use_p;
5216 1.1 mrg tree new_dataref_ptr;
5217 1.1 mrg
5218 1.1 mrg if (bump)
5219 1.1 mrg update = bump;
5220 1.1 mrg
5221 1.1 mrg if (TREE_CODE (dataref_ptr) == SSA_NAME)
5222 1.1 mrg new_dataref_ptr = copy_ssa_name (dataref_ptr);
5223 1.1 mrg else
5224 1.1 mrg new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5225 1.1 mrg incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5226 1.1 mrg dataref_ptr, update);
5227 1.1 mrg vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5228 1.1 mrg   /* Fold the increment, avoiding excessive use-def chains that would
5229 1.1 mrg      otherwise cause compile-time issues for later passes until the next
5230 1.1 mrg      forwprop pass, which would do this folding as well.  */
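  /* E.g. two consecutive bumps of the same pointer by 16 bytes would
     otherwise leave
       vectp_2 = vectp_1 + 16;
       vectp_3 = vectp_2 + 16;
     whereas folding can turn the second statement into
       vectp_3 = vectp_1 + 32;  */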
5231 1.1 mrg gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5232 1.1 mrg if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5233 1.1 mrg {
5234 1.1 mrg incr_stmt = gsi_stmt (fold_gsi);
5235 1.1 mrg update_stmt (incr_stmt);
5236 1.1 mrg }
5237 1.1 mrg
5238 1.1 mrg /* Copy the points-to information if it exists. */
5239 1.1 mrg if (DR_PTR_INFO (dr))
5240 1.1 mrg {
5241 1.1 mrg duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5242 1.1 mrg mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5243 1.1 mrg }
5244 1.1 mrg
5245 1.1 mrg if (!ptr_incr)
5246 1.1 mrg return new_dataref_ptr;
5247 1.1 mrg
5248 1.1 mrg /* Update the vector-pointer's cross-iteration increment. */
5249 1.1 mrg FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5250 1.1 mrg {
5251 1.1 mrg tree use = USE_FROM_PTR (use_p);
5252 1.1 mrg
5253 1.1 mrg if (use == dataref_ptr)
5254 1.1 mrg SET_USE (use_p, new_dataref_ptr);
5255 1.1 mrg else
5256 1.1 mrg gcc_assert (operand_equal_p (use, update, 0));
5257 1.1 mrg }
5258 1.1 mrg
5259 1.1 mrg return new_dataref_ptr;
5260 1.1 mrg }
5261 1.1 mrg
5262 1.1 mrg
5263 1.1 mrg /* Copy memory reference info such as base/clique from the SRC reference
5264 1.1 mrg to the DEST MEM_REF. */
5265 1.1 mrg
5266 1.1 mrg void
5267 1.1 mrg vect_copy_ref_info (tree dest, tree src)
5268 1.1 mrg {
5269 1.1 mrg if (TREE_CODE (dest) != MEM_REF)
5270 1.1 mrg return;
5271 1.1 mrg
5272 1.1 mrg tree src_base = src;
5273 1.1 mrg while (handled_component_p (src_base))
5274 1.1 mrg src_base = TREE_OPERAND (src_base, 0);
5275 1.1 mrg if (TREE_CODE (src_base) != MEM_REF
5276 1.1 mrg && TREE_CODE (src_base) != TARGET_MEM_REF)
5277 1.1 mrg return;
5278 1.1 mrg
5279 1.1 mrg MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5280 1.1 mrg MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5281 1.1 mrg }
5282 1.1 mrg
5283 1.1 mrg
5284 1.1 mrg /* Function vect_create_destination_var.
5285 1.1 mrg
5286 1.1 mrg Create a new temporary of type VECTYPE. */
5287 1.1 mrg
5288 1.1 mrg tree
5289 1.1 mrg vect_create_destination_var (tree scalar_dest, tree vectype)
5290 1.1 mrg {
5291 1.1 mrg tree vec_dest;
5292 1.1 mrg const char *name;
5293 1.1 mrg char *new_name;
5294 1.1 mrg tree type;
5295 1.1 mrg enum vect_var_kind kind;
5296 1.1 mrg
5297 1.1 mrg kind = vectype
5298 1.1 mrg ? VECTOR_BOOLEAN_TYPE_P (vectype)
5299 1.1 mrg ? vect_mask_var
5300 1.1 mrg : vect_simple_var
5301 1.1 mrg : vect_scalar_var;
5302 1.1 mrg type = vectype ? vectype : TREE_TYPE (scalar_dest);
5303 1.1 mrg
5304 1.1 mrg gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5305 1.1 mrg
5306 1.1 mrg name = get_name (scalar_dest);
5307 1.1 mrg if (name)
5308 1.1 mrg new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5309 1.1 mrg else
5310 1.1 mrg new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5311 1.1 mrg vec_dest = vect_get_new_vect_var (type, kind, new_name);
5312 1.1 mrg free (new_name);
5313 1.1 mrg
5314 1.1 mrg return vec_dest;
5315 1.1 mrg }
5316 1.1 mrg
5317 1.1 mrg /* Function vect_grouped_store_supported.
5318 1.1 mrg
5319 1.1 mrg Returns TRUE if interleave high and interleave low permutations
5320 1.1 mrg are supported, and FALSE otherwise. */
5321 1.1 mrg
5322 1.1 mrg bool
5323 1.1 mrg vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5324 1.1 mrg {
5325 1.1 mrg machine_mode mode = TYPE_MODE (vectype);
5326 1.1 mrg
5327 1.1 mrg /* vect_permute_store_chain requires the group size to be equal to 3 or
5328 1.1 mrg be a power of two. */
5329 1.1 mrg if (count != 3 && exact_log2 (count) == -1)
5330 1.1 mrg {
5331 1.1 mrg if (dump_enabled_p ())
5332 1.1 mrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5333 1.1 mrg "the size of the group of accesses"
5334 1.1 mrg 			 " is not a power of 2 or not equal to 3\n");
5335 1.1 mrg return false;
5336 1.1 mrg }
5337 1.1 mrg
5338 1.1 mrg /* Check that the permutation is supported. */
5339 1.1 mrg if (VECTOR_MODE_P (mode))
5340 1.1 mrg {
5341 1.1 mrg unsigned int i;
5342 1.1 mrg if (count == 3)
5343 1.1 mrg {
5344 1.1 mrg unsigned int j0 = 0, j1 = 0, j2 = 0;
5345 1.1 mrg unsigned int i, j;
5346 1.1 mrg
5347 1.1 mrg unsigned int nelt;
5348 1.1 mrg if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5349 1.1 mrg {
5350 1.1 mrg if (dump_enabled_p ())
5351 1.1 mrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5352 1.1 mrg "cannot handle groups of 3 stores for"
5353 1.1 mrg " variable-length vectors\n");
5354 1.1 mrg return false;
5355 1.1 mrg }
5356 1.1 mrg
5357 1.1 mrg vec_perm_builder sel (nelt, nelt, 1);
5358 1.1 mrg sel.quick_grow (nelt);
5359 1.1 mrg vec_perm_indices indices;
5360 1.1 mrg for (j = 0; j < 3; j++)
5361 1.1 mrg {
5362 1.1 mrg int nelt0 = ((3 - j) * nelt) % 3;
5363 1.1 mrg int nelt1 = ((3 - j) * nelt + 1) % 3;
5364 1.1 mrg int nelt2 = ((3 - j) * nelt + 2) % 3;
5365 1.1 mrg for (i = 0; i < nelt; i++)
5366 1.1 mrg {
5367 1.1 mrg if (3 * i + nelt0 < nelt)
5368 1.1 mrg sel[3 * i + nelt0] = j0++;
5369 1.1 mrg if (3 * i + nelt1 < nelt)
5370 1.1 mrg sel[3 * i + nelt1] = nelt + j1++;
5371 1.1 mrg if (3 * i + nelt2 < nelt)
5372 1.1 mrg sel[3 * i + nelt2] = 0;
5373 1.1 mrg }
5374 1.1 mrg indices.new_vector (sel, 2, nelt);
5375 1.1 mrg if (!can_vec_perm_const_p (mode, indices))
5376 1.1 mrg {
5377 1.1 mrg if (dump_enabled_p ())
5378 1.1 mrg dump_printf (MSG_MISSED_OPTIMIZATION,
5379 1.1 mrg "permutation op not supported by target.\n");
5380 1.1 mrg return false;
5381 1.1 mrg }
5382 1.1 mrg
5383 1.1 mrg for (i = 0; i < nelt; i++)
5384 1.1 mrg {
5385 1.1 mrg if (3 * i + nelt0 < nelt)
5386 1.1 mrg sel[3 * i + nelt0] = 3 * i + nelt0;
5387 1.1 mrg if (3 * i + nelt1 < nelt)
5388 1.1 mrg sel[3 * i + nelt1] = 3 * i + nelt1;
5389 1.1 mrg if (3 * i + nelt2 < nelt)
5390 1.1 mrg sel[3 * i + nelt2] = nelt + j2++;
5391 1.1 mrg }
5392 1.1 mrg indices.new_vector (sel, 2, nelt);
5393 1.1 mrg if (!can_vec_perm_const_p (mode, indices))
5394 1.1 mrg {
5395 1.1 mrg if (dump_enabled_p ())
5396 1.1 mrg dump_printf (MSG_MISSED_OPTIMIZATION,
5397 1.1 mrg "permutation op not supported by target.\n");
5398 1.1 mrg return false;
5399 1.1 mrg }
5400 1.1 mrg }
5401 1.1 mrg return true;
5402 1.1 mrg }
5403 1.1 mrg else
5404 1.1 mrg {
5405  1.1  mrg 	  /* If length is not equal to 3 then only a power of 2 is supported.  */
5406 1.1 mrg gcc_assert (pow2p_hwi (count));
5407 1.1 mrg poly_uint64 nelt = GET_MODE_NUNITS (mode);
5408 1.1 mrg
5409 1.1 mrg /* The encoding has 2 interleaved stepped patterns. */
5410 1.1 mrg vec_perm_builder sel (nelt, 2, 3);
5411 1.1 mrg sel.quick_grow (6);
5412 1.1 mrg for (i = 0; i < 3; i++)
5413 1.1 mrg {
5414 1.1 mrg sel[i * 2] = i;
5415 1.1 mrg sel[i * 2 + 1] = i + nelt;
5416 1.1 mrg }
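	  /* The encoding {0, nelt, 1, nelt + 1, 2, nelt + 2} built above
	     expands to the interleave-high selector {0, nelt, 1, nelt + 1, ...};
	     adding nelt/2 below gives the corresponding interleave-low
	     selector {nelt/2, nelt*3/2, nelt/2 + 1, ...}.  */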
5417 1.1 mrg vec_perm_indices indices (sel, 2, nelt);
5418 1.1 mrg if (can_vec_perm_const_p (mode, indices))
5419 1.1 mrg {
5420 1.1 mrg for (i = 0; i < 6; i++)
5421 1.1 mrg sel[i] += exact_div (nelt, 2);
5422 1.1 mrg indices.new_vector (sel, 2, nelt);
5423 1.1 mrg if (can_vec_perm_const_p (mode, indices))
5424 1.1 mrg return true;
5425 1.1 mrg }
5426 1.1 mrg }
5427 1.1 mrg }
5428 1.1 mrg
5429 1.1 mrg if (dump_enabled_p ())
5430 1.1 mrg dump_printf (MSG_MISSED_OPTIMIZATION,
5431 1.1 mrg "permutation op not supported by target.\n");
5432 1.1 mrg return false;
5433 1.1 mrg }
5434 1.1 mrg
5435 1.1 mrg
5436 1.1 mrg /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5437 1.1 mrg type VECTYPE. MASKED_P says whether the masked form is needed. */
5438 1.1 mrg
5439 1.1 mrg bool
5440 1.1 mrg vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5441 1.1 mrg bool masked_p)
5442 1.1 mrg {
5443 1.1 mrg if (masked_p)
5444 1.1 mrg return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5445 1.1 mrg vec_mask_store_lanes_optab,
5446 1.1 mrg vectype, count);
5447 1.1 mrg else
5448 1.1 mrg return vect_lanes_optab_supported_p ("vec_store_lanes",
5449 1.1 mrg vec_store_lanes_optab,
5450 1.1 mrg vectype, count);
5451 1.1 mrg }
5452 1.1 mrg
5453 1.1 mrg
5454 1.1 mrg /* Function vect_permute_store_chain.
5455 1.1 mrg
5456 1.1 mrg Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5457 1.1 mrg a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5458 1.1 mrg the data correctly for the stores. Return the final references for stores
5459 1.1 mrg in RESULT_CHAIN.
5460 1.1 mrg
5461 1.1 mrg E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5462 1.1 mrg The input is 4 vectors each containing 8 elements. We assign a number to
5463 1.1 mrg each element, the input sequence is:
5464 1.1 mrg
5465 1.1 mrg 1st vec: 0 1 2 3 4 5 6 7
5466 1.1 mrg 2nd vec: 8 9 10 11 12 13 14 15
5467 1.1 mrg 3rd vec: 16 17 18 19 20 21 22 23
5468 1.1 mrg 4th vec: 24 25 26 27 28 29 30 31
5469 1.1 mrg
5470 1.1 mrg The output sequence should be:
5471 1.1 mrg
5472 1.1 mrg 1st vec: 0 8 16 24 1 9 17 25
5473 1.1 mrg 2nd vec: 2 10 18 26 3 11 19 27
5474  1.1  mrg 	3rd vec:   4 12 20 28  5 13 21 29
5475 1.1 mrg 4th vec: 6 14 22 30 7 15 23 31
5476 1.1 mrg
5477 1.1 mrg i.e., we interleave the contents of the four vectors in their order.
5478 1.1 mrg
5479 1.1 mrg We use interleave_high/low instructions to create such output. The input of
5480 1.1 mrg each interleave_high/low operation is two vectors:
5481 1.1 mrg 1st vec 2nd vec
5482 1.1 mrg 0 1 2 3 4 5 6 7
5483 1.1 mrg the even elements of the result vector are obtained left-to-right from the
5484 1.1 mrg high/low elements of the first vector. The odd elements of the result are
5485 1.1 mrg obtained left-to-right from the high/low elements of the second vector.
5486 1.1 mrg The output of interleave_high will be: 0 4 1 5
5487 1.1 mrg and of interleave_low: 2 6 3 7
5488 1.1 mrg
5489 1.1 mrg
5490 1.1 mrg The permutation is done in log LENGTH stages. In each stage interleave_high
5491 1.1 mrg and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5492 1.1 mrg where the first argument is taken from the first half of DR_CHAIN and the
5493  1.1  mrg    second argument from its second half.
5494 1.1 mrg In our example,
5495 1.1 mrg
5496 1.1 mrg I1: interleave_high (1st vec, 3rd vec)
5497 1.1 mrg I2: interleave_low (1st vec, 3rd vec)
5498 1.1 mrg I3: interleave_high (2nd vec, 4th vec)
5499 1.1 mrg I4: interleave_low (2nd vec, 4th vec)
5500 1.1 mrg
5501 1.1 mrg The output for the first stage is:
5502 1.1 mrg
5503 1.1 mrg I1: 0 16 1 17 2 18 3 19
5504 1.1 mrg I2: 4 20 5 21 6 22 7 23
5505 1.1 mrg I3: 8 24 9 25 10 26 11 27
5506 1.1 mrg I4: 12 28 13 29 14 30 15 31
5507 1.1 mrg
5508 1.1 mrg The output of the second stage, i.e. the final result is:
5509 1.1 mrg
5510 1.1 mrg I1: 0 8 16 24 1 9 17 25
5511 1.1 mrg I2: 2 10 18 26 3 11 19 27
5512  1.1  mrg 	I3:  4 12 20 28  5 13 21 29
5513 1.1 mrg I4: 6 14 22 30 7 15 23 31. */
5514 1.1 mrg
5515 1.1 mrg void
5516 1.1 mrg vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5517 1.1 mrg unsigned int length,
5518 1.1 mrg stmt_vec_info stmt_info,
5519 1.1 mrg gimple_stmt_iterator *gsi,
5520 1.1 mrg vec<tree> *result_chain)
5521 1.1 mrg {
5522 1.1 mrg tree vect1, vect2, high, low;
5523 1.1 mrg gimple *perm_stmt;
5524 1.1 mrg tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5525 1.1 mrg tree perm_mask_low, perm_mask_high;
5526 1.1 mrg tree data_ref;
5527 1.1 mrg tree perm3_mask_low, perm3_mask_high;
5528 1.1 mrg unsigned int i, j, n, log_length = exact_log2 (length);
5529 1.1 mrg
5530 1.1 mrg result_chain->quick_grow (length);
5531 1.1 mrg memcpy (result_chain->address (), dr_chain.address (),
5532 1.1 mrg length * sizeof (tree));
5533 1.1 mrg
5534 1.1 mrg if (length == 3)
5535 1.1 mrg {
5536 1.1 mrg /* vect_grouped_store_supported ensures that this is constant. */
5537 1.1 mrg unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5538 1.1 mrg unsigned int j0 = 0, j1 = 0, j2 = 0;
5539 1.1 mrg
5540 1.1 mrg vec_perm_builder sel (nelt, nelt, 1);
5541 1.1 mrg sel.quick_grow (nelt);
5542 1.1 mrg vec_perm_indices indices;
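      /* Illustration only: with nelt == 8 (the VF-8 example in the function
	 comment) the first iteration (j == 0) uses
	   perm3_mask_low  = {0, 8, _, 1, 9, _, 2, 10}
	   perm3_mask_high = {0, 1, 8, 3, 4, 9, 6, 7}
	 where "_" marks a don't-care slot (encoded as 0) that the second
	 permutation fills from dr_chain[2].  */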
5543 1.1 mrg for (j = 0; j < 3; j++)
5544 1.1 mrg {
5545 1.1 mrg int nelt0 = ((3 - j) * nelt) % 3;
5546 1.1 mrg int nelt1 = ((3 - j) * nelt + 1) % 3;
5547 1.1 mrg int nelt2 = ((3 - j) * nelt + 2) % 3;
5548 1.1 mrg
5549 1.1 mrg for (i = 0; i < nelt; i++)
5550 1.1 mrg {
5551 1.1 mrg if (3 * i + nelt0 < nelt)
5552 1.1 mrg sel[3 * i + nelt0] = j0++;
5553 1.1 mrg if (3 * i + nelt1 < nelt)
5554 1.1 mrg sel[3 * i + nelt1] = nelt + j1++;
5555 1.1 mrg if (3 * i + nelt2 < nelt)
5556 1.1 mrg sel[3 * i + nelt2] = 0;
5557 1.1 mrg }
5558 1.1 mrg indices.new_vector (sel, 2, nelt);
5559 1.1 mrg perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5560 1.1 mrg
5561 1.1 mrg for (i = 0; i < nelt; i++)
5562 1.1 mrg {
5563 1.1 mrg if (3 * i + nelt0 < nelt)
5564 1.1 mrg sel[3 * i + nelt0] = 3 * i + nelt0;
5565 1.1 mrg if (3 * i + nelt1 < nelt)
5566 1.1 mrg sel[3 * i + nelt1] = 3 * i + nelt1;
5567 1.1 mrg if (3 * i + nelt2 < nelt)
5568 1.1 mrg sel[3 * i + nelt2] = nelt + j2++;
5569 1.1 mrg }
5570 1.1 mrg indices.new_vector (sel, 2, nelt);
5571 1.1 mrg perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5572 1.1 mrg
5573 1.1 mrg vect1 = dr_chain[0];
5574 1.1 mrg vect2 = dr_chain[1];
5575 1.1 mrg
5576 1.1 mrg /* Create interleaving stmt:
5577 1.1 mrg low = VEC_PERM_EXPR <vect1, vect2,
5578 1.1 mrg {j, nelt, *, j + 1, nelt + j + 1, *,
5579 1.1 mrg j + 2, nelt + j + 2, *, ...}> */
5580 1.1 mrg data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5581 1.1 mrg perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5582 1.1 mrg vect2, perm3_mask_low);
5583 1.1 mrg vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5584 1.1 mrg
5585 1.1 mrg vect1 = data_ref;
5586 1.1 mrg vect2 = dr_chain[2];
5587 1.1 mrg /* Create interleaving stmt:
5588 1.1 mrg low = VEC_PERM_EXPR <vect1, vect2,
5589 1.1 mrg {0, 1, nelt + j, 3, 4, nelt + j + 1,
5590 1.1 mrg 6, 7, nelt + j + 2, ...}> */
5591 1.1 mrg data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5592 1.1 mrg perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5593 1.1 mrg vect2, perm3_mask_high);
5594 1.1 mrg vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5595 1.1 mrg (*result_chain)[j] = data_ref;
5596 1.1 mrg }
5597 1.1 mrg }
5598 1.1 mrg else
5599 1.1 mrg {
5600  1.1  mrg       /* If length is not equal to 3 then only a power of 2 is supported.  */
5601 1.1 mrg gcc_assert (pow2p_hwi (length));
5602 1.1 mrg
5603 1.1 mrg /* The encoding has 2 interleaved stepped patterns. */
5604 1.1 mrg poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5605 1.1 mrg vec_perm_builder sel (nelt, 2, 3);
5606 1.1 mrg sel.quick_grow (6);
5607 1.1 mrg for (i = 0; i < 3; i++)
5608 1.1 mrg {
5609 1.1 mrg sel[i * 2] = i;
5610 1.1 mrg sel[i * 2 + 1] = i + nelt;
5611 1.1 mrg }
5612 1.1 mrg vec_perm_indices indices (sel, 2, nelt);
5613 1.1 mrg perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5614 1.1 mrg
5615 1.1 mrg for (i = 0; i < 6; i++)
5616 1.1 mrg sel[i] += exact_div (nelt, 2);
5617 1.1 mrg indices.new_vector (sel, 2, nelt);
5618 1.1 mrg perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5619 1.1 mrg
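      /* For the VF-8 example in the function comment (nelt == 8) this gives
	 perm_mask_high = {0, 8, 1, 9, 2, 10, 3, 11} and
	 perm_mask_low  = {4, 12, 5, 13, 6, 14, 7, 15}.  */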
5620 1.1 mrg for (i = 0, n = log_length; i < n; i++)
5621 1.1 mrg {
5622 1.1 mrg for (j = 0; j < length/2; j++)
5623 1.1 mrg {
5624 1.1 mrg vect1 = dr_chain[j];
5625 1.1 mrg vect2 = dr_chain[j+length/2];
5626 1.1 mrg
5627 1.1 mrg /* Create interleaving stmt:
5628 1.1 mrg high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5629 1.1 mrg ...}> */
5630 1.1 mrg high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5631 1.1 mrg perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5632 1.1 mrg vect2, perm_mask_high);
5633 1.1 mrg vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5634 1.1 mrg (*result_chain)[2*j] = high;
5635 1.1 mrg
5636 1.1 mrg /* Create interleaving stmt:
5637 1.1 mrg low = VEC_PERM_EXPR <vect1, vect2,
5638 1.1 mrg {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5639 1.1 mrg ...}> */
5640 1.1 mrg low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5641 1.1 mrg perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5642 1.1 mrg vect2, perm_mask_low);
5643 1.1 mrg vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5644 1.1 mrg (*result_chain)[2*j+1] = low;
5645 1.1 mrg }
5646 1.1 mrg memcpy (dr_chain.address (), result_chain->address (),
5647 1.1 mrg length * sizeof (tree));
5648 1.1 mrg }
5649 1.1 mrg }
5650 1.1 mrg }
5651 1.1 mrg
5652 1.1 mrg /* Function vect_setup_realignment
5653 1.1 mrg
5654 1.1 mrg This function is called when vectorizing an unaligned load using
5655 1.1 mrg the dr_explicit_realign[_optimized] scheme.
5656 1.1 mrg This function generates the following code at the loop prolog:
5657 1.1 mrg
5658 1.1 mrg p = initial_addr;
5659 1.1 mrg x msq_init = *(floor(p)); # prolog load
5660 1.1 mrg realignment_token = call target_builtin;
5661 1.1 mrg loop:
5662 1.1 mrg x msq = phi (msq_init, ---)
5663 1.1 mrg
5664 1.1 mrg The stmts marked with x are generated only for the case of
5665 1.1 mrg dr_explicit_realign_optimized.
5666 1.1 mrg
5667 1.1 mrg The code above sets up a new (vector) pointer, pointing to the first
5668 1.1 mrg location accessed by STMT_INFO, and a "floor-aligned" load using that
5669 1.1 mrg pointer. It also generates code to compute the "realignment-token"
5670 1.1 mrg (if the relevant target hook was defined), and creates a phi-node at the
5671 1.1 mrg loop-header bb whose arguments are the result of the prolog-load (created
5672 1.1 mrg by this function) and the result of a load that takes place in the loop
5673 1.1 mrg (to be created by the caller to this function).
5674 1.1 mrg
5675 1.1 mrg For the case of dr_explicit_realign_optimized:
5676 1.1 mrg The caller to this function uses the phi-result (msq) to create the
5677 1.1 mrg realignment code inside the loop, and sets up the missing phi argument,
5678 1.1 mrg as follows:
5679 1.1 mrg loop:
5680 1.1 mrg msq = phi (msq_init, lsq)
5681 1.1 mrg lsq = *(floor(p')); # load in loop
5682 1.1 mrg result = realign_load (msq, lsq, realignment_token);
5683 1.1 mrg
5684 1.1 mrg For the case of dr_explicit_realign:
5685 1.1 mrg loop:
5686 1.1 mrg msq = *(floor(p)); # load in loop
5687 1.1 mrg p' = p + (VS-1);
5688 1.1 mrg lsq = *(floor(p')); # load in loop
5689 1.1 mrg result = realign_load (msq, lsq, realignment_token);
5690 1.1 mrg
5691 1.1 mrg Input:
5692 1.1 mrg STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5693 1.1 mrg a memory location that may be unaligned.
5694 1.1 mrg BSI - place where new code is to be inserted.
5695 1.1 mrg ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5696 1.1 mrg is used.
5697 1.1 mrg
5698 1.1 mrg Output:
5699 1.1 mrg REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5700 1.1 mrg target hook, if defined.
5701 1.1 mrg Return value - the result of the loop-header phi node. */
5702 1.1 mrg
5703 1.1 mrg tree
5704 1.1 mrg vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5705 1.1 mrg gimple_stmt_iterator *gsi, tree *realignment_token,
5706 1.1 mrg enum dr_alignment_support alignment_support_scheme,
5707 1.1 mrg tree init_addr,
5708 1.1 mrg class loop **at_loop)
5709 1.1 mrg {
5710 1.1 mrg tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5711 1.1 mrg loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5712 1.1 mrg dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5713 1.1 mrg struct data_reference *dr = dr_info->dr;
5714 1.1 mrg class loop *loop = NULL;
5715 1.1 mrg edge pe = NULL;
5716 1.1 mrg tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5717 1.1 mrg tree vec_dest;
5718 1.1 mrg gimple *inc;
5719 1.1 mrg tree ptr;
5720 1.1 mrg tree data_ref;
5721 1.1 mrg basic_block new_bb;
5722 1.1 mrg tree msq_init = NULL_TREE;
5723 1.1 mrg tree new_temp;
5724 1.1 mrg gphi *phi_stmt;
5725 1.1 mrg tree msq = NULL_TREE;
5726 1.1 mrg gimple_seq stmts = NULL;
5727 1.1 mrg bool compute_in_loop = false;
5728 1.1 mrg bool nested_in_vect_loop = false;
5729 1.1 mrg class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5730 1.1 mrg class loop *loop_for_initial_load = NULL;
5731 1.1 mrg
5732 1.1 mrg if (loop_vinfo)
5733 1.1 mrg {
5734 1.1 mrg loop = LOOP_VINFO_LOOP (loop_vinfo);
5735 1.1 mrg nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5736 1.1 mrg }
5737 1.1 mrg
5738 1.1 mrg gcc_assert (alignment_support_scheme == dr_explicit_realign
5739 1.1 mrg || alignment_support_scheme == dr_explicit_realign_optimized);
5740 1.1 mrg
5741 1.1 mrg /* We need to generate three things:
5742 1.1 mrg 1. the misalignment computation
5743 1.1 mrg 2. the extra vector load (for the optimized realignment scheme).
5744 1.1 mrg 3. the phi node for the two vectors from which the realignment is
5745 1.1 mrg done (for the optimized realignment scheme). */
5746 1.1 mrg
5747 1.1 mrg /* 1. Determine where to generate the misalignment computation.
5748 1.1 mrg
5749 1.1 mrg If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5750 1.1 mrg calculation will be generated by this function, outside the loop (in the
5751 1.1 mrg preheader). Otherwise, INIT_ADDR had already been computed for us by the
5752 1.1 mrg caller, inside the loop.
5753 1.1 mrg
5754 1.1 mrg Background: If the misalignment remains fixed throughout the iterations of
5755 1.1 mrg the loop, then both realignment schemes are applicable, and also the
5756 1.1 mrg misalignment computation can be done outside LOOP. This is because we are
5757 1.1 mrg vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5758 1.1 mrg are a multiple of VS (the Vector Size), and therefore the misalignment in
5759 1.1 mrg different vectorized LOOP iterations is always the same.
5760 1.1 mrg The problem arises only if the memory access is in an inner-loop nested
5761 1.1 mrg inside LOOP, which is now being vectorized using outer-loop vectorization.
5762 1.1 mrg This is the only case when the misalignment of the memory access may not
5763 1.1 mrg remain fixed throughout the iterations of the inner-loop (as explained in
5764 1.1 mrg detail in vect_supportable_dr_alignment). In this case, not only is the
5765 1.1 mrg optimized realignment scheme not applicable, but also the misalignment
5766 1.1 mrg computation (and generation of the realignment token that is passed to
5767  1.1  mrg    REALIGN_LOAD) has to be done inside the loop.
5768 1.1 mrg
5769 1.1 mrg In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5770 1.1 mrg or not, which in turn determines if the misalignment is computed inside
5771 1.1 mrg the inner-loop, or outside LOOP. */
5772 1.1 mrg
5773 1.1 mrg if (init_addr != NULL_TREE || !loop_vinfo)
5774 1.1 mrg {
5775 1.1 mrg compute_in_loop = true;
5776 1.1 mrg gcc_assert (alignment_support_scheme == dr_explicit_realign);
5777 1.1 mrg }
5778 1.1 mrg
5779 1.1 mrg
5780 1.1 mrg /* 2. Determine where to generate the extra vector load.
5781 1.1 mrg
5782 1.1 mrg For the optimized realignment scheme, instead of generating two vector
5783 1.1 mrg loads in each iteration, we generate a single extra vector load in the
5784 1.1 mrg preheader of the loop, and in each iteration reuse the result of the
5785 1.1 mrg vector load from the previous iteration. In case the memory access is in
5786 1.1 mrg an inner-loop nested inside LOOP, which is now being vectorized using
5787 1.1 mrg outer-loop vectorization, we need to determine whether this initial vector
5788 1.1 mrg load should be generated at the preheader of the inner-loop, or can be
5789 1.1 mrg generated at the preheader of LOOP. If the memory access has no evolution
5790 1.1 mrg in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5791 1.1 mrg to be generated inside LOOP (in the preheader of the inner-loop). */
5792 1.1 mrg
5793 1.1 mrg if (nested_in_vect_loop)
5794 1.1 mrg {
5795 1.1 mrg tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5796 1.1 mrg bool invariant_in_outerloop =
5797 1.1 mrg (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5798 1.1 mrg loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5799 1.1 mrg }
5800 1.1 mrg else
5801 1.1 mrg loop_for_initial_load = loop;
5802 1.1 mrg if (at_loop)
5803 1.1 mrg *at_loop = loop_for_initial_load;
5804 1.1 mrg
5805 1.1 mrg if (loop_for_initial_load)
5806 1.1 mrg pe = loop_preheader_edge (loop_for_initial_load);
5807 1.1 mrg
5808 1.1 mrg /* 3. For the case of the optimized realignment, create the first vector
5809 1.1 mrg load at the loop preheader. */
5810 1.1 mrg
5811 1.1 mrg if (alignment_support_scheme == dr_explicit_realign_optimized)
5812 1.1 mrg {
5813 1.1 mrg /* Create msq_init = *(floor(p1)) in the loop preheader */
5814 1.1 mrg gassign *new_stmt;
5815 1.1 mrg
5816 1.1 mrg gcc_assert (!compute_in_loop);
5817 1.1 mrg vec_dest = vect_create_destination_var (scalar_dest, vectype);
5818 1.1 mrg ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5819 1.1 mrg loop_for_initial_load, NULL_TREE,
5820 1.1 mrg &init_addr, NULL, &inc, true);
5821 1.1 mrg if (TREE_CODE (ptr) == SSA_NAME)
5822 1.1 mrg new_temp = copy_ssa_name (ptr);
5823 1.1 mrg else
5824 1.1 mrg new_temp = make_ssa_name (TREE_TYPE (ptr));
5825 1.1 mrg poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5826 1.1 mrg tree type = TREE_TYPE (ptr);
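      /* Compute the floor-aligned address: new_temp = ptr & -align,
	 i.e. PTR rounded down to a multiple of the target alignment.  */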
5827 1.1 mrg new_stmt = gimple_build_assign
5828 1.1 mrg (new_temp, BIT_AND_EXPR, ptr,
5829 1.1 mrg fold_build2 (MINUS_EXPR, type,
5830 1.1 mrg build_int_cst (type, 0),
5831 1.1 mrg build_int_cst (type, align)));
5832 1.1 mrg new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5833 1.1 mrg gcc_assert (!new_bb);
5834 1.1 mrg data_ref
5835 1.1 mrg = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5836 1.1 mrg build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5837 1.1 mrg vect_copy_ref_info (data_ref, DR_REF (dr));
5838 1.1 mrg new_stmt = gimple_build_assign (vec_dest, data_ref);
5839 1.1 mrg new_temp = make_ssa_name (vec_dest, new_stmt);
5840 1.1 mrg gimple_assign_set_lhs (new_stmt, new_temp);
5841 1.1 mrg if (pe)
5842 1.1 mrg {
5843 1.1 mrg new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5844 1.1 mrg gcc_assert (!new_bb);
5845 1.1 mrg }
5846 1.1 mrg else
5847 1.1 mrg gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5848 1.1 mrg
5849 1.1 mrg msq_init = gimple_assign_lhs (new_stmt);
5850 1.1 mrg }
5851 1.1 mrg
5852 1.1 mrg /* 4. Create realignment token using a target builtin, if available.
5853 1.1 mrg It is done either inside the containing loop, or before LOOP (as
5854 1.1 mrg determined above). */
5855 1.1 mrg
5856 1.1 mrg if (targetm.vectorize.builtin_mask_for_load)
5857 1.1 mrg {
5858 1.1 mrg gcall *new_stmt;
5859 1.1 mrg tree builtin_decl;
5860 1.1 mrg
5861  1.1  mrg       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5862 1.1 mrg if (!init_addr)
5863 1.1 mrg {
5864 1.1 mrg /* Generate the INIT_ADDR computation outside LOOP. */
5865 1.1 mrg init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5866 1.1 mrg stmt_info, &stmts,
5867 1.1 mrg NULL_TREE);
5868 1.1 mrg if (loop)
5869 1.1 mrg {
5870 1.1 mrg pe = loop_preheader_edge (loop);
5871 1.1 mrg new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5872 1.1 mrg gcc_assert (!new_bb);
5873 1.1 mrg }
5874 1.1 mrg else
5875 1.1 mrg gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5876 1.1 mrg }
5877 1.1 mrg
5878 1.1 mrg builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5879 1.1 mrg new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5880 1.1 mrg vec_dest =
5881 1.1 mrg vect_create_destination_var (scalar_dest,
5882 1.1 mrg gimple_call_return_type (new_stmt));
5883 1.1 mrg new_temp = make_ssa_name (vec_dest, new_stmt);
5884 1.1 mrg gimple_call_set_lhs (new_stmt, new_temp);
5885 1.1 mrg
5886 1.1 mrg if (compute_in_loop)
5887 1.1 mrg gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5888 1.1 mrg else
5889 1.1 mrg {
5890 1.1 mrg /* Generate the misalignment computation outside LOOP. */
5891 1.1 mrg pe = loop_preheader_edge (loop);
5892 1.1 mrg new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5893 1.1 mrg gcc_assert (!new_bb);
5894 1.1 mrg }
5895 1.1 mrg
5896 1.1 mrg *realignment_token = gimple_call_lhs (new_stmt);
5897 1.1 mrg
5898 1.1 mrg /* The result of the CALL_EXPR to this builtin is determined from
5899 1.1 mrg the value of the parameter and no global variables are touched
5900 1.1 mrg which makes the builtin a "const" function. Requiring the
5901 1.1 mrg builtin to have the "const" attribute makes it unnecessary
5902 1.1 mrg to call mark_call_clobbered. */
5903 1.1 mrg gcc_assert (TREE_READONLY (builtin_decl));
5904 1.1 mrg }
5905 1.1 mrg
5906 1.1 mrg if (alignment_support_scheme == dr_explicit_realign)
5907 1.1 mrg return msq;
5908 1.1 mrg
5909 1.1 mrg gcc_assert (!compute_in_loop);
5910 1.1 mrg gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5911 1.1 mrg
5912 1.1 mrg
5913 1.1 mrg /* 5. Create msq = phi <msq_init, lsq> in loop */
5914 1.1 mrg
5915 1.1 mrg pe = loop_preheader_edge (containing_loop);
5916 1.1 mrg vec_dest = vect_create_destination_var (scalar_dest, vectype);
5917 1.1 mrg msq = make_ssa_name (vec_dest);
5918 1.1 mrg phi_stmt = create_phi_node (msq, containing_loop->header);
5919 1.1 mrg add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5920 1.1 mrg
5921 1.1 mrg return msq;
5922 1.1 mrg }
5923 1.1 mrg
5924 1.1 mrg
5925 1.1 mrg /* Function vect_grouped_load_supported.
5926 1.1 mrg
5927 1.1 mrg COUNT is the size of the load group (the number of statements plus the
5928 1.1 mrg number of gaps). SINGLE_ELEMENT_P is true if there is actually
5929 1.1 mrg only one statement, with a gap of COUNT - 1.
5930 1.1 mrg
5931 1.1 mrg Returns true if a suitable permute exists. */
5932 1.1 mrg
5933 1.1 mrg bool
5934 1.1 mrg vect_grouped_load_supported (tree vectype, bool single_element_p,
5935 1.1 mrg unsigned HOST_WIDE_INT count)
5936 1.1 mrg {
5937 1.1 mrg machine_mode mode = TYPE_MODE (vectype);
5938 1.1 mrg
5939 1.1 mrg /* If this is single-element interleaving with an element distance
5940 1.1 mrg that leaves unused vector loads around punt - we at least create
5941 1.1 mrg very sub-optimal code in that case (and blow up memory,
5942 1.1 mrg see PR65518). */
5943 1.1 mrg if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5944 1.1 mrg {
5945 1.1 mrg if (dump_enabled_p ())
5946 1.1 mrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5947 1.1 mrg "single-element interleaving not supported "
5948  1.1  mrg 			 "for non-adjacent vector loads\n");
5949 1.1 mrg return false;
5950 1.1 mrg }
5951 1.1 mrg
5952 1.1 mrg /* vect_permute_load_chain requires the group size to be equal to 3 or
5953 1.1 mrg be a power of two. */
5954 1.1 mrg if (count != 3 && exact_log2 (count) == -1)
5955 1.1 mrg {
5956 1.1 mrg if (dump_enabled_p ())
5957 1.1 mrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5958 1.1 mrg "the size of the group of accesses"
5959 1.1 mrg " is not a power of 2 or not equal to 3\n");
5960 1.1 mrg return false;
5961 1.1 mrg }
5962 1.1 mrg
5963 1.1 mrg /* Check that the permutation is supported. */
5964 1.1 mrg if (VECTOR_MODE_P (mode))
5965 1.1 mrg {
5966 1.1 mrg unsigned int i, j;
5967 1.1 mrg if (count == 3)
5968 1.1 mrg {
5969 1.1 mrg unsigned int nelt;
5970 1.1 mrg if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5971 1.1 mrg {
5972 1.1 mrg if (dump_enabled_p ())
5973 1.1 mrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5974 1.1 mrg "cannot handle groups of 3 loads for"
5975 1.1 mrg " variable-length vectors\n");
5976 1.1 mrg return false;
5977 1.1 mrg }
5978 1.1 mrg
5979 1.1 mrg vec_perm_builder sel (nelt, nelt, 1);
5980 1.1 mrg sel.quick_grow (nelt);
5981 1.1 mrg vec_perm_indices indices;
5982 1.1 mrg unsigned int k;
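	  /* Illustration only: for nelt == 8 and k == 0 the two selectors
	     tested below are {0, 3, 6, 9, 12, 15, 0, 0} (the trailing zeros
	     are don't-cares) and {0, 1, 2, 3, 4, 5, 10, 13}.  */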
5983 1.1 mrg for (k = 0; k < 3; k++)
5984 1.1 mrg {
5985 1.1 mrg for (i = 0; i < nelt; i++)
5986 1.1 mrg if (3 * i + k < 2 * nelt)
5987 1.1 mrg sel[i] = 3 * i + k;
5988 1.1 mrg else
5989 1.1 mrg sel[i] = 0;
5990 1.1 mrg indices.new_vector (sel, 2, nelt);
5991 1.1 mrg if (!can_vec_perm_const_p (mode, indices))
5992 1.1 mrg {
5993 1.1 mrg if (dump_enabled_p ())
5994 1.1 mrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5995 1.1 mrg "shuffle of 3 loads is not supported by"
5996 1.1 mrg " target\n");
5997 1.1 mrg return false;
5998 1.1 mrg }
5999 1.1 mrg for (i = 0, j = 0; i < nelt; i++)
6000 1.1 mrg if (3 * i + k < 2 * nelt)
6001 1.1 mrg sel[i] = i;
6002 1.1 mrg else
6003 1.1 mrg sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6004 1.1 mrg indices.new_vector (sel, 2, nelt);
6005 1.1 mrg if (!can_vec_perm_const_p (mode, indices))
6006 1.1 mrg {
6007 1.1 mrg if (dump_enabled_p ())
6008 1.1 mrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6009 1.1 mrg "shuffle of 3 loads is not supported by"
6010 1.1 mrg " target\n");
6011 1.1 mrg return false;
6012 1.1 mrg }
6013 1.1 mrg }
6014 1.1 mrg return true;
6015 1.1 mrg }
6016 1.1 mrg else
6017 1.1 mrg {
6018  1.1  mrg 	  /* If length is not equal to 3 then only a power of 2 is supported.  */
6019 1.1 mrg gcc_assert (pow2p_hwi (count));
6020 1.1 mrg poly_uint64 nelt = GET_MODE_NUNITS (mode);
6021 1.1 mrg
6022 1.1 mrg /* The encoding has a single stepped pattern. */
6023 1.1 mrg vec_perm_builder sel (nelt, 1, 3);
6024 1.1 mrg sel.quick_grow (3);
6025 1.1 mrg for (i = 0; i < 3; i++)
6026 1.1 mrg sel[i] = i * 2;
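	  /* The stepped encoding {0, 2, 4} expands to the extract-even
	     selector {0, 2, 4, ..., 2 * nelt - 2}; the second test below
	     uses {1, 3, 5, ...} for extract-odd.  */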
6027 1.1 mrg vec_perm_indices indices (sel, 2, nelt);
6028 1.1 mrg if (can_vec_perm_const_p (mode, indices))
6029 1.1 mrg {
6030 1.1 mrg for (i = 0; i < 3; i++)
6031 1.1 mrg sel[i] = i * 2 + 1;
6032 1.1 mrg indices.new_vector (sel, 2, nelt);
6033 1.1 mrg if (can_vec_perm_const_p (mode, indices))
6034 1.1 mrg return true;
6035 1.1 mrg }
6036 1.1 mrg }
6037 1.1 mrg }
6038 1.1 mrg
6039 1.1 mrg if (dump_enabled_p ())
6040 1.1 mrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6041 1.1 mrg "extract even/odd not supported by target\n");
6042 1.1 mrg return false;
6043 1.1 mrg }
6044 1.1 mrg
6045  1.1  mrg /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
6046 1.1 mrg type VECTYPE. MASKED_P says whether the masked form is needed. */
6047 1.1 mrg
6048 1.1 mrg bool
6049 1.1 mrg vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6050 1.1 mrg bool masked_p)
6051 1.1 mrg {
6052 1.1 mrg if (masked_p)
6053 1.1 mrg return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6054 1.1 mrg vec_mask_load_lanes_optab,
6055 1.1 mrg vectype, count);
6056 1.1 mrg else
6057 1.1 mrg return vect_lanes_optab_supported_p ("vec_load_lanes",
6058 1.1 mrg vec_load_lanes_optab,
6059 1.1 mrg vectype, count);
6060 1.1 mrg }
6061 1.1 mrg
6062 1.1 mrg /* Function vect_permute_load_chain.
6063 1.1 mrg
6064 1.1 mrg Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6065 1.1 mrg a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6066 1.1 mrg the input data correctly. Return the final references for loads in
6067 1.1 mrg RESULT_CHAIN.
6068 1.1 mrg
6069 1.1 mrg E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6070 1.1 mrg The input is 4 vectors each containing 8 elements. We assign a number to each
6071 1.1 mrg element, the input sequence is:
6072 1.1 mrg
6073 1.1 mrg 1st vec: 0 1 2 3 4 5 6 7
6074 1.1 mrg 2nd vec: 8 9 10 11 12 13 14 15
6075 1.1 mrg 3rd vec: 16 17 18 19 20 21 22 23
6076 1.1 mrg 4th vec: 24 25 26 27 28 29 30 31
6077 1.1 mrg
6078 1.1 mrg The output sequence should be:
6079 1.1 mrg
6080 1.1 mrg 1st vec: 0 4 8 12 16 20 24 28
6081 1.1 mrg 2nd vec: 1 5 9 13 17 21 25 29
6082 1.1 mrg 3rd vec: 2 6 10 14 18 22 26 30
6083 1.1 mrg 4th vec: 3 7 11 15 19 23 27 31
6084 1.1 mrg
6085 1.1 mrg i.e., the first output vector should contain the first elements of each
6086 1.1 mrg interleaving group, etc.
6087 1.1 mrg
6088 1.1 mrg We use extract_even/odd instructions to create such output. The input of
6089 1.1 mrg each extract_even/odd operation is two vectors
6090 1.1 mrg 1st vec 2nd vec
6091 1.1 mrg 0 1 2 3 4 5 6 7
6092 1.1 mrg
6093 1.1 mrg and the output is the vector of extracted even/odd elements. The output of
6094 1.1 mrg extract_even will be: 0 2 4 6
6095 1.1 mrg and of extract_odd: 1 3 5 7
6096 1.1 mrg
6097 1.1 mrg
6098 1.1 mrg The permutation is done in log LENGTH stages. In each stage extract_even
6099 1.1 mrg and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6100 1.1 mrg their order. In our example,
6101 1.1 mrg
6102 1.1 mrg E1: extract_even (1st vec, 2nd vec)
6103 1.1 mrg E2: extract_odd (1st vec, 2nd vec)
6104 1.1 mrg E3: extract_even (3rd vec, 4th vec)
6105 1.1 mrg E4: extract_odd (3rd vec, 4th vec)
6106 1.1 mrg
6107 1.1 mrg The output for the first stage will be:
6108 1.1 mrg
6109 1.1 mrg E1: 0 2 4 6 8 10 12 14
6110 1.1 mrg E2: 1 3 5 7 9 11 13 15
6111 1.1 mrg E3: 16 18 20 22 24 26 28 30
6112 1.1 mrg E4: 17 19 21 23 25 27 29 31
6113 1.1 mrg
6114 1.1 mrg In order to proceed and create the correct sequence for the next stage (or
6115 1.1 mrg for the correct output, if the second stage is the last one, as in our
6116 1.1 mrg example), we first put the output of extract_even operation and then the
6117 1.1 mrg output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6118 1.1 mrg The input for the second stage is:
6119 1.1 mrg
6120 1.1 mrg 1st vec (E1): 0 2 4 6 8 10 12 14
6121 1.1 mrg 2nd vec (E3): 16 18 20 22 24 26 28 30
6122 1.1 mrg 3rd vec (E2): 1 3 5 7 9 11 13 15
6123 1.1 mrg 4th vec (E4): 17 19 21 23 25 27 29 31
6124 1.1 mrg
6125 1.1 mrg The output of the second stage:
6126 1.1 mrg
6127 1.1 mrg E1: 0 4 8 12 16 20 24 28
6128 1.1 mrg E2: 2 6 10 14 18 22 26 30
6129 1.1 mrg E3: 1 5 9 13 17 21 25 29
6130 1.1 mrg E4: 3 7 11 15 19 23 27 31
6131 1.1 mrg
6132 1.1 mrg And RESULT_CHAIN after reordering:
6133 1.1 mrg
6134 1.1 mrg 1st vec (E1): 0 4 8 12 16 20 24 28
6135 1.1 mrg 2nd vec (E3): 1 5 9 13 17 21 25 29
6136 1.1 mrg 3rd vec (E2): 2 6 10 14 18 22 26 30
6137 1.1 mrg 4th vec (E4): 3 7 11 15 19 23 27 31. */
6138 1.1 mrg
6139 1.1 mrg static void
6140 1.1 mrg vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6141 1.1 mrg unsigned int length,
6142 1.1 mrg stmt_vec_info stmt_info,
6143 1.1 mrg gimple_stmt_iterator *gsi,
6144 1.1 mrg vec<tree> *result_chain)
6145 1.1 mrg {
6146 1.1 mrg tree data_ref, first_vect, second_vect;
6147 1.1 mrg tree perm_mask_even, perm_mask_odd;
6148 1.1 mrg tree perm3_mask_low, perm3_mask_high;
6149 1.1 mrg gimple *perm_stmt;
6150 1.1 mrg tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6151 1.1 mrg unsigned int i, j, log_length = exact_log2 (length);
6152 1.1 mrg
6153 1.1 mrg result_chain->quick_grow (length);
6154 1.1 mrg memcpy (result_chain->address (), dr_chain.address (),
6155 1.1 mrg length * sizeof (tree));
6156 1.1 mrg
6157 1.1 mrg if (length == 3)
6158 1.1 mrg {
6159 1.1 mrg /* vect_grouped_load_supported ensures that this is constant. */
6160 1.1 mrg unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6161 1.1 mrg unsigned int k;
6162 1.1 mrg
6163 1.1 mrg vec_perm_builder sel (nelt, nelt, 1);
6164 1.1 mrg sel.quick_grow (nelt);
6165 1.1 mrg vec_perm_indices indices;
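      /* Illustration only: for nelt == 8 and k == 0, perm3_mask_low is
	 {0, 3, 6, 9, 12, 15, 0, 0} (trailing zeros are don't-cares) and
	 perm3_mask_high is {0, 1, 2, 3, 4, 5, 10, 13}; applied in sequence
	 they gather elements 0, 3, 6, ..., 21 of the three-vector group
	 into result_chain[0].  */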
6166 1.1 mrg for (k = 0; k < 3; k++)
6167 1.1 mrg {
6168 1.1 mrg for (i = 0; i < nelt; i++)
6169 1.1 mrg if (3 * i + k < 2 * nelt)
6170 1.1 mrg sel[i] = 3 * i + k;
6171 1.1 mrg else
6172 1.1 mrg sel[i] = 0;
6173 1.1 mrg indices.new_vector (sel, 2, nelt);
6174 1.1 mrg perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6175 1.1 mrg
6176 1.1 mrg for (i = 0, j = 0; i < nelt; i++)
6177 1.1 mrg if (3 * i + k < 2 * nelt)
6178 1.1 mrg sel[i] = i;
6179 1.1 mrg else
6180 1.1 mrg sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6181 1.1 mrg indices.new_vector (sel, 2, nelt);
6182 1.1 mrg perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6183 1.1 mrg
6184 1.1 mrg first_vect = dr_chain[0];
6185 1.1 mrg second_vect = dr_chain[1];
6186 1.1 mrg
6187 1.1 mrg /* Create interleaving stmt (low part of):
6188  1.1  mrg 	     low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6189 1.1 mrg ...}> */
6190 1.1 mrg data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6191 1.1 mrg perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6192 1.1 mrg second_vect, perm3_mask_low);
6193 1.1 mrg vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6194 1.1 mrg
6195  1.1  mrg 	  /* Create interleaving stmt (high part of):
6196  1.1  mrg 	     high = VEC_PERM_EXPR <first_vect, second_vect, {0, 1, 2, ...}>
6197  1.1  mrg 	     where the trailing indices select from second_vect.  */
6198 1.1 mrg first_vect = data_ref;
6199 1.1 mrg second_vect = dr_chain[2];
6200 1.1 mrg data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6201 1.1 mrg perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6202 1.1 mrg second_vect, perm3_mask_high);
6203 1.1 mrg vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6204 1.1 mrg (*result_chain)[k] = data_ref;
6205 1.1 mrg }
6206 1.1 mrg }
6207 1.1 mrg else
6208 1.1 mrg {
6209  1.1  mrg       /* If length is not equal to 3 then only a power of 2 is supported.  */
6210 1.1 mrg gcc_assert (pow2p_hwi (length));
6211 1.1 mrg
6212 1.1 mrg /* The encoding has a single stepped pattern. */
6213 1.1 mrg poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6214 1.1 mrg vec_perm_builder sel (nelt, 1, 3);
6215 1.1 mrg sel.quick_grow (3);
6216 1.1 mrg for (i = 0; i < 3; ++i)
6217 1.1 mrg sel[i] = i * 2;
6218 1.1 mrg vec_perm_indices indices (sel, 2, nelt);
6219 1.1 mrg perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6220 1.1 mrg
6221 1.1 mrg for (i = 0; i < 3; ++i)
6222 1.1 mrg sel[i] = i * 2 + 1;
6223 1.1 mrg indices.new_vector (sel, 2, nelt);
6224 1.1 mrg perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6225 1.1 mrg
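      /* For the VF-8 example in the function comment (nelt == 8) this gives
	 perm_mask_even = {0, 2, 4, 6, 8, 10, 12, 14} and
	 perm_mask_odd  = {1, 3, 5, 7, 9, 11, 13, 15}.  */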
6226 1.1 mrg for (i = 0; i < log_length; i++)
6227 1.1 mrg {
6228 1.1 mrg for (j = 0; j < length; j += 2)
6229 1.1 mrg {
6230 1.1 mrg first_vect = dr_chain[j];
6231 1.1 mrg second_vect = dr_chain[j+1];
6232 1.1 mrg
6233 1.1 mrg /* data_ref = permute_even (first_data_ref, second_data_ref); */
6234 1.1 mrg data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6235 1.1 mrg perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6236 1.1 mrg first_vect, second_vect,
6237 1.1 mrg perm_mask_even);
6238 1.1 mrg vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6239 1.1 mrg (*result_chain)[j/2] = data_ref;
6240 1.1 mrg
6241 1.1 mrg /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6242 1.1 mrg data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6243 1.1 mrg perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6244 1.1 mrg first_vect, second_vect,
6245 1.1 mrg perm_mask_odd);
6246 1.1 mrg vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6247 1.1 mrg (*result_chain)[j/2+length/2] = data_ref;
6248 1.1 mrg }
6249 1.1 mrg memcpy (dr_chain.address (), result_chain->address (),
6250 1.1 mrg length * sizeof (tree));
6251 1.1 mrg }
6252 1.1 mrg }
6253 1.1 mrg }
6254 1.1 mrg
6255 1.1 mrg /* Function vect_shift_permute_load_chain.
6256 1.1 mrg
6257 1.1 mrg Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6258 1.1 mrg sequence of stmts to reorder the input data accordingly.
6259 1.1 mrg Return the final references for loads in RESULT_CHAIN.
6260  1.1  mrg    Return true if successful, false otherwise.
6261 1.1 mrg
6262 1.1 mrg E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6263 1.1 mrg The input is 3 vectors each containing 8 elements. We assign a
6264 1.1 mrg number to each element, the input sequence is:
6265 1.1 mrg
6266 1.1 mrg 1st vec: 0 1 2 3 4 5 6 7
6267 1.1 mrg 2nd vec: 8 9 10 11 12 13 14 15
6268 1.1 mrg 3rd vec: 16 17 18 19 20 21 22 23
6269 1.1 mrg
6270 1.1 mrg The output sequence should be:
6271 1.1 mrg
6272 1.1 mrg 1st vec: 0 3 6 9 12 15 18 21
6273 1.1 mrg 2nd vec: 1 4 7 10 13 16 19 22
6274 1.1 mrg 3rd vec: 2 5 8 11 14 17 20 23
6275 1.1 mrg
6276 1.1 mrg We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6277 1.1 mrg
6278 1.1 mrg First we shuffle all 3 vectors to get correct elements order:
6279 1.1 mrg
6280 1.1 mrg 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6281 1.1 mrg 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6282 1.1 mrg 3rd vec: (16 19 22) (17 20 23) (18 21)
6283 1.1 mrg
6284  1.1  mrg    Next we combine and shift the vectors 3 times:
6285 1.1 mrg
6286 1.1 mrg 1st step:
6287 1.1 mrg shift right by 6 the concatenation of:
6288 1.1 mrg "1st vec" and "2nd vec"
6289 1.1 mrg ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6290 1.1 mrg "2nd vec" and "3rd vec"
6291 1.1 mrg ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6292 1.1 mrg "3rd vec" and "1st vec"
6293 1.1 mrg (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6294 1.1 mrg | New vectors |
6295 1.1 mrg
6296 1.1 mrg So that now new vectors are:
6297 1.1 mrg
6298 1.1 mrg 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6299 1.1 mrg 2nd vec: (10 13) (16 19 22) (17 20 23)
6300 1.1 mrg 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6301 1.1 mrg
6302 1.1 mrg 2nd step:
6303 1.1 mrg shift right by 5 the concatenation of:
6304 1.1 mrg "1st vec" and "3rd vec"
6305 1.1 mrg ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6306 1.1 mrg "2nd vec" and "1st vec"
6307 1.1 mrg (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6308 1.1 mrg "3rd vec" and "2nd vec"
6309 1.1 mrg (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6310 1.1 mrg | New vectors |
6311 1.1 mrg
6312 1.1 mrg So that now new vectors are:
6313 1.1 mrg
6314 1.1 mrg 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6315 1.1 mrg 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6316 1.1 mrg 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6317 1.1 mrg
6318 1.1 mrg 3rd step:
6319 1.1 mrg shift right by 5 the concatenation of:
6320 1.1 mrg "1st vec" and "1st vec"
6321 1.1 mrg ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6322 1.1 mrg shift right by 3 the concatenation of:
6323 1.1 mrg "2nd vec" and "2nd vec"
6324 1.1 mrg (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6325 1.1 mrg | New vectors |
6326 1.1 mrg
6327 1.1 mrg So that now all vectors are READY:
6328 1.1 mrg 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6329 1.1 mrg 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6330 1.1 mrg 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6331 1.1 mrg
6332  1.1  mrg    This algorithm is faster than the one in vect_permute_load_chain if:
6333  1.1  mrg      1. "shift of a concatenation" is faster than general permutation.
6334 1.1 mrg This is usually so.
6335 1.1 mrg 2. The TARGET machine can't execute vector instructions in parallel.
6336  1.1  mrg 	This is because each step of the algorithm depends on the previous one.
6337 1.1 mrg The algorithm in vect_permute_load_chain is much more parallel.
6338 1.1 mrg
6339 1.1 mrg The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6340 1.1 mrg */
6341 1.1 mrg
6342 1.1 mrg static bool
6343 1.1 mrg vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6344 1.1 mrg unsigned int length,
6345 1.1 mrg stmt_vec_info stmt_info,
6346 1.1 mrg gimple_stmt_iterator *gsi,
6347 1.1 mrg vec<tree> *result_chain)
6348 1.1 mrg {
6349 1.1 mrg tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6350 1.1 mrg tree perm2_mask1, perm2_mask2, perm3_mask;
6351 1.1 mrg tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6352 1.1 mrg gimple *perm_stmt;
6353 1.1 mrg
6354 1.1 mrg tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6355 1.1 mrg unsigned int i;
6356 1.1 mrg loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6357 1.1 mrg
6358 1.1 mrg unsigned HOST_WIDE_INT nelt, vf;
6359 1.1 mrg if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6360 1.1 mrg || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6361 1.1 mrg /* Not supported for variable-length vectors. */
6362 1.1 mrg return false;
6363 1.1 mrg
6364 1.1 mrg vec_perm_builder sel (nelt, nelt, 1);
6365 1.1 mrg sel.quick_grow (nelt);
6366 1.1 mrg
6367 1.1 mrg result_chain->quick_grow (length);
6368 1.1 mrg memcpy (result_chain->address (), dr_chain.address (),
6369 1.1 mrg length * sizeof (tree));
6370 1.1 mrg
6371 1.1 mrg if (pow2p_hwi (length) && vf > 4)
6372 1.1 mrg {
6373 1.1 mrg unsigned int j, log_length = exact_log2 (length);
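      /* Generating permutation constant to gather the even elements into the
	 low half and the odd elements into the high half of a vector.
	 For vector length 8 it is {0 2 4 6 1 3 5 7}.  */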
6374 1.1 mrg for (i = 0; i < nelt / 2; ++i)
6375 1.1 mrg sel[i] = i * 2;
6376 1.1 mrg for (i = 0; i < nelt / 2; ++i)
6377 1.1 mrg sel[nelt / 2 + i] = i * 2 + 1;
6378 1.1 mrg vec_perm_indices indices (sel, 2, nelt);
6379 1.1 mrg if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6380 1.1 mrg {
6381 1.1 mrg if (dump_enabled_p ())
6382 1.1 mrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6383  1.1  mrg 			 "shuffle of 2 fields structure is not"
6384  1.1  mrg 			 " supported by target\n");
6385 1.1 mrg return false;
6386 1.1 mrg }
6387 1.1 mrg perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6388 1.1 mrg
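      /* Generating permutation constant to gather the odd elements into the
	 low half and the even elements into the high half of a vector.
	 For vector length 8 it is {1 3 5 7 0 2 4 6}.  */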
6389 1.1 mrg for (i = 0; i < nelt / 2; ++i)
6390 1.1 mrg sel[i] = i * 2 + 1;
6391 1.1 mrg for (i = 0; i < nelt / 2; ++i)
6392 1.1 mrg sel[nelt / 2 + i] = i * 2;
6393 1.1 mrg indices.new_vector (sel, 2, nelt);
6394 1.1 mrg if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6395 1.1 mrg {
6396 1.1 mrg if (dump_enabled_p ())
6397 1.1 mrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6398  1.1  mrg 			 "shuffle of 2 fields structure is not"
6399  1.1  mrg 			 " supported by target\n");
6400 1.1 mrg return false;
6401 1.1 mrg }
6402 1.1 mrg perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6403 1.1 mrg
6404 1.1 mrg /* Generating permutation constant to shift all elements.
6405 1.1 mrg For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6406 1.1 mrg for (i = 0; i < nelt; i++)
6407 1.1 mrg sel[i] = nelt / 2 + i;
6408 1.1 mrg indices.new_vector (sel, 2, nelt);
6409 1.1 mrg if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6410 1.1 mrg {
6411 1.1 mrg if (dump_enabled_p ())
6412 1.1 mrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6413 1.1 mrg "shift permutation is not supported by target\n");
6414 1.1 mrg return false;
6415 1.1 mrg }
6416 1.1 mrg shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6417 1.1 mrg
6418 1.1 mrg /* Generating permutation constant to select vector from 2.
6419 1.1 mrg For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6420 1.1 mrg for (i = 0; i < nelt / 2; i++)
6421 1.1 mrg sel[i] = i;
6422 1.1 mrg for (i = nelt / 2; i < nelt; i++)
6423 1.1 mrg sel[i] = nelt + i;
6424 1.1 mrg indices.new_vector (sel, 2, nelt);
6425 1.1 mrg if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6426 1.1 mrg {
6427 1.1 mrg if (dump_enabled_p ())
6428 1.1 mrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6429 1.1 mrg "select is not supported by target\n");
6430 1.1 mrg return false;
6431 1.1 mrg }
6432 1.1 mrg select_mask = vect_gen_perm_mask_checked (vectype, indices);
6433 1.1 mrg
6434 1.1 mrg for (i = 0; i < log_length; i++)
6435 1.1 mrg {
6436 1.1 mrg for (j = 0; j < length; j += 2)
6437 1.1 mrg {
6438 1.1 mrg first_vect = dr_chain[j];
6439 1.1 mrg second_vect = dr_chain[j + 1];
6440 1.1 mrg
6441 1.1 mrg data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6442 1.1 mrg perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6443 1.1 mrg first_vect, first_vect,
6444 1.1 mrg perm2_mask1);
6445 1.1 mrg vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6446 1.1 mrg vect[0] = data_ref;
6447 1.1 mrg
6448 1.1 mrg data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6449 1.1 mrg perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6450 1.1 mrg second_vect, second_vect,
6451 1.1 mrg perm2_mask2);
6452 1.1 mrg vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6453 1.1 mrg vect[1] = data_ref;
6454 1.1 mrg
6455 1.1 mrg data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6456 1.1 mrg perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6457 1.1 mrg vect[0], vect[1], shift1_mask);
6458 1.1 mrg vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6459 1.1 mrg (*result_chain)[j/2 + length/2] = data_ref;
6460 1.1 mrg
6461 1.1 mrg data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6462 1.1 mrg perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6463 1.1 mrg vect[0], vect[1], select_mask);
6464 1.1 mrg vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6465 1.1 mrg (*result_chain)[j/2] = data_ref;
6466 1.1 mrg }
6467 1.1 mrg memcpy (dr_chain.address (), result_chain->address (),
6468 1.1 mrg length * sizeof (tree));
6469 1.1 mrg }
6470 1.1 mrg return true;
6471 1.1 mrg }
6472 1.1 mrg if (length == 3 && vf > 2)
6473 1.1 mrg {
6474 1.1 mrg unsigned int k = 0, l = 0;
6475 1.1 mrg
6476  1.1  mrg       /* Generating permutation constant to get all elements in right order.
6477 1.1 mrg For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6478 1.1 mrg for (i = 0; i < nelt; i++)
6479 1.1 mrg {
6480 1.1 mrg if (3 * k + (l % 3) >= nelt)
6481 1.1 mrg {
6482 1.1 mrg k = 0;
6483 1.1 mrg l += (3 - (nelt % 3));
6484 1.1 mrg }
6485 1.1 mrg sel[i] = 3 * k + (l % 3);
6486 1.1 mrg k++;
6487 1.1 mrg }
6488 1.1 mrg vec_perm_indices indices (sel, 2, nelt);
6489 1.1 mrg if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6490 1.1 mrg {
6491 1.1 mrg if (dump_enabled_p ())
6492 1.1 mrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6493  1.1  mrg 			 "shuffle of 3 fields structure is not"
6494  1.1  mrg 			 " supported by target\n");
6495 1.1 mrg return false;
6496 1.1 mrg }
6497 1.1 mrg perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6498 1.1 mrg
6499 1.1 mrg /* Generating permutation constant to shift all elements.
6500 1.1 mrg For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6501 1.1 mrg for (i = 0; i < nelt; i++)
6502 1.1 mrg sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6503 1.1 mrg indices.new_vector (sel, 2, nelt);
6504 1.1 mrg if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6505 1.1 mrg {
6506 1.1 mrg if (dump_enabled_p ())
6507 1.1 mrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6508 1.1 mrg "shift permutation is not supported by target\n");
6509 1.1 mrg return false;
6510 1.1 mrg }
6511 1.1 mrg shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6512 1.1 mrg
6513 1.1 mrg /* Generating permutation constant to shift all elements.
6514 1.1 mrg For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6515 1.1 mrg for (i = 0; i < nelt; i++)
6516 1.1 mrg sel[i] = 2 * (nelt / 3) + 1 + i;
6517 1.1 mrg indices.new_vector (sel, 2, nelt);
6518 1.1 mrg if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6519 1.1 mrg {
6520 1.1 mrg if (dump_enabled_p ())
6521 1.1 mrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6522 1.1 mrg "shift permutation is not supported by target\n");
6523 1.1 mrg return false;
6524 1.1 mrg }
6525 1.1 mrg shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6526 1.1 mrg
6527 1.1 mrg /* Generating permutation constant to shift all elements.
6528 1.1 mrg For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6529 1.1 mrg for (i = 0; i < nelt; i++)
6530 1.1 mrg sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6531 1.1 mrg indices.new_vector (sel, 2, nelt);
6532 1.1 mrg if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6533 1.1 mrg {
6534 1.1 mrg if (dump_enabled_p ())
6535 1.1 mrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6536 1.1 mrg "shift permutation is not supported by target\n");
6537 1.1 mrg return false;
6538 1.1 mrg }
6539 1.1 mrg shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6540 1.1 mrg
6541 1.1 mrg /* Generating permutation constant to shift all elements.
6542 1.1 mrg For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6543 1.1 mrg for (i = 0; i < nelt; i++)
6544 1.1 mrg sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6545 1.1 mrg indices.new_vector (sel, 2, nelt);
6546 1.1 mrg if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6547 1.1 mrg {
6548 1.1 mrg if (dump_enabled_p ())
6549 1.1 mrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6550 1.1 mrg "shift permutation is not supported by target\n");
6551 1.1 mrg return false;
6552 1.1 mrg }
6553 1.1 mrg shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6554 1.1 mrg
6555 1.1 mrg for (k = 0; k < 3; k++)
6556 1.1 mrg {
6557 1.1 mrg data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6558 1.1 mrg perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6559 1.1 mrg dr_chain[k], dr_chain[k],
6560 1.1 mrg perm3_mask);
6561 1.1 mrg vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6562 1.1 mrg vect[k] = data_ref;
6563 1.1 mrg }
6564 1.1 mrg
6565 1.1 mrg for (k = 0; k < 3; k++)
6566 1.1 mrg {
6567 1.1 mrg data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6568 1.1 mrg perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6569 1.1 mrg vect[k % 3], vect[(k + 1) % 3],
6570 1.1 mrg shift1_mask);
6571 1.1 mrg vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6572 1.1 mrg vect_shift[k] = data_ref;
6573 1.1 mrg }
6574 1.1 mrg
6575 1.1 mrg for (k = 0; k < 3; k++)
6576 1.1 mrg {
6577 1.1 mrg data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6578 1.1 mrg perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6579 1.1 mrg vect_shift[(4 - k) % 3],
6580 1.1 mrg vect_shift[(3 - k) % 3],
6581 1.1 mrg shift2_mask);
6582 1.1 mrg vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6583 1.1 mrg vect[k] = data_ref;
6584 1.1 mrg }
6585 1.1 mrg
6586 1.1 mrg (*result_chain)[3 - (nelt % 3)] = vect[2];
6587 1.1 mrg
6588 1.1 mrg data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6589 1.1 mrg perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6590 1.1 mrg vect[0], shift3_mask);
6591 1.1 mrg vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6592 1.1 mrg (*result_chain)[nelt % 3] = data_ref;
6593 1.1 mrg
6594 1.1 mrg data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6595 1.1 mrg perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6596 1.1 mrg vect[1], shift4_mask);
6597 1.1 mrg vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6598 1.1 mrg (*result_chain)[0] = data_ref;
6599 1.1 mrg return true;
6600 1.1 mrg }
6601 1.1 mrg return false;
6602 1.1 mrg }
6603 1.1 mrg
6604 1.1 mrg /* Function vect_transform_grouped_load.
6605 1.1 mrg
6606 1.1 mrg Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6607 1.1 mrg to perform their permutation and ascribe the result vectorized statements to
6608  1.1  mrg    to perform their permutation and ascribe the resulting vectorized statements to
6609 1.1 mrg */
6610 1.1 mrg
6611 1.1 mrg void
6612 1.1 mrg vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6613 1.1 mrg vec<tree> dr_chain,
6614 1.1 mrg int size, gimple_stmt_iterator *gsi)
6615 1.1 mrg {
6616 1.1 mrg machine_mode mode;
6617 1.1 mrg vec<tree> result_chain = vNULL;
6618 1.1 mrg
6619 1.1 mrg /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6620 1.1 mrg RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6621 1.1 mrg vectors, that are ready for vector computation. */
6622 1.1 mrg result_chain.create (size);
6623 1.1 mrg
6624  1.1  mrg   /* If the reassociation width for the vector type is 2 or greater, the target
6625  1.1  mrg      machine can execute 2 or more vector instructions in parallel.  Otherwise
6626  1.1  mrg      try to get the chain for the load group using vect_shift_permute_load_chain.  */
6627 1.1 mrg mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6628 1.1 mrg if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6629 1.1 mrg || pow2p_hwi (size)
6630 1.1 mrg || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6631 1.1 mrg gsi, &result_chain))
6632 1.1 mrg vect_permute_load_chain (vinfo, dr_chain,
6633 1.1 mrg size, stmt_info, gsi, &result_chain);
6634 1.1 mrg vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6635 1.1 mrg result_chain.release ();
6636 1.1 mrg }
6637 1.1 mrg
6638 1.1 mrg /* RESULT_CHAIN contains the output of a group of grouped loads that were
6639 1.1 mrg generated as part of the vectorization of STMT_INFO. Assign the statement
6640 1.1 mrg for each vector to the associated scalar statement. */
6641 1.1 mrg
6642 1.1 mrg void
6643 1.1 mrg vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6644 1.1 mrg vec<tree> result_chain)
6645 1.1 mrg {
6646 1.1 mrg stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6647 1.1 mrg unsigned int i, gap_count;
6648 1.1 mrg tree tmp_data_ref;
6649 1.1 mrg
6650 1.1 mrg /* Put a permuted data-ref in the VECTORIZED_STMT field.
6651  1.1  mrg      Since we scan the chain starting from its first node, their order
6652  1.1  mrg      corresponds to the order of data-refs in RESULT_CHAIN.  */
6653 1.1 mrg stmt_vec_info next_stmt_info = first_stmt_info;
6654 1.1 mrg gap_count = 1;
6655 1.1 mrg FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6656 1.1 mrg {
6657 1.1 mrg if (!next_stmt_info)
6658 1.1 mrg break;
6659 1.1 mrg
6660 1.1 mrg /* Skip the gaps. Loads created for the gaps will be removed by dead
6661 1.1 mrg code elimination pass later. No need to check for the first stmt in
6662 1.1 mrg the group, since it always exists.
6663 1.1 mrg DR_GROUP_GAP is the number of steps in elements from the previous
6664 1.1 mrg access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6665 1.1 mrg correspond to the gaps. */
6666 1.1 mrg if (next_stmt_info != first_stmt_info
6667 1.1 mrg && gap_count < DR_GROUP_GAP (next_stmt_info))
6668 1.1 mrg {
6669 1.1 mrg gap_count++;
6670 1.1 mrg continue;
6671 1.1 mrg }
6672 1.1 mrg
6673 1.1 mrg /* ??? The following needs cleanup after the removal of
6674 1.1 mrg DR_GROUP_SAME_DR_STMT. */
6675 1.1 mrg if (next_stmt_info)
6676 1.1 mrg {
6677 1.1 mrg gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6678 1.1 mrg /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6679 1.1 mrg copies, and we put the new vector statement last. */
6680 1.1 mrg STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6681 1.1 mrg
6682 1.1 mrg next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6683 1.1 mrg gap_count = 1;
6684 1.1 mrg }
6685 1.1 mrg }
6686 1.1 mrg }
6687 1.1 mrg
6688 1.1 mrg /* Function vect_force_dr_alignment_p.
6689 1.1 mrg
6690  1.1  mrg    Returns whether the alignment of DECL can be forced so that it is
6691  1.1  mrg    aligned on an ALIGNMENT-bit boundary.  */

bool
vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
{
  if (!VAR_P (decl))
    return false;

  if (decl_in_symtab_p (decl)
      && !symtab_node::get (decl)->can_increase_alignment_p ())
    return false;

  if (TREE_STATIC (decl))
    return (known_le (alignment,
                      (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
  else
    return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
}

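/* A minimal usage sketch; the caller shown here is hypothetical and only
   illustrates the intended protocol (the real call sites that force base
   alignment live elsewhere in the vectorizer):

     unsigned int align = TYPE_ALIGN (vectype);  // requested alignment in bits
     if (vect_can_force_dr_alignment_p (decl, align)
         && DECL_ALIGN (decl) < align)
       {
         SET_DECL_ALIGN (decl, align);
         DECL_USER_ALIGN (decl) = 1;
       }  */
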
/* Return whether (and how) the data reference DR_INFO is supported with
   respect to its alignment, when it is accessed as vector type VECTYPE with
   misalignment MISALIGNMENT (DR_MISALIGNMENT_UNKNOWN if the misalignment is
   not known at compile time).  */

enum dr_alignment_support
vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
                               tree vectype, int misalignment)
{
  data_reference *dr = dr_info->dr;
  stmt_vec_info stmt_info = dr_info->stmt;
  machine_mode mode = TYPE_MODE (vectype);
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  class loop *vect_loop = NULL;
  bool nested_in_vect_loop = false;

  if (misalignment == 0)
    return dr_aligned;

  /* For now assume all conditional loads/stores support unaligned
     access without any special code.  */
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    if (gimple_call_internal_p (stmt)
        && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
            || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
      return dr_unaligned_supported;

  if (loop_vinfo)
    {
      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
    }

  /* Possibly unaligned access.  */

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD).  There are two variants to the
     explicit realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout the
     execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that it will remain the same throughout
     the execution of the inner-loop.  This is because the inner-loop advances
     with the original scalar step (and not in steps of VS).  If the inner-loop
     step happens to be a multiple of VS, then the misalignment remains fixed
     and we can use the optimized realignment scheme.  For example:

      for (i=0; i<N; i++)
        for (j=0; j<M; j++)
          s += a[i+j];

     When vectorizing the i-loop in the above example, the step between
     consecutive vector loads is 1, and so the misalignment does not remain
     fixed across the execution of the inner-loop, and the realignment cannot
     be optimized (as illustrated in the following pseudo vectorized loop):

      for (i=0; i<N; i+=4)
        for (j=0; j<M; j++){
          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
                         // (assuming that we start from an aligned address).
        }

     We therefore have to use the unoptimized realignment scheme:

      for (i=0; i<N; i+=4)
        for (j=k; j<M; j+=4)
          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
                         // that the misalignment of the initial address is
                         // 0).

     The loop can then be vectorized as follows:

      for (k=0; k<4; k++){
        rt = get_realignment_token (&vp[k]);
        for (i=0; i<N; i+=4){
          v1 = vp[i+k];
          for (j=k; j<M; j+=4){
            v2 = vp[i+j+VS-1];
            va = REALIGN_LOAD <v1,v2,rt>;
            vs += va;
            v1 = v2;
          }
        }
      } */

  if (DR_IS_READ (dr))
    {
      if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
          && (!targetm.vectorize.builtin_mask_for_load
              || targetm.vectorize.builtin_mask_for_load ()))
        {
          /* If we are doing SLP then the accesses need not have the
             same alignment; instead it depends on the SLP group size.  */
          if (loop_vinfo
              && STMT_SLP_TYPE (stmt_info)
              && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
                              * (DR_GROUP_SIZE
                                 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
                              TYPE_VECTOR_SUBPARTS (vectype)))
            ;
          else if (!loop_vinfo
                   || (nested_in_vect_loop
                       && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
                                    GET_MODE_SIZE (TYPE_MODE (vectype)))))
            return dr_explicit_realign;
          else
            return dr_explicit_realign_optimized;
        }
    }

  bool is_packed = false;
  tree type = TREE_TYPE (DR_REF (dr));
  if (misalignment == DR_MISALIGNMENT_UNKNOWN)
    is_packed = not_size_aligned (DR_REF (dr));
  if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
                                                     is_packed))
    return dr_unaligned_supported;

  /* Unsupported.  */
  return dr_unaligned_unsupported;
}
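
/* A minimal usage sketch; the surrounding code is hypothetical and only
   illustrates how the result of this function is typically consumed (the
   real callers live in the costing and transform code):

     int misalignment = dr_misalignment (dr_info, vectype);
     enum dr_alignment_support support
       = vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment);
     if (support == dr_unaligned_unsupported)
       ; // peel or version the loop for alignment, or give up on this access

   dr_misalignment is assumed here to be the companion query declared in
   tree-vectorizer.h.  */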