/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2022 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit (at) il.ibm.com>
   and Ira Rosen <irar (at) il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
21
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "predict.h"
32 #include "memmodel.h"
33 #include "tm_p.h"
34 #include "ssa.h"
35 #include "optabs-tree.h"
36 #include "cgraph.h"
37 #include "dumpfile.h"
38 #include "alias.h"
39 #include "fold-const.h"
40 #include "stor-layout.h"
41 #include "tree-eh.h"
42 #include "gimplify.h"
43 #include "gimple-iterator.h"
44 #include "gimplify-me.h"
45 #include "tree-ssa-loop-ivopts.h"
46 #include "tree-ssa-loop-manip.h"
47 #include "tree-ssa-loop.h"
48 #include "cfgloop.h"
49 #include "tree-scalar-evolution.h"
50 #include "tree-vectorizer.h"
51 #include "expr.h"
52 #include "builtins.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
57 #include "gimple-fold.h"
58
59 /* Return true if load- or store-lanes optab OPTAB is implemented for
60 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61
62 static bool
63 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
64 tree vectype, unsigned HOST_WIDE_INT count)
65 {
66 machine_mode mode, array_mode;
67 bool limit_p;
68
69 mode = TYPE_MODE (vectype);
70 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 {
72 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
73 limit_p = !targetm.array_mode_supported_p (mode, count);
74 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 {
76 if (dump_enabled_p ())
77 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78 "no array mode for %s[%wu]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
81 }
82 }
83
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 {
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
91 }
92
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
97
98 return true;
99 }
100
101
/* Return the smallest scalar part of STMT_INFO.
   This is used to determine the vectype of the stmt.  We generally set the
   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different than the type of the arguments (e.g., demotion,
   promotion), vectype will be reset appropriately (later).  Note that we have
   to visit the smallest datatype in this function, because that determines the
   VF.  If the smallest datatype in the loop is present only as the rhs of a
   promotion operation - we'd miss it.
   Such a case, where a variable of this datatype does not appear in the lhs
   anywhere in the loop, can only occur if it's an invariant: e.g.:
   'int_x = (int) short_inv', which we'd expect to have been optimized away by
   invariant motion.  However, we cannot rely on invariant motion to always
   take invariants out of the loop, and so in the case of promotion we also
   have to check the rhs.
   SCALAR_TYPE is the type assumed so far; return either it or the smaller
   of the lhs/rhs types found on the statement.  */

tree
vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
{
  HOST_WIDE_INT lhs, rhs;

  /* During the analysis phase, this function is called on arbitrary
     statements that might not have scalar results.  */
  if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
    return scalar_type;

  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));

  gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
  if (assign)
    {
      scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
      /* For conversions and widening operations the smaller datatype can
	 appear only on the rhs, so look at rhs1's type as well.  */
      if (gimple_assign_cast_p (assign)
	  || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
	  || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
	  || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
	  || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
	  || gimple_assign_rhs_code (assign) == WIDEN_PLUS_EXPR
	  || gimple_assign_rhs_code (assign) == WIDEN_MINUS_EXPR
	  || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
	{
	  tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));

	  rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
	  if (rhs < lhs)
	    scalar_type = rhs_type;
	}
    }
  else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
    {
      /* I is the index of the call argument to inspect below; ~0U means
	 "no argument" (the lhs type alone is decisive).  */
      unsigned int i = 0;
      if (gimple_call_internal_p (call))
	{
	  internal_fn ifn = gimple_call_internal_fn (call);
	  if (internal_load_fn_p (ifn))
	    /* For loads the LHS type does the trick.  */
	    i = ~0U;
	  else if (internal_store_fn_p (ifn))
	    {
	      /* For stores use the type of the stored value.  */
	      i = internal_fn_stored_value_index (ifn);
	      scalar_type = TREE_TYPE (gimple_call_arg (call, i));
	      i = ~0U;
	    }
	  else if (internal_fn_mask_index (ifn) == 0)
	    /* Skip the mask argument; inspect the first data argument.  */
	    i = 1;
	}
      if (i < gimple_call_num_args (call))
	{
	  tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
	  if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
	    {
	      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
	      if (rhs < lhs)
		scalar_type = rhs_type;
	    }
	}
    }

  return scalar_type;
}
184
185
186 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
187 tested at run-time. Return TRUE if DDR was successfully inserted.
188 Return false if versioning is not supported. */
189
190 static opt_result
191 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
192 {
193 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
194
195 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
196 return opt_result::failure_at (vect_location,
197 "will not create alias checks, as"
198 " --param vect-max-version-for-alias-checks"
199 " == 0\n");
200
201 opt_result res
202 = runtime_alias_check_p (ddr, loop,
203 optimize_loop_nest_for_speed_p (loop));
204 if (!res)
205 return res;
206
207 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
208 return opt_result::success ();
209 }
210
211 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
212
213 static void
214 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
215 {
216 const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
217 for (unsigned int i = 0; i < checks.length(); ++i)
218 if (checks[i] == value)
219 return;
220
221 if (dump_enabled_p ())
222 dump_printf_loc (MSG_NOTE, vect_location,
223 "need run-time check that %T is nonzero\n",
224 value);
225 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
226 }
227
/* Return true if we know that the order of vectorized DR_INFO_A and
   vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
   DR_INFO_B.  At least one of the accesses is a write.  */

static bool
vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
{
  stmt_vec_info stmtinfo_a = dr_info_a->stmt;
  stmt_vec_info stmtinfo_b = dr_info_b->stmt;

  /* Single statements are always kept in their original order.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
      && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
    return true;

  /* STMT_A and STMT_B belong to overlapping groups.  All loads are
     emitted at the position of the first scalar load.
     Stores in a group are emitted at the position of the last scalar store.
     Compute that position and check whether the resulting order matches
     the current one.  */
  stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
  if (il_a)
    {
      /* IL_A becomes the effective insert position of A's group: the
	 latest member for a store group, the earliest for a load group.  */
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
	for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
	     s = DR_GROUP_NEXT_ELEMENT (s))
	  il_a = get_later_stmt (il_a, s);
      else /* DR_IS_READ */
	for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
	     s = DR_GROUP_NEXT_ELEMENT (s))
	  if (get_later_stmt (il_a, s) == il_a)
	    il_a = s;
    }
  else
    il_a = stmtinfo_a;
  /* Likewise compute the effective insert position of B's group.  */
  stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
  if (il_b)
    {
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
	for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
	     s = DR_GROUP_NEXT_ELEMENT (s))
	  il_b = get_later_stmt (il_b, s);
      else /* DR_IS_READ */
	for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
	     s = DR_GROUP_NEXT_ELEMENT (s))
	  if (get_later_stmt (il_b, s) == il_b)
	    il_b = s;
    }
  else
    il_b = stmtinfo_b;
  /* Order is preserved iff the insert positions compare the same way
     the original scalar statements do.  */
  bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
  return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
}
281
/* A subroutine of vect_analyze_data_ref_dependence.  Handle
   DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
   distances.  These distances are conservatively correct but they don't
   reflect a guaranteed dependence.

   Return true if this function does all the work necessary to avoid
   an alias or false if the caller should use the dependence distances
   to limit the vectorization factor in the usual way.  LOOP_DEPTH is
   the depth of the loop described by LOOP_VINFO and the other arguments
   are as for vect_analyze_data_ref_dependence.  */

static bool
vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
				       loop_vec_info loop_vinfo,
				       int loop_depth, unsigned int *max_vf)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
    {
      int dist = dist_v[loop_depth];
      /* A zero distance, or a positive distance on a reversed DDR, needs
	 no further handling here.  */
      if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
	{
	  /* If the user asserted safelen >= DIST consecutive iterations
	     can be executed concurrently, assume independence.

	     ??? An alternative would be to add the alias check even
	     in this case, and vectorize the fallback loop with the
	     maximum VF set to safelen.  However, if the user has
	     explicitly given a length, it's less likely that that
	     would be a win.  */
	  if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
	    {
	      if ((unsigned int) loop->safelen < *max_vf)
		*max_vf = loop->safelen;
	      LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
	      continue;
	    }

	  /* For dependence distances of 2 or more, we have the option
	     of limiting VF or checking for an alias at runtime.
	     Prefer to check at runtime if we can, to avoid limiting
	     the VF unnecessarily when the bases are in fact independent.

	     Note that the alias checks will be removed if the VF ends up
	     being small enough.  Gather/scatter accesses cannot be
	     covered by a runtime alias check.  */
	  dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
	  dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
	  return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
		  && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
		  && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
	}
    }
  return true;
}
336
/* Function vect_analyze_data_ref_dependence.

   Return failure if there (might) exist a dependence between a
   memory-reference DRA and a memory-reference DRB that prevents
   vectorization.  When versioning for alias may check a dependence at
   run-time, queue the check and return success.  Adjust *MAX_VF according
   to the data dependence.  */

static opt_result
vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
				  loop_vec_info loop_vinfo,
				  unsigned int *max_vf)
{
  unsigned int i;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
  dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
  stmt_vec_info stmtinfo_a = dr_info_a->stmt;
  stmt_vec_info stmtinfo_b = dr_info_b->stmt;
  lambda_vector dist_v;
  unsigned int loop_depth;

  /* If user asserted safelen consecutive iterations can be
     executed concurrently, assume independence.  */
  auto apply_safelen = [&]()
    {
      if (loop->safelen >= 2)
	{
	  if ((unsigned int) loop->safelen < *max_vf)
	    *max_vf = loop->safelen;
	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
	  return true;
	}
      return false;
    };

  /* In loop analysis all data references should be vectorizable.  */
  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
    gcc_unreachable ();

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return opt_result::success ();

  if (dra == drb
      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
    return opt_result::success ();

  /* We do not have to consider dependences between accesses that belong
     to the same group, unless the stride could be smaller than the
     group size.  */
  if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
      && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
	  == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
      && !STMT_VINFO_STRIDED_P (stmtinfo_a))
    return opt_result::success ();

  /* Even if we have an anti-dependence then, as the vectorized loop covers at
     least two scalar iterations, there is always also a true dependence.
     As the vectorizer does not re-order loads and stores we can ignore
     the anti-dependence if TBAA can disambiguate both DRs similar to the
     case with known negative distance anti-dependences (positive
     distance anti-dependences would violate TBAA constraints).  */
  if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
       || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
      && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
				 get_alias_set (DR_REF (drb))))
    return opt_result::success ();

  /* Gather/scatter accesses cannot be covered by a runtime alias check;
     only a safelen assertion can save us here.  */
  if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
      || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
    {
      if (apply_safelen ())
	return opt_result::success ();

      return opt_result::failure_at
	(stmtinfo_a->stmt,
	 "possible alias involving gather/scatter between %T and %T\n",
	 DR_REF (dra), DR_REF (drb));
    }

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (apply_safelen ())
	return opt_result::success ();

      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
			 "versioning for alias required: "
			 "can't determine dependence between %T and %T\n",
			 DR_REF (dra), DR_REF (drb));

      /* Add to list of ddrs that need to be tested at run-time.  */
      return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
    {
      if (apply_safelen ())
	return opt_result::success ();

      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
			 "versioning for alias required: "
			 "bad dist vector for %T and %T\n",
			 DR_REF (dra), DR_REF (drb));
      /* Add to list of ddrs that need to be tested at run-time.  */
      return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));

  /* A DDR that could be independent may be handled entirely by a
     runtime alias check; then there is nothing left to do here.  */
  if (DDR_COULD_BE_INDEPENDENT_P (ddr)
      && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
						loop_depth, max_vf))
    return opt_result::success ();

  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "dependence distance  = %d.\n", dist);

      if (dist == 0)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "dependence distance == 0 between %T and %T\n",
			     DR_REF (dra), DR_REF (drb));

	  /* When we perform grouped accesses and perform implicit CSE
	     by detecting equal accesses and doing disambiguation with
	     runtime alias tests like for
		.. = a[i];
		.. = a[i+1];
		a[i] = ..;
		a[i+1] = ..;
		*p = ..;
		.. = a[i];
		.. = a[i+1];
	     where we will end up loading { a[i], a[i+1] } once, make
	     sure that inserting group loads before the first load and
	     stores after the last store will do the right thing.
	     Similar for groups like
		a[i] = ...;
		... = a[i];
		a[i+1] = ...;
	     where loads from the group interleave with the store.  */
	  if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
	    return opt_result::failure_at (stmtinfo_a->stmt,
					   "READ_WRITE dependence"
					   " in interleaving.\n");

	  if (loop->safelen < 2)
	    {
	      /* A zero-step access in the same iteration would make the
		 distance-0 reasoning invalid; require a runtime nonzero
		 check when the step is not known to be nonzero.  */
	      tree indicator = dr_zero_step_indicator (dra);
	      if (!indicator || integer_zerop (indicator))
		return opt_result::failure_at (stmtinfo_a->stmt,
					       "access also has a zero step\n");
	      else if (TREE_CODE (indicator) != INTEGER_CST)
		vect_check_nonzero_value (loop_vinfo, indicator);
	    }
	  continue;
	}

      if (dist > 0 && DDR_REVERSED_P (ddr))
	{
	  /* If DDR_REVERSED_P the order of the data-refs in DDR was
	     reversed (to make distance vector positive), and the actual
	     distance is negative.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "dependence distance negative.\n");
	  /* When doing outer loop vectorization, we need to check if there is
	     a backward dependence at the inner loop level if the dependence
	     at the outer loop is reversed.  See PR81740.  */
	  if (nested_in_vect_loop_p (loop, stmtinfo_a)
	      || nested_in_vect_loop_p (loop, stmtinfo_b))
	    {
	      unsigned inner_depth = index_in_loop_nest (loop->inner->num,
							 DDR_LOOP_NEST (ddr));
	      if (dist_v[inner_depth] < 0)
		return opt_result::failure_at (stmtinfo_a->stmt,
					       "not vectorized, dependence "
					       "between data-refs %T and %T\n",
					       DR_REF (dra), DR_REF (drb));
	    }
	  /* Record a negative dependence distance to later limit the
	     amount of stmt copying / unrolling we can perform.
	     Only need to handle read-after-write dependence.  */
	  if (DR_IS_READ (drb)
	      && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
		  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
	    STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
	  continue;
	}

      unsigned int abs_dist = abs (dist);
      if (abs_dist >= 2 && abs_dist < *max_vf)
	{
	  /* The dependence distance requires reduction of the maximal
	     vectorization factor.  */
	  *max_vf = abs_dist;
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "adjusting maximal vectorization factor to %i\n",
			     *max_vf);
	}

      if (abs_dist >= *max_vf)
	{
	  /* Dependence distance does not create dependence, as far as
	     vectorization is concerned, in this case.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "dependence distance >= VF.\n");
	  continue;
	}

      return opt_result::failure_at (stmtinfo_a->stmt,
				     "not vectorized, possible dependence "
				     "between data-refs %T and %T\n",
				     DR_REF (dra), DR_REF (drb));
    }

  return opt_result::success ();
}
573
574 /* Function vect_analyze_data_ref_dependences.
575
576 Examine all the data references in the loop, and make sure there do not
577 exist any data dependences between them. Set *MAX_VF according to
578 the maximum vectorization factor the data dependences allow. */
579
580 opt_result
581 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
582 unsigned int *max_vf)
583 {
584 unsigned int i;
585 struct data_dependence_relation *ddr;
586
587 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
588
589 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
590 {
591 LOOP_VINFO_DDRS (loop_vinfo)
592 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
593 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
594 /* We do not need read-read dependences. */
595 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
596 &LOOP_VINFO_DDRS (loop_vinfo),
597 LOOP_VINFO_LOOP_NEST (loop_vinfo),
598 false);
599 gcc_assert (res);
600 }
601
602 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
603
604 /* For epilogues we either have no aliases or alias versioning
605 was applied to original loop. Therefore we may just get max_vf
606 using VF of original loop. */
607 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
608 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
609 else
610 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
611 {
612 opt_result res
613 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
614 if (!res)
615 return res;
616 }
617
618 return opt_result::success ();
619 }
620
621
622 /* Function vect_slp_analyze_data_ref_dependence.
623
624 Return TRUE if there (might) exist a dependence between a memory-reference
625 DRA and a memory-reference DRB for VINFO. When versioning for alias
626 may check a dependence at run-time, return FALSE. Adjust *MAX_VF
627 according to the data dependence. */
628
629 static bool
630 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
631 struct data_dependence_relation *ddr)
632 {
633 struct data_reference *dra = DDR_A (ddr);
634 struct data_reference *drb = DDR_B (ddr);
635 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
636 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
637
638 /* We need to check dependences of statements marked as unvectorizable
639 as well, they still can prohibit vectorization. */
640
641 /* Independent data accesses. */
642 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
643 return false;
644
645 if (dra == drb)
646 return false;
647
648 /* Read-read is OK. */
649 if (DR_IS_READ (dra) && DR_IS_READ (drb))
650 return false;
651
652 /* If dra and drb are part of the same interleaving chain consider
653 them independent. */
654 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
655 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
656 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
657 return false;
658
659 /* Unknown data dependence. */
660 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
661 {
662 if (dump_enabled_p ())
663 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
664 "can't determine dependence between %T and %T\n",
665 DR_REF (dra), DR_REF (drb));
666 }
667 else if (dump_enabled_p ())
668 dump_printf_loc (MSG_NOTE, vect_location,
669 "determined dependence between %T and %T\n",
670 DR_REF (dra), DR_REF (drb));
671
672 return true;
673 }
674
675
/* Analyze dependences involved in the transform of a store SLP NODE.
   Return true if all stores of the node can be sunk to the position of
   the last scalar store in the group.  */

static bool
vect_slp_analyze_store_dependences (vec_info *vinfo, slp_tree node)
{
  /* This walks over all stmts involved in the SLP store done
     in NODE verifying we can sink them up to the last stmt in the
     group.  */
  stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
  gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info)));

  for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
    {
      stmt_vec_info access_info
	= vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
      if (access_info == last_access_info)
	continue;
      data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
      /* The alias reference for DR_A, built lazily the first time the
	 oracle fallback below is needed.  */
      ao_ref ref;
      bool ref_initialized_p = false;
      /* Walk all stmts between this store and the sink position and make
	 sure none of them conflicts with the store being moved.  */
      for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
	   gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  /* Stmts without a virtual use cannot touch memory.  */
	  if (! gimple_vuse (stmt))
	    continue;

	  /* If we couldn't record a (single) data reference for this
	     stmt we have to resort to the alias oracle.  */
	  stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
	  data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
	  if (!dr_b)
	    {
	      /* We are moving a store - this means
		 we cannot use TBAA for disambiguation.  */
	      if (!ref_initialized_p)
		ao_ref_init (&ref, DR_REF (dr_a));
	      if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
		  || ref_maybe_used_by_stmt_p (stmt, &ref, false))
		return false;
	      continue;
	    }

	  /* Visited stmts belong to this very instance and must have been
	     excluded by the caller's marking protocol.  */
	  gcc_assert (!gimple_visited_p (stmt));

	  ddr_p ddr = initialize_data_dependence_relation (dr_a,
							   dr_b, vNULL);
	  bool dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
	  free_dependence_relation (ddr);
	  if (dependent)
	    return false;
	}
    }
  return true;
}
731
/* Analyze dependences involved in the transform of a load SLP NODE.  STORES
   contain the vector of scalar stores of this instance if we are
   disambiguating the loads.  LAST_STORE_INFO is the sink position of those
   stores (see vect_slp_analyze_instance_dependence).  Return true if all
   loads of the node can be hoisted to the position of the first scalar
   load in the group.  */

static bool
vect_slp_analyze_load_dependences (vec_info *vinfo, slp_tree node,
				   vec<stmt_vec_info> stores,
				   stmt_vec_info last_store_info)
{
  /* This walks over all stmts involved in the SLP load done
     in NODE verifying we can hoist them up to the first stmt in the
     group.  */
  stmt_vec_info first_access_info = vect_find_first_scalar_stmt_in_slp (node);
  gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info)));

  for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
    {
      stmt_vec_info access_info
	= vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
      if (access_info == first_access_info)
	continue;
      data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
      /* Alias reference for DR_A, built lazily when first needed.  */
      ao_ref ref;
      bool ref_initialized_p = false;
      /* Store groups already checked on this walk (keyed by their first
	 element) so each group is processed only once.  */
      hash_set<stmt_vec_info> grp_visited;
      /* Walk backwards from the load to the hoist position, checking
	 every intervening memory write.  */
      for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
	   gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  /* Only stmts with a virtual def can clobber memory.  */
	  if (! gimple_vdef (stmt))
	    continue;

	  stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);

	  /* If we run into a store of this same instance (we've just
	     marked those) then delay dependence checking until we run
	     into the last store because this is where it will have
	     been sunk to (and we verified that we can do that already).  */
	  if (gimple_visited_p (stmt))
	    {
	      if (stmt_info != last_store_info)
		continue;

	      for (stmt_vec_info &store_info : stores)
		{
		  data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
		  ddr_p ddr = initialize_data_dependence_relation
				(dr_a, store_dr, vNULL);
		  bool dependent
		    = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
		  free_dependence_relation (ddr);
		  if (dependent)
		    return false;
		}
	      continue;
	    }

	  /* Check whether hoisting the load over the store STMT_INFO is
	     safe; returns true if there is no dependence.  */
	  auto check_hoist = [&] (stmt_vec_info stmt_info) -> bool
	    {
	      /* We are hoisting a load - this means we can use TBAA for
		 disambiguation.  */
	      if (!ref_initialized_p)
		ao_ref_init (&ref, DR_REF (dr_a));
	      if (stmt_may_clobber_ref_p_1 (stmt_info->stmt, &ref, true))
		{
		  /* If we couldn't record a (single) data reference for this
		     stmt we have to give up now.  */
		  data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
		  if (!dr_b)
		    return false;
		  ddr_p ddr = initialize_data_dependence_relation (dr_a,
								   dr_b, vNULL);
		  bool dependent
		    = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
		  free_dependence_relation (ddr);
		  if (dependent)
		    return false;
		}
	      /* No dependence.  */
	      return true;
	    };
	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	    {
	      /* When we run into a store group we have to honor
		 that earlier stores might be moved here.  We don't
		 know exactly which and where to since we lack a
		 back-mapping from DR to SLP node, so assume all
		 earlier stores are sunk here.  It's enough to
		 consider the last stmt of a group for this.
		 ??? Both this and the fact that we disregard that
		 the conflicting instance might be removed later
		 is overly conservative.  */
	      if (!grp_visited.add (DR_GROUP_FIRST_ELEMENT (stmt_info)))
		for (auto store_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
		     store_info != NULL;
		     store_info = DR_GROUP_NEXT_ELEMENT (store_info))
		  if ((store_info == stmt_info
		       || get_later_stmt (store_info, stmt_info) == stmt_info)
		      && !check_hoist (store_info))
		    return false;
	    }
	  else
	    {
	      if (!check_hoist (stmt_info))
		return false;
	    }
	}
    }
  return true;
}
842
843
/* Function vect_slp_analyze_instance_dependence.

   Examine the data references involved in SLP INSTANCE of VINFO and make
   sure that its stores can be sunk to the vectorized stmt insert location
   and its loads hoisted there, without any dependence being violated.
   Return true if the instance is dependence-free in that sense.  */

bool
vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
{
  DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");

  /* The stores of this instance are at the root of the SLP tree.  */
  slp_tree store = NULL;
  if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
    store = SLP_INSTANCE_TREE (instance);

  /* Verify we can sink stores to the vectorized stmt insert location.  */
  stmt_vec_info last_store_info = NULL;
  if (store)
    {
      if (! vect_slp_analyze_store_dependences (vinfo, store))
	return false;

      /* Mark stores in this instance and remember the last one.
	 The visited flags tell the load walk below which stores belong
	 to this very instance.  */
      last_store_info = vect_find_last_scalar_stmt_in_slp (store);
      for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
	gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
    }

  bool res = true;

  /* Verify we can sink loads to the vectorized stmt insert location,
     special-casing stores of this instance.  */
  for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
    if (! vect_slp_analyze_load_dependences (vinfo, load,
					     store
					     ? SLP_TREE_SCALAR_STMTS (store)
					     : vNULL, last_store_info))
      {
	res = false;
	break;
      }

  /* Unset the visited flag.  */
  if (store)
    for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
      gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);

  return res;
}
894
/* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
   applied.  The result is relative to the recorded target alignment of
   the DR; DR_MISALIGNMENT_UNKNOWN is returned when it cannot be
   determined.  */

int
dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
{
  HOST_WIDE_INT diff = 0;
  /* Alignment is only analyzed for the first element of a DR group,
     use that but adjust misalignment by the offset of the access.  */
  if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
    {
      dr_vec_info *first_dr
	= STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
      /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
	 INTEGER_CSTs and the first element in the group has the lowest
	 address.  */
      diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
	      - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
      gcc_assert (diff >= 0);
      dr_info = first_dr;
    }

  int misalign = dr_info->misalignment;
  gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
  if (misalign == DR_MISALIGNMENT_UNKNOWN)
    return misalign;

  /* If the access is only aligned for a vector type with smaller alignment
     requirement the access has unknown misalignment.  */
  if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
		targetm.vectorize.preferred_vector_alignment (vectype)))
    return DR_MISALIGNMENT_UNKNOWN;

  /* Apply the offset from the DR group start and the externally supplied
     offset which can for example result from a negative stride access.  */
  poly_int64 misalignment = misalign + diff + offset;

  /* vect_compute_data_ref_alignment will have ensured that target_alignment
     is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN.  */
  unsigned HOST_WIDE_INT target_alignment_c
    = dr_info->target_alignment.to_constant ();
  /* The combined misalignment must be a compile-time-known remainder
     modulo the target alignment, otherwise it is unknown.  */
  if (!known_misalignment (misalignment, target_alignment_c, &misalign))
    return DR_MISALIGNMENT_UNKNOWN;
  return misalign;
}
940
941 /* Record the base alignment guarantee given by DRB, which occurs
942 in STMT_INFO. */
943
944 static void
945 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
946 innermost_loop_behavior *drb)
947 {
948 bool existed;
949 std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
950 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
951 if (!existed || entry.second->base_alignment < drb->base_alignment)
952 {
953 entry = std::make_pair (stmt_info, drb);
954 if (dump_enabled_p ())
955 dump_printf_loc (MSG_NOTE, vect_location,
956 "recording new base alignment for %T\n"
957 " alignment: %d\n"
958 " misalignment: %d\n"
959 " based on: %G",
960 drb->base_address,
961 drb->base_alignment,
962 drb->base_misalignment,
963 stmt_info->stmt);
964 }
965 }
966
967 /* If the region we're going to vectorize is reached, all unconditional
968 data references occur at least once. We can therefore pool the base
969 alignment guarantees from each unconditional reference. Do this by
970 going through all the data references in VINFO and checking whether
971 the containing statement makes the reference unconditionally. If so,
972 record the alignment of the base address in VINFO so that it can be
973 used for all other references with the same base. */
974
975 void
976 vect_record_base_alignments (vec_info *vinfo)
977 {
978 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
979 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
980 for (data_reference *dr : vinfo->shared->datarefs)
981 {
982 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
983 stmt_vec_info stmt_info = dr_info->stmt;
984 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
985 && STMT_VINFO_VECTORIZABLE (stmt_info)
986 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
987 {
988 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
989
990 /* If DR is nested in the loop that is being vectorized, we can also
991 record the alignment of the base wrt the outer loop. */
992 if (loop && nested_in_vect_loop_p (loop, stmt_info))
993 vect_record_base_alignment
994 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
995 }
996 }
997 }
998
/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR_INFO when vectorizing
   with VECTYPE.

   Output:
   1. initialized misalignment info (DR_MISALIGNMENT and
      DR_TARGET_ALIGNMENT) for DR_INFO; misalignment stays
      DR_MISALIGNMENT_UNKNOWN on any of the early exits below.

   FOR NOW: No analysis is actually performed.  Misalignment is calculated
   only for trivial cases.  TODO.  */

static void
vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
				 tree vectype)
{
  stmt_vec_info stmt_info = dr_info->stmt;
  vec_base_alignments *base_alignments = &vinfo->base_alignments;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  class loop *loop = NULL;
  tree ref = DR_REF (dr_info->dr);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "vect_compute_data_ref_alignment:\n");

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.  */
  SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);

  /* Gather/scatter accesses address individual scalars; vector-level
     alignment analysis does not apply.  */
  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
    return;

  innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
  bool step_preserves_misalignment_p;

  /* The target alignment is the preferred vector alignment expressed
     in bytes.  */
  poly_uint64 vector_alignment
    = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
		 BITS_PER_UNIT);
  SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);

  /* If the main loop has peeled for alignment we have no way of knowing
     whether the data accesses in the epilogues are aligned.  We can't at
     compile time answer the question whether we have entered the main loop or
     not.  Fixes PR 92351.  */
  if (loop_vinfo)
    {
      loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
      if (orig_loop_vinfo
	  && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
	return;
    }

  /* A variable (VLA-sized) target alignment leaves misalignment unknown.  */
  unsigned HOST_WIDE_INT vect_align_c;
  if (!vector_alignment.is_constant (&vect_align_c))
    return;

  /* No step for BB vectorization.  */
  if (!loop)
    {
      gcc_assert (integer_zerop (drb->step));
      step_preserves_misalignment_p = true;
    }

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop evenly
     divides by the vector alignment.  */
  else if (nested_in_vect_loop_p (loop, stmt_info))
    {
      step_preserves_misalignment_p
	= (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;

      if (dump_enabled_p ())
	{
	  if (step_preserves_misalignment_p)
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "inner step divides the vector alignment.\n");
	  else
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "inner step doesn't divide the vector"
			     " alignment.\n");
	}
    }

  /* Similarly we can only use base and misalignment information relative to
     an innermost loop if the misalignment stays the same throughout the
     execution of the loop.  As above, this is the case if the stride of
     the dataref evenly divides by the alignment.  Here the step over one
     vector iteration (step * VF) is what must divide the alignment.  */
  else
    {
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      step_preserves_misalignment_p
	= multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);

      if (!step_preserves_misalignment_p && dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "step doesn't divide the vector alignment.\n");
    }

  unsigned int base_alignment = drb->base_alignment;
  unsigned int base_misalignment = drb->base_misalignment;

  /* Calculate the maximum of the pooled base address alignment and the
     alignment that we can compute for DR itself.  For basic-block
     vectorization the pooled guarantee is only usable if the recording
     statement dominates this one (and, within the same block, was
     grouped no later).  */
  std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
    = base_alignments->get (drb->base_address);
  if (entry
      && base_alignment < (*entry).second->base_alignment
      && (loop_vinfo
	  || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
			      gimple_bb (entry->first->stmt))
	      && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
		  || (entry->first->dr_aux.group <= dr_info->group)))))
    {
      base_alignment = entry->second->base_alignment;
      base_misalignment = entry->second->base_misalignment;
    }

  if (drb->offset_alignment < vect_align_c
      || !step_preserves_misalignment_p
      /* We need to know whether the step wrt the vectorized loop is
	 negative when computing the starting misalignment below.  */
      || TREE_CODE (drb->step) != INTEGER_CST)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Unknown alignment for access: %T\n", ref);
      return;
    }

  /* If the base is not sufficiently aligned, try to force the alignment
     of the underlying decl.  */
  if (base_alignment < vect_align_c)
    {
      unsigned int max_alignment;
      tree base = get_base_for_alignment (drb->base_address, &max_alignment);
      if (max_alignment < vect_align_c
	  || !vect_can_force_dr_alignment_p (base,
					     vect_align_c * BITS_PER_UNIT))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "can't force alignment of ref: %T\n", ref);
	  return;
	}

      /* Force the alignment of the decl.
	 NOTE: This is the only change to the code we make during
	 the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "force alignment of %T\n", ref);

      dr_info->base_decl = base;
      dr_info->base_misaligned = true;
      base_misalignment = 0;
    }
  /* The reference's misalignment is the base misalignment plus the
     constant offset DR_INIT from the base.  */
  poly_int64 misalignment
    = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();

  unsigned int const_misalignment;
  if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Non-constant misalignment for access: %T\n", ref);
      return;
    }

  SET_DR_MISALIGNMENT (dr_info, const_misalignment);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		     "misalign = %d bytes of ref %T\n",
		     const_misalignment, ref);

  return;
}
1179
1180 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1181 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1182 is made aligned via peeling. */
1183
1184 static bool
1185 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1186 dr_vec_info *dr_peel_info)
1187 {
1188 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1189 DR_TARGET_ALIGNMENT (dr_info)))
1190 {
1191 poly_offset_int diff
1192 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1193 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1194 if (known_eq (diff, 0)
1195 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1196 return true;
1197 }
1198 return false;
1199 }
1200
1201 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1202 aligned via peeling. */
1203
1204 static bool
1205 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1206 dr_vec_info *dr_peel_info)
1207 {
1208 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1209 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1210 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1211 DR_OFFSET (dr_peel_info->dr), 0)
1212 || !operand_equal_p (DR_STEP (dr_info->dr),
1213 DR_STEP (dr_peel_info->dr), 0))
1214 return false;
1215
1216 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1217 }
1218
1219 /* Compute the value for dr_info->misalign so that the access appears
1220 aligned. This is used by peeling to compensate for dr_misalignment
1221 applying the offset for negative step. */
1222
1223 int
1224 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1225 {
1226 if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1227 return 0;
1228
1229 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1230 poly_int64 misalignment
1231 = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1232 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1233
1234 unsigned HOST_WIDE_INT target_alignment_c;
1235 int misalign;
1236 if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1237 || !known_misalignment (misalignment, target_alignment_c, &misalign))
1238 return DR_MISALIGNMENT_UNKNOWN;
1239 return misalign;
1240 }
1241
/* Function vect_update_misalignment_for_peel.
   Sets DR_INFO's misalignment
   - to 0 if it has the same alignment as DR_PEEL_INFO,
   - to the misalignment computed using NPEEL if DR_INFO's misalignment
     is known,
   - to -1 (unknown) otherwise.

   DR_INFO - the data reference whose misalignment is to be adjusted.
   DR_PEEL_INFO - the data reference whose misalignment is being made
		  zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
	   of DR_PEEL_INFO is known at compile time.  */

static void
vect_update_misalignment_for_peel (dr_vec_info *dr_info,
				   dr_vec_info *dr_peel_info, int npeel)
{
  /* If dr_info is aligned whenever dr_peel_info is, then mark it so.  */
  if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
    {
      SET_DR_MISALIGNMENT (dr_info,
			   vect_dr_misalign_for_aligned_access (dr_peel_info));
      return;
    }

  /* Otherwise, if both misalignments and the target alignment are known
     constants, advance the misalignment by NPEEL scalar steps and reduce
     it modulo the target alignment.  */
  unsigned HOST_WIDE_INT alignment;
  if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
      && known_alignment_for_access_p (dr_info,
				       STMT_VINFO_VECTYPE (dr_info->stmt))
      && known_alignment_for_access_p (dr_peel_info,
				       STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
    {
      int misal = dr_info->misalignment;
      misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
      misal &= alignment - 1;
      set_dr_misalignment (dr_info, misal);
      return;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
		     "to unknown (-1).\n");
  SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
}
1285
1286 /* Return true if alignment is relevant for DR_INFO. */
1287
1288 static bool
1289 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1290 {
1291 stmt_vec_info stmt_info = dr_info->stmt;
1292
1293 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1294 return false;
1295
1296 /* For interleaving, only the alignment of the first access matters. */
1297 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1298 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1299 return false;
1300
1301 /* Scatter-gather and invariant accesses continue to address individual
1302 scalars, so vector-level alignment is irrelevant. */
1303 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1304 || integer_zerop (DR_STEP (dr_info->dr)))
1305 return false;
1306
1307 /* Strided accesses perform only component accesses, alignment is
1308 irrelevant for them. */
1309 if (STMT_VINFO_STRIDED_P (stmt_info)
1310 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1311 return false;
1312
1313 return true;
1314 }
1315
1316 /* Given an memory reference EXP return whether its alignment is less
1317 than its size. */
1318
1319 static bool
1320 not_size_aligned (tree exp)
1321 {
1322 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1323 return true;
1324
1325 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1326 > get_object_alignment (exp));
1327 }
1328
/* Function vector_alignment_reachable_p

   Return true if vector alignment for DR_INFO is reachable by peeling
   a few loop iterations.  Return false otherwise.  */

static bool
vector_alignment_reachable_p (dr_vec_info *dr_info)
{
  stmt_vec_info stmt_info = dr_info->stmt;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* For interleaved access we peel only if number of iterations in
	 the prolog loop ({VF - misalignment}), is a multiple of the
	 number of the interleaved accesses.  */
      int elem_size, mis_in_elements;

      /* FORNOW: handle only known alignment.  */
      if (!known_alignment_for_access_p (dr_info, vectype))
	return false;

      /* Convert the byte misalignment to a misalignment in elements so
	 it can be compared against the group size.  */
      poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
      poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
      elem_size = vector_element_size (vector_size, nelements);
      mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;

      if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
	return false;
    }

  /* If misalignment is known at the compile time then allow peeling
     only if natural alignment is reachable through peeling.  Peeling
     advances the address in whole elements, so the misalignment must be
     a multiple of the element size to ever become zero.  */
  if (known_alignment_for_access_p (dr_info, vectype)
      && !aligned_access_p (dr_info, vectype))
    {
      HOST_WIDE_INT elmsize =
		int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "data size = %wd. misalignment = %d.\n", elmsize,
			   dr_misalignment (dr_info, vectype));
	}
      if (dr_misalignment (dr_info, vectype) % elmsize)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "data size does not divide the misalignment.\n");
	  return false;
	}
    }

  /* With unknown misalignment let the target decide based on whether
     the reference is naturally ("size") aligned.  */
  if (!known_alignment_for_access_p (dr_info, vectype))
    {
      tree type = TREE_TYPE (DR_REF (dr_info->dr));
      bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Unknown misalignment, %snaturally aligned\n",
			 is_packed ? "not " : "");
      return targetm.vectorize.vector_alignment_reachable (type, is_packed);
    }

  return true;
}
1395
1396
1397 /* Calculate the cost of the memory access represented by DR_INFO. */
1398
1399 static void
1400 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1401 dr_alignment_support alignment_support_scheme,
1402 int misalignment,
1403 unsigned int *inside_cost,
1404 unsigned int *outside_cost,
1405 stmt_vector_for_cost *body_cost_vec,
1406 stmt_vector_for_cost *prologue_cost_vec)
1407 {
1408 stmt_vec_info stmt_info = dr_info->stmt;
1409 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1410 int ncopies;
1411
1412 if (PURE_SLP_STMT (stmt_info))
1413 ncopies = 1;
1414 else
1415 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1416
1417 if (DR_IS_READ (dr_info->dr))
1418 vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1419 misalignment, true, inside_cost,
1420 outside_cost, prologue_cost_vec, body_cost_vec, false);
1421 else
1422 vect_get_store_cost (vinfo,stmt_info, ncopies, alignment_support_scheme,
1423 misalignment, inside_cost, body_cost_vec);
1424
1425 if (dump_enabled_p ())
1426 dump_printf_loc (MSG_NOTE, vect_location,
1427 "vect_get_data_access_cost: inside_cost = %d, "
1428 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1429 }
1430
1431
/* A candidate for peeling: peeling NPEEL iterations makes DR_INFO
   aligned; COUNT tracks how many data refs benefit (used as the key's
   popularity score).  */
typedef struct _vect_peel_info
{
  /* The data reference the peeling is chosen for.  */
  dr_vec_info *dr_info;
  /* Number of scalar iterations to peel.  */
  int npeel;
  /* Number of data refs this peeling would align (possibly biased, see
     vect_peeling_hash_insert).  */
  unsigned int count;
} *vect_peel_info;

/* A peeling candidate extended with the cost of the loop body under
   that peeling, used when traversing the peeling hash table.  */
typedef struct _vect_peel_extended_info
{
  vec_info *vinfo;
  struct _vect_peel_info peel_info;
  /* Cost inside/outside the vectorized loop for this candidate.  */
  unsigned int inside_cost;
  unsigned int outside_cost;
} *vect_peel_extended_info;
1446
1447
1448 /* Peeling hashtable helpers. */
1449
1450 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1451 {
1452 static inline hashval_t hash (const _vect_peel_info *);
1453 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1454 };
1455
1456 inline hashval_t
1457 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1458 {
1459 return (hashval_t) peel_info->npeel;
1460 }
1461
1462 inline bool
1463 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1464 {
1465 return (a->npeel == b->npeel);
1466 }
1467
1468
1469 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1470
1471 static void
1472 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1473 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1474 int npeel, bool supportable_if_not_aligned)
1475 {
1476 struct _vect_peel_info elem, *slot;
1477 _vect_peel_info **new_slot;
1478
1479 elem.npeel = npeel;
1480 slot = peeling_htab->find (&elem);
1481 if (slot)
1482 slot->count++;
1483 else
1484 {
1485 slot = XNEW (struct _vect_peel_info);
1486 slot->npeel = npeel;
1487 slot->dr_info = dr_info;
1488 slot->count = 1;
1489 new_slot = peeling_htab->find_slot (slot, INSERT);
1490 *new_slot = slot;
1491 }
1492
1493 /* If this DR is not supported with unknown misalignment then bias
1494 this slot when the cost model is disabled. */
1495 if (!supportable_if_not_aligned
1496 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1497 slot->count += VECT_MAX_COST;
1498 }
1499
1500
1501 /* Traverse peeling hash table to find peeling option that aligns maximum
1502 number of data accesses. */
1503
1504 int
1505 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1506 _vect_peel_extended_info *max)
1507 {
1508 vect_peel_info elem = *slot;
1509
1510 if (elem->count > max->peel_info.count
1511 || (elem->count == max->peel_info.count
1512 && max->peel_info.npeel > elem->npeel))
1513 {
1514 max->peel_info.npeel = elem->npeel;
1515 max->peel_info.count = elem->count;
1516 max->peel_info.dr_info = elem->dr_info;
1517 }
1518
1519 return 1;
1520 }
1521
/* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
   data access costs for all data refs.  DR0_INFO is the data reference
   the peeling makes aligned.  If NPEEL is zero the peel amount is only
   known at runtime, but DR0_INFO's misalignment will be zero after
   peeling.  */

static void
vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
				dr_vec_info *dr0_info,
				unsigned int *inside_cost,
				unsigned int *outside_cost,
				stmt_vector_for_cost *body_cost_vec,
				stmt_vector_for_cost *prologue_cost_vec,
				unsigned int npeel)
{
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);

  bool dr0_alignment_known_p
    = (dr0_info
       && known_alignment_for_access_p (dr0_info,
					STMT_VINFO_VECTYPE (dr0_info->stmt)));

  for (data_reference *dr : datarefs)
    {
      dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
      if (!vect_relevant_for_alignment_p (dr_info))
	continue;

      tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
      dr_alignment_support alignment_support_scheme;
      int misalignment;
      unsigned HOST_WIDE_INT alignment;

      /* For a negative step the access starts at the element at offset
	 -(nunits-1)*element-size; account for that when computing
	 misalignment (mirrors what dr_misalignment's callers do).  */
      bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
					    size_zero_node) < 0;
      poly_int64 off = 0;
      if (negative)
	off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
	       * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));

      if (npeel == 0)
	misalignment = dr_misalignment (dr_info, vectype, off);
      else if (dr_info == dr0_info
	       || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
	/* The ref the peeling targets (and refs aligned with it) become
	   aligned.  */
	misalignment = 0;
      else if (!dr0_alignment_known_p
	       || !known_alignment_for_access_p (dr_info, vectype)
	       || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
	misalignment = DR_MISALIGNMENT_UNKNOWN;
      else
	{
	  /* Advance the known misalignment by NPEEL scalar steps modulo
	     the target alignment.  */
	  misalignment = dr_misalignment (dr_info, vectype, off);
	  misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
	  misalignment &= alignment - 1;
	}
      alignment_support_scheme
	= vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
					 misalignment);

      vect_get_data_access_cost (loop_vinfo, dr_info,
				 alignment_support_scheme, misalignment,
				 inside_cost, outside_cost,
				 body_cost_vec, prologue_cost_vec);
    }
}
1586
/* Traverse peeling hash table and calculate cost for each peeling option.
   Find the one with the lowest cost.  Callback for hash_table::traverse;
   always returns 1 to continue the walk, accumulating the minimum
   into *MIN.  */

int
vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
				   _vect_peel_extended_info *min)
{
  vect_peel_info elem = *slot;
  int dummy;
  unsigned int inside_cost = 0, outside_cost = 0;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
  stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
		       epilogue_cost_vec;

  prologue_cost_vec.create (2);
  body_cost_vec.create (2);
  epilogue_cost_vec.create (2);

  vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
				  &outside_cost, &body_cost_vec,
				  &prologue_cost_vec, elem->npeel);

  /* Only the scalar inside/outside cost totals are needed below; the
     per-statement cost vectors can be dropped right away.  */
  body_cost_vec.release ();

  outside_cost += vect_get_known_peeling_cost
    (loop_vinfo, elem->npeel, &dummy,
     &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
     &prologue_cost_vec, &epilogue_cost_vec);

  /* Prologue and epilogue costs are added to the target model later.
     These costs depend only on the scalar iteration cost, the
     number of peeling iterations finally chosen, and the number of
     misaligned statements.  So discard the information found here.  */
  prologue_cost_vec.release ();
  epilogue_cost_vec.release ();

  /* Prefer lower inside cost; break ties on outside cost.  */
  if (inside_cost < min->inside_cost
      || (inside_cost == min->inside_cost
	  && outside_cost < min->outside_cost))
    {
      min->inside_cost = inside_cost;
      min->outside_cost = outside_cost;
      min->peel_info.dr_info = elem->dr_info;
      min->peel_info.npeel = elem->npeel;
      min->peel_info.count = elem->count;
    }

  return 1;
}
1636
1637
1638 /* Choose best peeling option by traversing peeling hash table and either
1639 choosing an option with the lowest cost (if cost model is enabled) or the
1640 option that aligns as many accesses as possible. */
1641
1642 static struct _vect_peel_extended_info
1643 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1644 loop_vec_info loop_vinfo)
1645 {
1646 struct _vect_peel_extended_info res;
1647
1648 res.peel_info.dr_info = NULL;
1649 res.vinfo = loop_vinfo;
1650
1651 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1652 {
1653 res.inside_cost = INT_MAX;
1654 res.outside_cost = INT_MAX;
1655 peeling_htab->traverse <_vect_peel_extended_info *,
1656 vect_peeling_hash_get_lowest_cost> (&res);
1657 }
1658 else
1659 {
1660 res.peel_info.count = 0;
1661 peeling_htab->traverse <_vect_peel_extended_info *,
1662 vect_peeling_hash_get_most_frequent> (&res);
1663 res.inside_cost = 0;
1664 res.outside_cost = 0;
1665 }
1666
1667 return res;
1668 }
1669
/* Return true if the new peeling NPEEL is supported, i.e. if every
   relevant data ref in LOOP_VINFO other than DR0_INFO (the ref being
   aligned by the peel) can still be vectorized at its post-peel
   misalignment.  */

static bool
vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
			  unsigned npeel)
{
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  enum dr_alignment_support supportable_dr_alignment;

  bool dr0_alignment_known_p
    = known_alignment_for_access_p (dr0_info,
				    STMT_VINFO_VECTYPE (dr0_info->stmt));

  /* Ensure that all data refs can be vectorized after the peel.  */
  for (data_reference *dr : datarefs)
    {
      if (dr == dr0_info->dr)
	continue;

      dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
      /* Refs that become aligned together with DR0 are trivially fine.  */
      if (!vect_relevant_for_alignment_p (dr_info)
	  || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
	continue;

      tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
      int misalignment;
      unsigned HOST_WIDE_INT alignment;
      if (!dr0_alignment_known_p
	  || !known_alignment_for_access_p (dr_info, vectype)
	  || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
	misalignment = DR_MISALIGNMENT_UNKNOWN;
      else
	{
	  /* Advance the misalignment by NPEEL scalar steps modulo the
	     target alignment (same computation as in
	     vect_get_peeling_costs_all_drs).  */
	  misalignment = dr_misalignment (dr_info, vectype);
	  misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
	  misalignment &= alignment - 1;
	}
      supportable_dr_alignment
	= vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
					 misalignment);
      if (supportable_dr_alignment == dr_unaligned_unsupported)
	return false;
    }

  return true;
}
1716
1717 /* Compare two data-references DRA and DRB to group them into chunks
1718 with related alignment. */
1719
1720 static int
1721 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1722 {
1723 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1724 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1725 int cmp;
1726
1727 /* Stabilize sort. */
1728 if (dra == drb)
1729 return 0;
1730
1731 /* Ordering of DRs according to base. */
1732 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
1733 DR_BASE_ADDRESS (drb));
1734 if (cmp != 0)
1735 return cmp;
1736
1737 /* And according to DR_OFFSET. */
1738 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
1739 if (cmp != 0)
1740 return cmp;
1741
1742 /* And after step. */
1743 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
1744 if (cmp != 0)
1745 return cmp;
1746
1747 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
1748 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
1749 if (cmp == 0)
1750 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
1751 return cmp;
1752 }
1753
1754 /* Function vect_enhance_data_refs_alignment
1755
1756 This pass will use loop versioning and loop peeling in order to enhance
1757 the alignment of data references in the loop.
1758
1759 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1760 original loop is to be vectorized. Any other loops that are created by
1761 the transformations performed in this pass - are not supposed to be
1762 vectorized. This restriction will be relaxed.
1763
1764 This pass will require a cost model to guide it whether to apply peeling
1765 or versioning or a combination of the two. For example, the scheme that
1766 intel uses when given a loop with several memory accesses, is as follows:
1767 choose one memory access ('p') which alignment you want to force by doing
1768 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1769 other accesses are not necessarily aligned, or (2) use loop versioning to
1770 generate one loop in which all accesses are aligned, and another loop in
1771 which only 'p' is necessarily aligned.
1772
1773 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1774 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1775 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1776
1777 Devising a cost model is the most critical aspect of this work. It will
1778 guide us on which access to peel for, whether to use loop versioning, how
1779 many versions to create, etc. The cost model will probably consist of
1780 generic considerations as well as target specific considerations (on
1781 powerpc for example, misaligned stores are more painful than misaligned
1782 loads).
1783
1784 Here are the general steps involved in alignment enhancements:
1785
1786 -- original loop, before alignment analysis:
1787 for (i=0; i<N; i++){
1788 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1789 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1790 }
1791
1792 -- After vect_compute_data_refs_alignment:
1793 for (i=0; i<N; i++){
1794 x = q[i]; # DR_MISALIGNMENT(q) = 3
1795 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1796 }
1797
1798 -- Possibility 1: we do loop versioning:
1799 if (p is aligned) {
1800 for (i=0; i<N; i++){ # loop 1A
1801 x = q[i]; # DR_MISALIGNMENT(q) = 3
1802 p[i] = y; # DR_MISALIGNMENT(p) = 0
1803 }
1804 }
1805 else {
1806 for (i=0; i<N; i++){ # loop 1B
1807 x = q[i]; # DR_MISALIGNMENT(q) = 3
1808 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1809 }
1810 }
1811
1812 -- Possibility 2: we do loop peeling:
1813 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1814 x = q[i];
1815 p[i] = y;
1816 }
1817 for (i = 3; i < N; i++){ # loop 2A
1818 x = q[i]; # DR_MISALIGNMENT(q) = 0
1819 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1820 }
1821
1822 -- Possibility 3: combination of loop peeling and versioning:
1823 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1824 x = q[i];
1825 p[i] = y;
1826 }
1827 if (p is aligned) {
1828 for (i = 3; i<N; i++){ # loop 3A
1829 x = q[i]; # DR_MISALIGNMENT(q) = 0
1830 p[i] = y; # DR_MISALIGNMENT(p) = 0
1831 }
1832 }
1833 else {
1834 for (i = 3; i<N; i++){ # loop 3B
1835 x = q[i]; # DR_MISALIGNMENT(q) = 0
1836 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1837 }
1838 }
1839
1840 These loops are later passed to loop_transform to be vectorized. The
1841 vectorizer will use the alignment information to guide the transformation
1842 (whether to generate regular loads/stores, or with special handling for
1843 misalignment). */
1844
opt_result
vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  dr_vec_info *first_store = NULL;
  dr_vec_info *dr0_info = NULL;
  struct data_reference *dr;
  unsigned int i;
  bool do_peeling = false;
  bool do_versioning = false;
  unsigned int npeel = 0;
  bool one_misalignment_known = false;
  bool one_misalignment_unknown = false;
  bool one_dr_unsupportable = false;
  dr_vec_info *unsupportable_dr_info = NULL;
  unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
  hash_table<peel_info_hasher> peeling_htab (1);

  DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");

  /* Reset data so we can safely be called multiple times.  */
  LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
  LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;

  if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
    return opt_result::success ();

  /* Sort the vector of datarefs so DRs that have the same or dependent
     alignment are next to each other.  */
  auto_vec<data_reference_p> datarefs
    = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
  datarefs.qsort (dr_align_group_sort_cmp);

  /* Compute the number of DRs that become aligned when we peel
     a dataref so it becomes aligned.  */
  auto_vec<unsigned> n_same_align_refs (datarefs.length ());
  n_same_align_refs.quick_grow_cleared (datarefs.length ());
  unsigned i0;
  for (i0 = 0; i0 < datarefs.length (); ++i0)
    if (DR_BASE_ADDRESS (datarefs[i0]))
      break;
  /* Walk subgroups [i0, i-1] of DRs sharing base, offset and step; the
     loop body runs once per subgroup boundary (including the end).  */
  for (i = i0 + 1; i <= datarefs.length (); ++i)
    {
      if (i == datarefs.length ()
	  || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
			       DR_BASE_ADDRESS (datarefs[i]), 0)
	  || !operand_equal_p (DR_OFFSET (datarefs[i0]),
			       DR_OFFSET (datarefs[i]), 0)
	  || !operand_equal_p (DR_STEP (datarefs[i0]),
			       DR_STEP (datarefs[i]), 0))
	{
	  /* The subgroup [i0, i-1] now only differs in DR_INIT and
	     possibly DR_TARGET_ALIGNMENT.  Still the whole subgroup
	     will get known misalignment if we align one of the refs
	     with the largest DR_TARGET_ALIGNMENT.  */
	  for (unsigned j = i0; j < i; ++j)
	    {
	      dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
	      for (unsigned k = i0; k < i; ++k)
		{
		  if (k == j)
		    continue;
		  dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
		  if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
							       dr_infoj))
		    n_same_align_refs[j]++;
		}
	    }
	  i0 = i;
	}
    }

  /* While cost model enhancements are expected in the future, the high level
     view of the code at this time is as follows:

     A) If there is a misaligned access then see if peeling to align
	this access can make all data references satisfy
	vect_supportable_dr_alignment.  If so, update data structures
	as needed and return true.

     B) If peeling wasn't possible and there is a data reference with an
	unknown misalignment that does not satisfy vect_supportable_dr_alignment
	then see if loop versioning checks can be used to make all data
	references satisfy vect_supportable_dr_alignment.  If so, update
	data structures as needed and return true.

     C) If neither peeling nor versioning were successful then return false if
	any data reference does not satisfy vect_supportable_dr_alignment.

     D) Return true (all data references satisfy vect_supportable_dr_alignment).

     Note, Possibility 3 above (which is peeling and versioning together) is not
     being done at this time.  */

  /* (1) Peeling to force alignment.  */

  /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
     Considerations:
     + How many accesses will become aligned due to the peeling
     - How many accesses will become unaligned due to the peeling,
       and the cost of misaligned accesses.
     - The cost of peeling (the extra runtime checks, the increase
       in code size).  */

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
      if (!vect_relevant_for_alignment_p (dr_info))
	continue;

      stmt_vec_info stmt_info = dr_info->stmt;
      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
      do_peeling = vector_alignment_reachable_p (dr_info);
      if (do_peeling)
	{
	  if (known_alignment_for_access_p (dr_info, vectype))
	    {
	      unsigned int npeel_tmp = 0;
	      bool negative = tree_int_cst_compare (DR_STEP (dr),
						    size_zero_node) < 0;

	      /* If known_alignment_for_access_p then we have set
		 DR_MISALIGNMENT which is only done if we know it at compiler
		 time, so it is safe to assume target alignment is constant.
	       */
	      unsigned int target_align =
		DR_TARGET_ALIGNMENT (dr_info).to_constant ();
	      unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
	      poly_int64 off = 0;
	      if (negative)
		off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
	      unsigned int mis = dr_misalignment (dr_info, vectype, off);
	      mis = negative ? mis : -mis;
	      /* NPEEL_TMP stays 0 when the access is already aligned.  */
	      if (mis != 0)
		npeel_tmp = (mis & (target_align - 1)) / dr_size;

	      /* For multiple types, it is possible that the bigger type access
		 will have more than one peeling option.  E.g., a loop with two
		 types: one of size (vector size / 4), and the other one of
		 size (vector size / 8).  Vectorization factor will 8.  If both
		 accesses are misaligned by 3, the first one needs one scalar
		 iteration to be aligned, and the second one needs 5.  But the
		 first one will be aligned also by peeling 5 scalar
		 iterations, and in that case both accesses will be aligned.
		 Hence, except for the immediate peeling amount, we also want
		 to try to add full vector size, while we don't exceed
		 vectorization factor.
		 We do this automatically for cost model, since we calculate
		 cost for every peeling option.  */
	      poly_uint64 nscalars = npeel_tmp;
	      if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
		{
		  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
		  nscalars = (STMT_SLP_TYPE (stmt_info)
			      ? vf * DR_GROUP_SIZE (stmt_info) : vf);
		}

	      /* Save info about DR in the hash table.  Also include peeling
		 amounts according to the explanation above.  Indicate
		 the alignment status when the ref is not aligned.
		 ??? Rather than using unknown alignment here we should
		 prune all entries from the peeling hashtable which cause
		 DRs to be not supported.  */
	      bool supportable_if_not_aligned
		= vect_supportable_dr_alignment
		    (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
	      while (known_le (npeel_tmp, nscalars))
		{
		  vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
					    dr_info, npeel_tmp,
					    supportable_if_not_aligned);
		  npeel_tmp += MAX (1, target_align / dr_size);
		}

	      one_misalignment_known = true;
	    }
	  else
	    {
	      /* If we don't know any misalignment values, we prefer
		 peeling for data-ref that has the maximum number of data-refs
		 with the same alignment, unless the target prefers to align
		 stores over load.  */
	      unsigned same_align_drs = n_same_align_refs[i];
	      if (!dr0_info
		  || dr0_same_align_drs < same_align_drs)
		{
		  dr0_same_align_drs = same_align_drs;
		  dr0_info = dr_info;
		}
	      /* For data-refs with the same number of related
		 accesses prefer the one where the misalign
		 computation will be invariant in the outermost loop.  */
	      else if (dr0_same_align_drs == same_align_drs)
		{
		  class loop *ivloop0, *ivloop;
		  ivloop0 = outermost_invariant_loop_for_expr
		    (loop, DR_BASE_ADDRESS (dr0_info->dr));
		  ivloop = outermost_invariant_loop_for_expr
		    (loop, DR_BASE_ADDRESS (dr));
		  if ((ivloop && !ivloop0)
		      || (ivloop && ivloop0
			  && flow_loop_nested_p (ivloop, ivloop0)))
		    dr0_info = dr_info;
		}

	      one_misalignment_unknown = true;

	      /* Check for data refs with unsupportable alignment that
		 can be peeled.  */
	      enum dr_alignment_support supportable_dr_alignment
		= vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
						 DR_MISALIGNMENT_UNKNOWN);
	      if (supportable_dr_alignment == dr_unaligned_unsupported)
		{
		  one_dr_unsupportable = true;
		  unsupportable_dr_info = dr_info;
		}

	      if (!first_store && DR_IS_WRITE (dr))
		{
		  first_store = dr_info;
		  first_store_same_align_drs = same_align_drs;
		}
	    }
	}
      else
	{
	  if (!aligned_access_p (dr_info, vectype))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "vector alignment may not be reachable\n");
	      break;
	    }
	}
    }

  /* Check if we can possibly peel the loop.  */
  if (!vect_can_advance_ivs_p (loop_vinfo)
      || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
      || loop->inner)
    do_peeling = false;

  /* Candidate peeling strategies; costs start out at INT_MAX with a zero
     same-alignment count and are refined below when costing is possible.  */
  struct _vect_peel_extended_info peel_for_known_alignment;
  struct _vect_peel_extended_info peel_for_unknown_alignment;
  struct _vect_peel_extended_info best_peel;

  peel_for_unknown_alignment.inside_cost = INT_MAX;
  peel_for_unknown_alignment.outside_cost = INT_MAX;
  peel_for_unknown_alignment.peel_info.count = 0;

  if (do_peeling
      && one_misalignment_unknown)
    {
      /* Check if the target requires to prefer stores over loads, i.e., if
	 misaligned stores are more expensive than misaligned loads (taking
	 drs with same alignment into account).  */
      unsigned int load_inside_cost = 0;
      unsigned int load_outside_cost = 0;
      unsigned int store_inside_cost = 0;
      unsigned int store_outside_cost = 0;
      unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;

      stmt_vector_for_cost dummy;
      dummy.create (2);
      vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
				      &load_inside_cost,
				      &load_outside_cost,
				      &dummy, &dummy, estimated_npeels);
      dummy.release ();

      if (first_store)
	{
	  dummy.create (2);
	  vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
					  &store_inside_cost,
					  &store_outside_cost,
					  &dummy, &dummy,
					  estimated_npeels);
	  dummy.release ();
	}
      else
	{
	  store_inside_cost = INT_MAX;
	  store_outside_cost = INT_MAX;
	}

      if (load_inside_cost > store_inside_cost
	  || (load_inside_cost == store_inside_cost
	      && load_outside_cost > store_outside_cost))
	{
	  dr0_info = first_store;
	  dr0_same_align_drs = first_store_same_align_drs;
	  peel_for_unknown_alignment.inside_cost = store_inside_cost;
	  peel_for_unknown_alignment.outside_cost = store_outside_cost;
	}
      else
	{
	  peel_for_unknown_alignment.inside_cost = load_inside_cost;
	  peel_for_unknown_alignment.outside_cost = load_outside_cost;
	}

      stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
      prologue_cost_vec.create (2);
      epilogue_cost_vec.create (2);

      int dummy2;
      peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
	(loop_vinfo, estimated_npeels, &dummy2,
	 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
	 &prologue_cost_vec, &epilogue_cost_vec);

      prologue_cost_vec.release ();
      epilogue_cost_vec.release ();

      peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
    }

  peel_for_unknown_alignment.peel_info.npeel = 0;
  peel_for_unknown_alignment.peel_info.dr_info = dr0_info;

  /* Start with peeling for unknown alignment as the candidate to beat.  */
  best_peel = peel_for_unknown_alignment;

  peel_for_known_alignment.inside_cost = INT_MAX;
  peel_for_known_alignment.outside_cost = INT_MAX;
  peel_for_known_alignment.peel_info.count = 0;
  peel_for_known_alignment.peel_info.dr_info = NULL;

  if (do_peeling && one_misalignment_known)
    {
      /* Peeling is possible, but there is no data access that is not supported
	 unless aligned.  So we try to choose the best possible peeling from
	 the hash table.  */
      peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
	(&peeling_htab, loop_vinfo);
    }

  /* Compare costs of peeling for known and unknown alignment.  */
  if (peel_for_known_alignment.peel_info.dr_info != NULL
      && peel_for_unknown_alignment.inside_cost
	 >= peel_for_known_alignment.inside_cost)
    {
      best_peel = peel_for_known_alignment;

      /* If the best peeling for known alignment has NPEEL == 0, perform no
	 peeling at all except if there is an unsupportable dr that we can
	 align.  */
      if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
	do_peeling = false;
    }

  /* If there is an unsupportable data ref, prefer this over all choices so far
     since we'd have to discard a chosen peeling except when it accidentally
     aligned the unsupportable data ref.  */
  if (one_dr_unsupportable)
    dr0_info = unsupportable_dr_info;
  else if (do_peeling)
    {
      /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
	 TODO: Use nopeel_outside_cost or get rid of it?  */
      unsigned nopeel_inside_cost = 0;
      unsigned nopeel_outside_cost = 0;

      stmt_vector_for_cost dummy;
      dummy.create (2);
      vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
				      &nopeel_outside_cost, &dummy, &dummy, 0);
      dummy.release ();

      /* Add epilogue costs.  As we do not peel for alignment here, no prologue
	 costs will be recorded.  */
      stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
      prologue_cost_vec.create (2);
      epilogue_cost_vec.create (2);

      int dummy2;
      nopeel_outside_cost += vect_get_known_peeling_cost
	(loop_vinfo, 0, &dummy2,
	 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
	 &prologue_cost_vec, &epilogue_cost_vec);

      prologue_cost_vec.release ();
      epilogue_cost_vec.release ();

      npeel = best_peel.peel_info.npeel;
      dr0_info = best_peel.peel_info.dr_info;

      /* If no peeling is not more expensive than the best peeling we
	 have so far, don't perform any peeling.  */
      if (nopeel_inside_cost <= best_peel.inside_cost)
	do_peeling = false;
    }

  /* Peeling is still a candidate: compute the concrete peeling factor and
     run the remaining cost-model checks before committing to it.  */
  if (do_peeling)
    {
      stmt_vec_info stmt_info = dr0_info->stmt;
      if (known_alignment_for_access_p (dr0_info,
					STMT_VINFO_VECTYPE (stmt_info)))
	{
	  bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
						size_zero_node) < 0;
	  if (!npeel)
	    {
	      /* Since it's known at compile time, compute the number of
		 iterations in the peeled loop (the peeling factor) for use in
		 updating DR_MISALIGNMENT values.  The peeling factor is the
		 vectorization factor minus the misalignment as an element
		 count.  */
	      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
	      poly_int64 off = 0;
	      if (negative)
		off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
		       * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
	      unsigned int mis
		= dr_misalignment (dr0_info, vectype, off);
	      mis = negative ? mis : -mis;
	      /* If known_alignment_for_access_p then we have set
		 DR_MISALIGNMENT which is only done if we know it at compiler
		 time, so it is safe to assume target alignment is constant.
	       */
	      unsigned int target_align =
		DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
	      npeel = ((mis & (target_align - 1))
		       / vect_get_scalar_dr_size (dr0_info));
	    }

	  /* For interleaved data access every iteration accesses all the
	     members of the group, therefore we divide the number of iterations
	     by the group size.  */
	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	    npeel /= DR_GROUP_SIZE (stmt_info);

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Try peeling by %d\n", npeel);
	}

      /* Ensure that all datarefs can be vectorized after the peel.  */
      if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
	do_peeling = false;

      /* Check if all datarefs are supportable and log.  */
      if (do_peeling
	  && npeel == 0
	  && known_alignment_for_access_p (dr0_info,
					   STMT_VINFO_VECTYPE (stmt_info)))
	return opt_result::success ();

      /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
      if (do_peeling)
	{
	  unsigned max_allowed_peel
	    = param_vect_max_peeling_for_alignment;
	  if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
	    max_allowed_peel = 0;
	  if (max_allowed_peel != (unsigned)-1)
	    {
	      unsigned max_peel = npeel;
	      if (max_peel == 0)
		{
		  poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
		  unsigned HOST_WIDE_INT target_align_c;
		  if (target_align.is_constant (&target_align_c))
		    max_peel =
		      target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
		  else
		    {
		      do_peeling = false;
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_NOTE, vect_location,
			  "Disable peeling, max peels set and vector"
			  " alignment unknown\n");
		    }
		}
	      if (max_peel > max_allowed_peel)
		{
		  do_peeling = false;
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
			"Disable peeling, max peels reached: %d\n", max_peel);
		}
	    }
	}

      /* Cost model #2 - if peeling may result in a remaining loop not
	 iterating enough to be vectorized then do not peel.  Since this
	 is a cost heuristic rather than a correctness decision, use the
	 most likely runtime value for variable vectorization factors.  */
      if (do_peeling
	  && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
	{
	  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
	  unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
	  if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
	      < assumed_vf + max_peel)
	    do_peeling = false;
	}

      if (do_peeling)
	{
	  /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
	     If the misalignment of DR_i is identical to that of dr0 then set
	     DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
	     dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
	     by the peeling factor times the element size of DR_i (MOD the
	     vectorization factor times the size).  Otherwise, the
	     misalignment of DR_i must be set to unknown.  */
	  FOR_EACH_VEC_ELT (datarefs, i, dr)
	    if (dr != dr0_info->dr)
	      {
		dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
		if (!vect_relevant_for_alignment_p (dr_info))
		  continue;

		vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
	      }

	  LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
	  if (npeel)
	    LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
	  else
	    LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
	  SET_DR_MISALIGNMENT (dr0_info,
			       vect_dr_misalign_for_aligned_access (dr0_info));
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Alignment of access forced using peeling.\n");
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Peeling for alignment will be applied.\n");
	    }

	  /* The inside-loop cost will be accounted for in vectorizable_load
	     and vectorizable_store correctly with adjusted alignments.
	     Drop the body_cst_vec on the floor here.  */
	  return opt_result::success ();
	}
    }

  /* (2) Versioning to force alignment.  */

  /* Try versioning if:
     1) optimize loop for speed and the cost-model is not cheap
     2) there is at least one unsupported misaligned data ref with an unknown
        misalignment, and
     3) all misaligned data refs with a known misalignment are supported, and
     4) the number of runtime alignment checks is within reason.  */

  do_versioning
    = (optimize_loop_nest_for_speed_p (loop)
       && !loop->inner /* FORNOW */
       && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);

  if (do_versioning)
    {
      FOR_EACH_VEC_ELT (datarefs, i, dr)
	{
	  dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
	  if (!vect_relevant_for_alignment_p (dr_info))
	    continue;

	  stmt_vec_info stmt_info = dr_info->stmt;
	  if (STMT_VINFO_STRIDED_P (stmt_info))
	    {
	      do_versioning = false;
	      break;
	    }

	  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
	  bool negative = tree_int_cst_compare (DR_STEP (dr),
						size_zero_node) < 0;
	  poly_int64 off = 0;
	  if (negative)
	    off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
		   * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
	  int misalignment;
	  if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
	    continue;

	  enum dr_alignment_support supportable_dr_alignment
	    = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
					     misalignment);
	  if (supportable_dr_alignment == dr_unaligned_unsupported)
	    {
	      if (misalignment != DR_MISALIGNMENT_UNKNOWN
		  || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
		      >= (unsigned) param_vect_max_version_for_alignment_checks))
		{
		  do_versioning = false;
		  break;
		}

	      /* At present we don't support versioning for alignment
		 with variable VF, since there's no guarantee that the
		 VF is a power of two.  We could relax this if we added
		 a way of enforcing a power-of-two size.  */
	      unsigned HOST_WIDE_INT size;
	      if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
		{
		  do_versioning = false;
		  break;
		}

	      /* Forcing alignment in the first iteration is no good if
		 we don't keep it across iterations.  For now, just disable
		 versioning in this case.
		 ?? We could actually unroll the loop to achieve the required
		 overall step alignment, and forcing the alignment could be
		 done by doing some iterations of the non-vectorized loop.  */
	      if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
			       * DR_STEP_ALIGNMENT (dr),
			       DR_TARGET_ALIGNMENT (dr_info)))
		{
		  do_versioning = false;
		  break;
		}

	      /* The rightmost bits of an aligned address must be zeros.
		 Construct the mask needed for this test.  For example,
		 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
		 mask must be 15 = 0xf.  */
	      int mask = size - 1;

	      /* FORNOW: use the same mask to test all potentially unaligned
		 references in the loop.  */
	      if (LOOP_VINFO_PTR_MASK (loop_vinfo)
		  && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
		{
		  do_versioning = false;
		  break;
		}

	      LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
	      LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
	    }
	}

      /* Versioning requires at least one misaligned data reference.  */
      if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
	do_versioning = false;
      else if (!do_versioning)
	LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
    }

  if (do_versioning)
    {
      const vec<stmt_vec_info> &may_misalign_stmts
	= LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
      stmt_vec_info stmt_info;

      /* It can now be assumed that the data references in the statements
	 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
	 of the loop being vectorized.  */
      FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
	{
	  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
	  SET_DR_MISALIGNMENT (dr_info,
			       vect_dr_misalign_for_aligned_access (dr_info));
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Alignment of access forced using versioning.\n");
	}

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Versioning for alignment will be applied.\n");

      /* Peeling and versioning can't be done together at this time.  */
      gcc_assert (! (do_peeling && do_versioning));

      return opt_result::success ();
    }

  /* This point is reached if neither peeling nor versioning is being done.  */
  gcc_assert (! (do_peeling || do_versioning));

  return opt_result::success ();
}
2523
2524
2525 /* Function vect_analyze_data_refs_alignment
2526
2527 Analyze the alignment of the data-references in the loop.
2528 Return FALSE if a data reference is found that cannot be vectorized. */
2529
2530 opt_result
2531 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2532 {
2533 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2534
2535 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2536 struct data_reference *dr;
2537 unsigned int i;
2538
2539 vect_record_base_alignments (loop_vinfo);
2540 FOR_EACH_VEC_ELT (datarefs, i, dr)
2541 {
2542 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2543 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2544 {
2545 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2546 && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2547 continue;
2548 vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2549 STMT_VINFO_VECTYPE (dr_info->stmt));
2550 }
2551 }
2552
2553 return opt_result::success ();
2554 }
2555
2556
2557 /* Analyze alignment of DRs of stmts in NODE. */
2558
2559 static bool
2560 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2561 {
2562 /* Alignment is maintained in the first element of the group. */
2563 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2564 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2565 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2566 tree vectype = SLP_TREE_VECTYPE (node);
2567 poly_uint64 vector_alignment
2568 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2569 BITS_PER_UNIT);
2570 if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2571 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2572 /* Re-analyze alignment when we're facing a vectorization with a bigger
2573 alignment requirement. */
2574 else if (known_lt (dr_info->target_alignment, vector_alignment))
2575 {
2576 poly_uint64 old_target_alignment = dr_info->target_alignment;
2577 int old_misalignment = dr_info->misalignment;
2578 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2579 /* But keep knowledge about a smaller alignment. */
2580 if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2581 && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2582 {
2583 dr_info->target_alignment = old_target_alignment;
2584 dr_info->misalignment = old_misalignment;
2585 }
2586 }
2587 /* When we ever face unordered target alignments the first one wins in terms
2588 of analyzing and the other will become unknown in dr_misalignment. */
2589 return true;
2590 }
2591
2592 /* Function vect_slp_analyze_instance_alignment
2593
2594 Analyze the alignment of the data-references in the SLP instance.
2595 Return FALSE if a data reference is found that cannot be vectorized. */
2596
2597 bool
2598 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2599 slp_instance instance)
2600 {
2601 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2602
2603 slp_tree node;
2604 unsigned i;
2605 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2606 if (! vect_slp_analyze_node_alignment (vinfo, node))
2607 return false;
2608
2609 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2610 && ! vect_slp_analyze_node_alignment
2611 (vinfo, SLP_INSTANCE_TREE (instance)))
2612 return false;
2613
2614 return true;
2615 }
2616
2617
2618 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2619 accesses of legal size, step, etc. Detect gaps, single element
2620 interleaving, and other special cases. Set grouped access info.
2621 Collect groups of strided stores for further use in SLP analysis.
2622 Worker for vect_analyze_group_access. */
2623
2624 static bool
2625 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2626 {
2627 data_reference *dr = dr_info->dr;
2628 tree step = DR_STEP (dr);
2629 tree scalar_type = TREE_TYPE (DR_REF (dr));
2630 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2631 stmt_vec_info stmt_info = dr_info->stmt;
2632 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2633 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2634 HOST_WIDE_INT dr_step = -1;
2635 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2636 bool slp_impossible = false;
2637
2638 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2639 size of the interleaving group (including gaps). */
2640 if (tree_fits_shwi_p (step))
2641 {
2642 dr_step = tree_to_shwi (step);
2643 /* Check that STEP is a multiple of type size. Otherwise there is
2644 a non-element-sized gap at the end of the group which we
2645 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2646 ??? As we can handle non-constant step fine here we should
2647 simply remove uses of DR_GROUP_GAP between the last and first
2648 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2649 simply not include that gap. */
2650 if ((dr_step % type_size) != 0)
2651 {
2652 if (dump_enabled_p ())
2653 dump_printf_loc (MSG_NOTE, vect_location,
2654 "Step %T is not a multiple of the element size"
2655 " for %T\n",
2656 step, DR_REF (dr));
2657 return false;
2658 }
2659 groupsize = absu_hwi (dr_step) / type_size;
2660 }
2661 else
2662 groupsize = 0;
2663
2664 /* Not consecutive access is possible only if it is a part of interleaving. */
2665 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2666 {
2667 /* Check if it this DR is a part of interleaving, and is a single
2668 element of the group that is accessed in the loop. */
2669
2670 /* Gaps are supported only for loads. STEP must be a multiple of the type
2671 size. */
2672 if (DR_IS_READ (dr)
2673 && (dr_step % type_size) == 0
2674 && groupsize > 0
2675 /* This could be UINT_MAX but as we are generating code in a very
2676 inefficient way we have to cap earlier.
2677 See PR91403 for example. */
2678 && groupsize <= 4096)
2679 {
2680 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2681 DR_GROUP_SIZE (stmt_info) = groupsize;
2682 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2683 if (dump_enabled_p ())
2684 dump_printf_loc (MSG_NOTE, vect_location,
2685 "Detected single element interleaving %T"
2686 " step %T\n",
2687 DR_REF (dr), step);
2688
2689 return true;
2690 }
2691
2692 if (dump_enabled_p ())
2693 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2694 "not consecutive access %G", stmt_info->stmt);
2695
2696 if (bb_vinfo)
2697 {
2698 /* Mark the statement as unvectorizable. */
2699 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2700 return true;
2701 }
2702
2703 if (dump_enabled_p ())
2704 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2705 STMT_VINFO_STRIDED_P (stmt_info) = true;
2706 return true;
2707 }
2708
2709 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2710 {
2711 /* First stmt in the interleaving chain. Check the chain. */
2712 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2713 struct data_reference *data_ref = dr;
2714 unsigned int count = 1;
2715 tree prev_init = DR_INIT (data_ref);
2716 HOST_WIDE_INT diff, gaps = 0;
2717
2718 /* By construction, all group members have INTEGER_CST DR_INITs. */
2719 while (next)
2720 {
2721 /* We never have the same DR multiple times. */
2722 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2723 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2724
2725 data_ref = STMT_VINFO_DATA_REF (next);
2726
2727 /* All group members have the same STEP by construction. */
2728 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2729
2730 /* Check that the distance between two accesses is equal to the type
2731 size. Otherwise, we have gaps. */
2732 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2733 - TREE_INT_CST_LOW (prev_init)) / type_size;
2734 if (diff < 1 || diff > UINT_MAX)
2735 {
2736 /* For artificial testcases with array accesses with large
2737 constant indices we can run into overflow issues which
2738 can end up fooling the groupsize constraint below so
2739 check the individual gaps (which are represented as
2740 unsigned int) as well. */
2741 if (dump_enabled_p ())
2742 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2743 "interleaved access with gap larger "
2744 "than representable\n");
2745 return false;
2746 }
2747 if (diff != 1)
2748 {
2749 /* FORNOW: SLP of accesses with gaps is not supported. */
2750 slp_impossible = true;
2751 if (DR_IS_WRITE (data_ref))
2752 {
2753 if (dump_enabled_p ())
2754 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2755 "interleaved store with gaps\n");
2756 return false;
2757 }
2758
2759 gaps += diff - 1;
2760 }
2761
2762 last_accessed_element += diff;
2763
2764 /* Store the gap from the previous member of the group. If there is no
2765 gap in the access, DR_GROUP_GAP is always 1. */
2766 DR_GROUP_GAP (next) = diff;
2767
2768 prev_init = DR_INIT (data_ref);
2769 next = DR_GROUP_NEXT_ELEMENT (next);
2770 /* Count the number of data-refs in the chain. */
2771 count++;
2772 }
2773
2774 if (groupsize == 0)
2775 groupsize = count + gaps;
2776
2777 /* This could be UINT_MAX but as we are generating code in a very
2778 inefficient way we have to cap earlier. See PR78699 for example. */
2779 if (groupsize > 4096)
2780 {
2781 if (dump_enabled_p ())
2782 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2783 "group is too large\n");
2784 return false;
2785 }
2786
2787 /* Check that the size of the interleaving is equal to count for stores,
2788 i.e., that there are no gaps. */
2789 if (groupsize != count
2790 && !DR_IS_READ (dr))
2791 {
2792 groupsize = count;
2793 STMT_VINFO_STRIDED_P (stmt_info) = true;
2794 }
2795
2796 /* If there is a gap after the last load in the group it is the
2797 difference between the groupsize and the last accessed
2798 element.
2799 When there is no gap, this difference should be 0. */
2800 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2801
2802 DR_GROUP_SIZE (stmt_info) = groupsize;
2803 if (dump_enabled_p ())
2804 {
2805 dump_printf_loc (MSG_NOTE, vect_location,
2806 "Detected interleaving ");
2807 if (DR_IS_READ (dr))
2808 dump_printf (MSG_NOTE, "load ");
2809 else if (STMT_VINFO_STRIDED_P (stmt_info))
2810 dump_printf (MSG_NOTE, "strided store ");
2811 else
2812 dump_printf (MSG_NOTE, "store ");
2813 dump_printf (MSG_NOTE, "of size %u\n",
2814 (unsigned)groupsize);
2815 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2816 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2817 while (next)
2818 {
2819 if (DR_GROUP_GAP (next) != 1)
2820 dump_printf_loc (MSG_NOTE, vect_location,
2821 "\t<gap of %d elements>\n",
2822 DR_GROUP_GAP (next) - 1);
2823 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2824 next = DR_GROUP_NEXT_ELEMENT (next);
2825 }
2826 if (DR_GROUP_GAP (stmt_info) != 0)
2827 dump_printf_loc (MSG_NOTE, vect_location,
2828 "\t<gap of %d elements>\n",
2829 DR_GROUP_GAP (stmt_info));
2830 }
2831
2832 /* SLP: create an SLP data structure for every interleaving group of
2833 stores for further analysis in vect_analyse_slp. */
2834 if (DR_IS_WRITE (dr) && !slp_impossible)
2835 {
2836 if (loop_vinfo)
2837 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2838 if (bb_vinfo)
2839 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2840 }
2841 }
2842
2843 return true;
2844 }
2845
2846 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2847 accesses of legal size, step, etc. Detect gaps, single element
2848 interleaving, and other special cases. Set grouped access info.
2849 Collect groups of strided stores for further use in SLP analysis. */
2850
2851 static bool
2852 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2853 {
2854 if (!vect_analyze_group_access_1 (vinfo, dr_info))
2855 {
2856 /* Dissolve the group if present. */
2857 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2858 while (stmt_info)
2859 {
2860 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2861 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2862 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2863 stmt_info = next;
2864 }
2865 return false;
2866 }
2867 return true;
2868 }
2869
/* Analyze the access pattern of the data-reference DR_INFO.
   In case of non-consecutive accesses call vect_analyze_group_access() to
   analyze groups of accesses.  Return true if the access is supported,
   false otherwise.  May set DR_GROUP_FIRST_ELEMENT, STMT_VINFO_STRIDED_P
   and related fields as a side effect.  */

static bool
vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
{
  data_reference *dr = dr_info->dr;
  tree step = DR_STEP (dr);
  tree scalar_type = TREE_TYPE (DR_REF (dr));
  stmt_vec_info stmt_info = dr_info->stmt;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  class loop *loop = NULL;

  /* Gather/scatter accesses do not constrain the step.  */
  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
    return true;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* In loop vectorization a data-ref without an analyzable step cannot
     be handled.  */
  if (loop_vinfo && !step)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "bad data-ref access in loop\n");
      return false;
    }

  /* Allow loads with zero step in inner-loop vectorization.  */
  if (loop_vinfo && integer_zerop (step))
    {
      DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
      if (!nested_in_vect_loop_p (loop, stmt_info))
	return DR_IS_READ (dr);
      /* Allow references with zero step for outer loops marked
	 with pragma omp simd only - it guarantees absence of
	 loop-carried dependencies between inner loop iterations.  */
      if (loop->safelen < 2)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "zero step in inner loop of nest\n");
	  return false;
	}
      /* Otherwise fall through and analyze using the outer-loop step.  */
    }

  if (loop && nested_in_vect_loop_p (loop, stmt_info))
    {
      /* Interleaved accesses are not yet supported within outer-loop
	 vectorization for references in the inner-loop.  */
      DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;

      /* For the rest of the analysis we use the outer-loop step.  */
      step = STMT_VINFO_DR_STEP (stmt_info);
      if (integer_zerop (step))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "zero step in outer loop.\n");
	  /* A zero outer-loop step is invariant per outer iteration;
	     only loads can be handled that way.  */
	  return DR_IS_READ (dr);
	}
    }

  /* Consecutive?  */
  if (TREE_CODE (step) == INTEGER_CST)
    {
      HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
      /* Consecutive forward (step == element size) or consecutive
	 backward (step == -element size) accesses need no grouping.  */
      if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
	  || (dr_step < 0
	      && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
	{
	  /* Mark that it is not interleaving.  */
	  DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
	  return true;
	}
    }

  if (loop && nested_in_vect_loop_p (loop, stmt_info))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "grouped access in outer loop.\n");
      return false;
    }


  /* Assume this is a DR handled by non-constant strided load case.  */
  if (TREE_CODE (step) != INTEGER_CST)
    return (STMT_VINFO_STRIDED_P (stmt_info)
	    && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
		|| vect_analyze_group_access (vinfo, dr_info)));

  /* Not consecutive access - check if it's a part of interleaving group.  */
  return vect_analyze_group_access (vinfo, dr_info);
}
2965
/* Compare two data-references DRA and DRB to group them into chunks
   suitable for grouping.  qsort-style comparator: returns -1/0/1.
   The comparison order below (group, base, offset, read/write, size,
   step, init, stmt UID) is what makes chain building linear.  */

static int
dr_group_sort_cmp (const void *dra_, const void *drb_)
{
  dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
  dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
  data_reference_p dra = dra_info->dr;
  data_reference_p drb = drb_info->dr;
  int cmp;

  /* Stabilize sort.  */
  if (dra == drb)
    return 0;

  /* DRs with different group IDs never belong to the same group.  */
  if (dra_info->group != drb_info->group)
    return dra_info->group < drb_info->group ? -1 : 1;

  /* Ordering of DRs according to base.  */
  cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
			       DR_BASE_ADDRESS (drb));
  if (cmp != 0)
    return cmp;

  /* And according to DR_OFFSET.  */
  cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
  if (cmp != 0)
    return cmp;

  /* Put reads before writes.  */
  if (DR_IS_READ (dra) != DR_IS_READ (drb))
    return DR_IS_READ (dra) ? -1 : 1;

  /* Then sort after access size.  */
  cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
			       TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
  if (cmp != 0)
    return cmp;

  /* And after step.  */
  cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
  if (cmp != 0)
    return cmp;

  /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
  cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
  if (cmp == 0)
    return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
  return cmp;
}
3018
3019 /* If OP is the result of a conversion, return the unconverted value,
3020 otherwise return null. */
3021
3022 static tree
3023 strip_conversion (tree op)
3024 {
3025 if (TREE_CODE (op) != SSA_NAME)
3026 return NULL_TREE;
3027 gimple *stmt = SSA_NAME_DEF_STMT (op);
3028 if (!is_gimple_assign (stmt)
3029 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3030 return NULL_TREE;
3031 return gimple_assign_rhs1 (stmt);
3032 }
3033
3034 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3035 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3036 be grouped in SLP mode. */
3037
3038 static bool
3039 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3040 bool allow_slp_p)
3041 {
3042 if (gimple_assign_single_p (stmt1_info->stmt))
3043 return gimple_assign_single_p (stmt2_info->stmt);
3044
3045 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3046 if (call1 && gimple_call_internal_p (call1))
3047 {
3048 /* Check for two masked loads or two masked stores. */
3049 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3050 if (!call2 || !gimple_call_internal_p (call2))
3051 return false;
3052 internal_fn ifn = gimple_call_internal_fn (call1);
3053 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3054 return false;
3055 if (ifn != gimple_call_internal_fn (call2))
3056 return false;
3057
3058 /* Check that the masks are the same. Cope with casts of masks,
3059 like those created by build_mask_conversion. */
3060 tree mask1 = gimple_call_arg (call1, 2);
3061 tree mask2 = gimple_call_arg (call2, 2);
3062 if (!operand_equal_p (mask1, mask2, 0)
3063 && (ifn == IFN_MASK_STORE || !allow_slp_p))
3064 {
3065 mask1 = strip_conversion (mask1);
3066 if (!mask1)
3067 return false;
3068 mask2 = strip_conversion (mask2);
3069 if (!mask2)
3070 return false;
3071 if (!operand_equal_p (mask1, mask2, 0))
3072 return false;
3073 }
3074 return true;
3075 }
3076
3077 return false;
3078 }
3079
/* Function vect_analyze_data_ref_accesses.

   Analyze the access pattern of all the data references in the loop.

   FORNOW: the only access pattern that is considered vectorizable is a
   simple step 1 (consecutive) access.

   FORNOW: handle only arrays and pointer accesses.

   Works in three phases: (1) sort a copy of the datarefs and link
   neighboring compatible accesses into interleaving chains, (2) split
   chains that ended up containing duplicate accesses, (3) run
   vect_analyze_data_ref_access on every dataref.  */

opt_result
vect_analyze_data_ref_accesses (vec_info *vinfo,
				vec<int> *dataref_groups)
{
  unsigned int i;
  vec<data_reference_p> datarefs = vinfo->shared->datarefs;

  DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");

  if (datarefs.is_empty ())
    return opt_result::success ();

  /* Sort the array of datarefs to make building the interleaving chains
     linear.  Don't modify the original vector's order, it is needed for
     determining what dependencies are reversed.  */
  vec<dr_vec_info *> datarefs_copy;
  datarefs_copy.create (datarefs.length ());
  for (unsigned i = 0; i < datarefs.length (); i++)
    {
      dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
      /* If the caller computed DR grouping use that, otherwise group by
	 basic blocks.  */
      if (dataref_groups)
	dr_info->group = (*dataref_groups)[i];
      else
	dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
      datarefs_copy.quick_push (dr_info);
    }
  datarefs_copy.qsort (dr_group_sort_cmp);
  /* Group heads whose chain contains duplicate accesses, to be split
     in the fixup phase below.  */
  hash_set<stmt_vec_info> to_fixup;

  /* Build the interleaving chains.  */
  for (i = 0; i < datarefs_copy.length () - 1;)
    {
      dr_vec_info *dr_info_a = datarefs_copy[i];
      data_reference_p dra = dr_info_a->dr;
      int dra_group_id = dr_info_a->group;
      stmt_vec_info stmtinfo_a = dr_info_a->stmt;
      stmt_vec_info lastinfo = NULL;
      if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
	{
	  ++i;
	  continue;
	}
      /* Extend the chain headed by STMTINFO_A with as many following
	 compatible accesses as possible; any "break" below ends the
	 current chain and starts a new head at index I.  */
      for (i = i + 1; i < datarefs_copy.length (); ++i)
	{
	  dr_vec_info *dr_info_b = datarefs_copy[i];
	  data_reference_p drb = dr_info_b->dr;
	  int drb_group_id = dr_info_b->group;
	  stmt_vec_info stmtinfo_b = dr_info_b->stmt;
	  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
	      || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
	    break;

	  /* ??? Imperfect sorting (non-compatible types, non-modulo
	     accesses, same accesses) can lead to a group to be artificially
	     split here as we don't just skip over those.  If it really
	     matters we can push those to a worklist and re-iterate
	     over them.  Then we can just skip ahead to the next DR here.  */

	  /* DRs in a different DR group should not be put into the same
	     interleaving group.  */
	  if (dra_group_id != drb_group_id)
	    break;

	  /* Check that the data-refs have same first location (except init)
	     and they are both either store or load (not load and store,
	     not masked loads or stores).  */
	  if (DR_IS_READ (dra) != DR_IS_READ (drb)
	      || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
					DR_BASE_ADDRESS (drb)) != 0
	      || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
	      || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
	    break;

	  /* Check that the data-refs have the same constant size.  */
	  tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
	  tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
	  if (!tree_fits_uhwi_p (sza)
	      || !tree_fits_uhwi_p (szb)
	      || !tree_int_cst_equal (sza, szb))
	    break;

	  /* Check that the data-refs have the same step.  */
	  if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
	    break;

	  /* Check the types are compatible.
	     ??? We don't distinguish this during sorting.  */
	  if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
				   TREE_TYPE (DR_REF (drb))))
	    break;

	  /* Check that the DR_INITs are compile-time constants.  */
	  if (!tree_fits_shwi_p (DR_INIT (dra))
	      || !tree_fits_shwi_p (DR_INIT (drb)))
	    break;

	  /* Different .GOMP_SIMD_LANE calls still give the same lane,
	     just hold extra information.  */
	  if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
	      && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
	      && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
	    break;

	  /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
	  HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
	  HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
	  HOST_WIDE_INT init_prev
	    = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
	  gcc_assert (init_a <= init_b
		      && init_a <= init_prev
		      && init_prev <= init_b);

	  /* Do not place the same access in the interleaving chain twice.  */
	  if (init_b == init_prev)
	    {
	      gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
			  < gimple_uid (DR_STMT (drb)));
	      /* Simply link in duplicates and fix up the chain below.  */
	    }
	  else
	    {
	      /* If init_b == init_a + the size of the type * k, we have an
		 interleaving, and DRA is accessed before DRB.  */
	      unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
	      if (type_size_a == 0
		  || (((unsigned HOST_WIDE_INT)init_b - init_a)
		      % type_size_a != 0))
		break;

	      /* If we have a store, the accesses are adjacent.  This splits
		 groups into chunks we support (we don't support vectorization
		 of stores with gaps).  */
	      if (!DR_IS_READ (dra)
		  && (((unsigned HOST_WIDE_INT)init_b - init_prev)
		      != type_size_a))
		break;

	      /* If the step (if not zero or non-constant) is smaller than the
		 difference between data-refs' inits this splits groups into
		 suitable sizes.  */
	      if (tree_fits_shwi_p (DR_STEP (dra)))
		{
		  unsigned HOST_WIDE_INT step
		    = absu_hwi (tree_to_shwi (DR_STEP (dra)));
		  if (step != 0
		      && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
		    break;
		}
	    }

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     DR_IS_READ (dra)
			     ? "Detected interleaving load %T and %T\n"
			     : "Detected interleaving store %T and %T\n",
			     DR_REF (dra), DR_REF (drb));

	  /* Link the found element into the group list.  */
	  if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
	    {
	      DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
	      lastinfo = stmtinfo_a;
	    }
	  DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
	  DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
	  lastinfo = stmtinfo_b;

	  if (! STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
	    {
	      STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
		= !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);

	      if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Load suitable for SLP vectorization only.\n");
	    }

	  if (init_b == init_prev
	      && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
	      && dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Queuing group with duplicate access for fixup\n");
	}
    }

  /* Fixup groups with duplicate entries by splitting it.  */
  while (1)
    {
      hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
      if (!(it != to_fixup.end ()))
	break;
      stmt_vec_info grp = *it;
      to_fixup.remove (grp);

      /* Find the earliest duplicate group member.  */
      /* -1u means "no duplicate found" (no stmt UID compares >= it).  */
      unsigned first_duplicate = -1u;
      stmt_vec_info next, g = grp;
      while ((next = DR_GROUP_NEXT_ELEMENT (g)))
	{
	  if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
				  DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
	      && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
	    first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
	  g = next;
	}
      if (first_duplicate == -1U)
	continue;

      /* Then move all stmts after the first duplicate to a new group.
	 Note this is a heuristic but one with the property that *it
	 is fixed up completely.  */
      g = grp;
      stmt_vec_info newgroup = NULL, ng = grp;
      while ((next = DR_GROUP_NEXT_ELEMENT (g)))
	{
	  if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
	    {
	      /* Unlink NEXT from the old chain and append it to the
		 new group's chain.  */
	      DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
	      if (!newgroup)
		{
		  newgroup = next;
		  STMT_VINFO_SLP_VECT_ONLY (newgroup)
		    = STMT_VINFO_SLP_VECT_ONLY (grp);
		}
	      else
		DR_GROUP_NEXT_ELEMENT (ng) = next;
	      ng = next;
	      DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
	    }
	  else
	    g = DR_GROUP_NEXT_ELEMENT (g);
	}
      DR_GROUP_NEXT_ELEMENT (ng) = NULL;

      /* Fixup the new group which still may contain duplicates.  */
      to_fixup.add (newgroup);
    }

  dr_vec_info *dr_info;
  FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
    {
      if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
	  && !vect_analyze_data_ref_access (vinfo, dr_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: complicated access pattern.\n");

	  if (is_a <bb_vec_info> (vinfo))
	    {
	      /* Mark the statement as not vectorizable.  */
	      STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
	      continue;
	    }
	  else
	    {
	      datarefs_copy.release ();
	      return opt_result::failure_at (dr_info->stmt->stmt,
					     "not vectorized:"
					     " complicated access pattern.\n");
	    }
	}
    }

  datarefs_copy.release ();
  return opt_result::success ();
}
3359
3360 /* Function vect_vfa_segment_size.
3361
3362 Input:
3363 DR_INFO: The data reference.
3364 LENGTH_FACTOR: segment length to consider.
3365
3366 Return a value suitable for the dr_with_seg_len::seg_len field.
3367 This is the "distance travelled" by the pointer from the first
3368 iteration in the segment to the last. Note that it does not include
3369 the size of the access; in effect it only describes the first byte. */
3370
3371 static tree
3372 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3373 {
3374 length_factor = size_binop (MINUS_EXPR,
3375 fold_convert (sizetype, length_factor),
3376 size_one_node);
3377 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3378 length_factor);
3379 }
3380
3381 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3382 gives the worst-case number of bytes covered by the segment. */
3383
3384 static unsigned HOST_WIDE_INT
3385 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3386 {
3387 stmt_vec_info stmt_vinfo = dr_info->stmt;
3388 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3389 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3390 unsigned HOST_WIDE_INT access_size = ref_size;
3391 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3392 {
3393 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3394 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3395 }
3396 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3397 int misalignment;
3398 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3399 && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3400 && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3401 == dr_explicit_realign_optimized))
3402 {
3403 /* We might access a full vector's worth. */
3404 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3405 }
3406 return access_size;
3407 }
3408
3409 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3410 describes. */
3411
3412 static unsigned int
3413 vect_vfa_align (dr_vec_info *dr_info)
3414 {
3415 return dr_alignment (dr_info->dr);
3416 }
3417
/* Function vect_compile_time_alias.

   Given data references A and B with equal base and offset, see whether
   the alias relation can be decided at compilation time.  Return 1 if
   it can and the references alias, 0 if it can and the references do
   not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
   SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
   of dr_with_seg_len::{seg_len,access_size} for A and B.  */

static int
vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
			 tree segment_length_a, tree segment_length_b,
			 unsigned HOST_WIDE_INT access_size_a,
			 unsigned HOST_WIDE_INT access_size_b)
{
  poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
  poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
  poly_uint64 const_length_a;
  poly_uint64 const_length_b;

  /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
     bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
     [a, a+12).  A negative segment length is negated and the start
     offset moved back by that amount so the range is expressed
     low-to-high.  */
  if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
    {
      const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
      offset_a -= const_length_a;
    }
  else
    const_length_a = tree_to_poly_uint64 (segment_length_a);
  if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
    {
      const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
      offset_b -= const_length_b;
    }
  else
    const_length_b = tree_to_poly_uint64 (segment_length_b);

  /* Segment lengths describe the first byte only; grow each by its
     access size to cover every byte actually touched.  */
  const_length_a += access_size_a;
  const_length_b += access_size_b;

  if (ranges_known_overlap_p (offset_a, const_length_a,
			      offset_b, const_length_b))
    return 1;

  if (!ranges_maybe_overlap_p (offset_a, const_length_a,
			       offset_b, const_length_b))
    return 0;

  /* Neither definitely overlapping nor definitely disjoint.  */
  return -1;
}
3469
3470 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3471 in DDR is >= VF. */
3472
3473 static bool
3474 dependence_distance_ge_vf (data_dependence_relation *ddr,
3475 unsigned int loop_depth, poly_uint64 vf)
3476 {
3477 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3478 || DDR_NUM_DIST_VECTS (ddr) == 0)
3479 return false;
3480
3481 /* If the dependence is exact, we should have limited the VF instead. */
3482 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3483
3484 unsigned int i;
3485 lambda_vector dist_v;
3486 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3487 {
3488 HOST_WIDE_INT dist = dist_v[loop_depth];
3489 if (dist != 0
3490 && !(dist > 0 && DDR_REVERSED_P (ddr))
3491 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3492 return false;
3493 }
3494
3495 if (dump_enabled_p ())
3496 dump_printf_loc (MSG_NOTE, vect_location,
3497 "dependence distance between %T and %T is >= VF\n",
3498 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3499
3500 return true;
3501 }
3502
3503 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3504
3505 static void
3506 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3507 {
3508 dump_printf (dump_kind, "%s (%T) >= ",
3509 lower_bound.unsigned_p ? "unsigned" : "abs",
3510 lower_bound.expr);
3511 dump_dec (dump_kind, lower_bound.min_value);
3512 }
3513
3514 /* Record that the vectorized loop requires the vec_lower_bound described
3515 by EXPR, UNSIGNED_P and MIN_VALUE. */
3516
3517 static void
3518 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3519 poly_uint64 min_value)
3520 {
3521 vec<vec_lower_bound> &lower_bounds
3522 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3523 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3524 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3525 {
3526 unsigned_p &= lower_bounds[i].unsigned_p;
3527 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3528 if (lower_bounds[i].unsigned_p != unsigned_p
3529 || maybe_lt (lower_bounds[i].min_value, min_value))
3530 {
3531 lower_bounds[i].unsigned_p = unsigned_p;
3532 lower_bounds[i].min_value = min_value;
3533 if (dump_enabled_p ())
3534 {
3535 dump_printf_loc (MSG_NOTE, vect_location,
3536 "updating run-time check to ");
3537 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3538 dump_printf (MSG_NOTE, "\n");
3539 }
3540 }
3541 return;
3542 }
3543
3544 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3545 if (dump_enabled_p ())
3546 {
3547 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3548 dump_lower_bound (MSG_NOTE, lower_bound);
3549 dump_printf (MSG_NOTE, "\n");
3550 }
3551 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3552 }
3553
3554 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3555 will span fewer than GAP bytes. */
3556
3557 static bool
3558 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3559 poly_int64 gap)
3560 {
3561 stmt_vec_info stmt_info = dr_info->stmt;
3562 HOST_WIDE_INT count
3563 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3564 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3565 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3566 return (estimated_poly_value (gap)
3567 <= count * vect_get_scalar_dr_size (dr_info));
3568 }
3569
/* Return true if we know that there is no alias between DR_INFO_A and
   DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
   When returning true, set *LOWER_BOUND_OUT to this N.  */

static bool
vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
				poly_uint64 *lower_bound_out)
{
  /* Check that there is a constant gap of known sign between DR_A
     and DR_B.  The two accesses must share base, offset and step so
     that only their (compile-time comparable) DR_INITs differ.  */
  data_reference *dr_a = dr_info_a->dr;
  data_reference *dr_b = dr_info_b->dr;
  poly_int64 init_a, init_b;
  if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
      || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
      || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
      || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
      || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
      || !ordered_p (init_a, init_b))
    return false;

  /* Sort DR_A and DR_B by the address they access.  */
  if (maybe_lt (init_b, init_a))
    {
      std::swap (init_a, init_b);
      std::swap (dr_info_a, dr_info_b);
      std::swap (dr_a, dr_b);
    }

  /* If the two accesses could be dependent within a scalar iteration,
     make sure that we'd retain their order.  */
  if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
      && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
    return false;

  /* There is no alias if abs (DR_STEP) is greater than or equal to
     the bytes spanned by the combination of the two accesses.  */
  *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
  return true;
}
3610
3611 /* Function vect_prune_runtime_alias_test_list.
3612
3613 Prune a list of ddrs to be tested at run-time by versioning for alias.
3614 Merge several alias checks into one if possible.
3615 Return FALSE if resulting list of ddrs is longer then allowed by
3616 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3617
3618 opt_result
3619 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3620 {
3621 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
     /* Object pairs already queued for an address-inequality test, so the
	same pair is not pushed twice.  */
3622 hash_set <tree_pair_hash> compared_objects;
3623
3624 const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3625 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3626 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3627 const vec<vec_object_pair> &check_unequal_addrs
3628 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3629 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3630 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3631
3632 ddr_p ddr;
3633 unsigned int i;
3634 tree length_factor;
3635
3636 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3637
3638 /* Step values are irrelevant for aliasing if the number of vector
3639 iterations is equal to the number of scalar iterations (which can
3640 happen for fully-SLP loops). */
3641 bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3642
3643 if (!vf_one_p)
3644 {
3645 /* Convert the checks for nonzero steps into bound tests. */
3646 tree value;
3647 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3648 vect_check_lower_bound (loop_vinfo, value, true, 1);
3649 }
3650
     /* No may-alias pairs means no runtime alias checks are needed.  */
3651 if (may_alias_ddrs.is_empty ())
3652 return opt_result::success ();
3653
3654 comp_alias_ddrs.create (may_alias_ddrs.length ());
3655
     /* Position of the loop being vectorized within the analyzed loop
	nest; used below when querying dependence distances.  */
3656 unsigned int loop_depth
3657 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3658 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3659
3660 /* First, we collect all data ref pairs for aliasing checks. */
3661 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3662 {
3663 poly_uint64 lower_bound;
3664 tree segment_length_a, segment_length_b;
3665 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3666 unsigned HOST_WIDE_INT align_a, align_b;
3667
3668 /* Ignore the alias if the VF we chose ended up being no greater
3669 than the dependence distance. */
3670 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3671 continue;
3672
     /* A ddr with known objects turns into a cheaper "addresses differ"
	check instead of a segment-overlap check.  */
3673 if (DDR_OBJECT_A (ddr))
3674 {
3675 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3676 if (!compared_objects.add (new_pair))
3677 {
3678 if (dump_enabled_p ())
3679 dump_printf_loc (MSG_NOTE, vect_location,
3680 "checking that %T and %T"
3681 " have different addresses\n",
3682 new_pair.first, new_pair.second);
3683 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3684 }
3685 continue;
3686 }
3687
3688 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3689 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3690
3691 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3692 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3693
3694 bool preserves_scalar_order_p
3695 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3696 bool ignore_step_p
3697 = (vf_one_p
3698 && (preserves_scalar_order_p
3699 || operand_equal_p (DR_STEP (dr_info_a->dr),
3700 DR_STEP (dr_info_b->dr))));
3701
3702 /* Skip the pair if inter-iteration dependencies are irrelevant
3703 and intra-iteration dependencies are guaranteed to be honored. */
3704 if (ignore_step_p
3705 && (preserves_scalar_order_p
3706 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3707 &lower_bound)))
3708 {
3709 if (dump_enabled_p ())
3710 dump_printf_loc (MSG_NOTE, vect_location,
3711 "no need for alias check between "
3712 "%T and %T when VF is 1\n",
3713 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3714 continue;
3715 }
3716
3717 /* See whether we can handle the alias using a bounds check on
3718 the step, and whether that's likely to be the best approach.
3719 (It might not be, for example, if the minimum step is much larger
3720 than the number of bytes handled by one vector iteration.) */
3721 if (!ignore_step_p
3722 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3723 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3724 &lower_bound)
3725 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3726 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3727 {
3728 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3729 if (dump_enabled_p ())
3730 {
3731 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3732 "%T and %T when the step %T is outside ",
3733 DR_REF (dr_info_a->dr),
3734 DR_REF (dr_info_b->dr),
3735 DR_STEP (dr_info_a->dr));
3736 if (unsigned_p)
3737 dump_printf (MSG_NOTE, "[0");
3738 else
3739 {
3740 dump_printf (MSG_NOTE, "(");
3741 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3742 }
3743 dump_printf (MSG_NOTE, ", ");
3744 dump_dec (MSG_NOTE, lower_bound);
3745 dump_printf (MSG_NOTE, ")\n");
3746 }
3747 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3748 unsigned_p, lower_bound);
3749 continue;
3750 }
3751
     /* For grouped accesses, do the alias analysis on the first element
	of the group, whose segment covers the whole group.  */
3752 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3753 if (dr_group_first_a)
3754 {
3755 stmt_info_a = dr_group_first_a;
3756 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3757 }
3758
3759 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3760 if (dr_group_first_b)
3761 {
3762 stmt_info_b = dr_group_first_b;
3763 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3764 }
3765
3766 if (ignore_step_p)
3767 {
     /* Steps are irrelevant, so each segment degenerates to the
	access itself.  */
3768 segment_length_a = size_zero_node;
3769 segment_length_b = size_zero_node;
3770 }
3771 else
3772 {
3773 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3774 DR_STEP (dr_info_b->dr), 0))
3775 length_factor = scalar_loop_iters;
3776 else
3777 length_factor = size_int (vect_factor);
3778 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3779 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3780 }
3781 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3782 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3783 align_a = vect_vfa_align (dr_info_a);
3784 align_b = vect_vfa_align (dr_info_b);
3785
3786 /* See whether the alias is known at compilation time. */
3787 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3788 DR_BASE_ADDRESS (dr_info_b->dr), 0)
3789 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3790 DR_OFFSET (dr_info_b->dr), 0)
3791 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3792 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3793 && poly_int_tree_p (segment_length_a)
3794 && poly_int_tree_p (segment_length_b))
3795 {
     /* res: 0 = known no alias, 1 = known alias, negative = unknown.  */
3796 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3797 segment_length_a,
3798 segment_length_b,
3799 access_size_a,
3800 access_size_b);
3801 if (res >= 0 && dump_enabled_p ())
3802 {
3803 dump_printf_loc (MSG_NOTE, vect_location,
3804 "can tell at compile time that %T and %T",
3805 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3806 if (res == 0)
3807 dump_printf (MSG_NOTE, " do not alias\n");
3808 else
3809 dump_printf (MSG_NOTE, " alias\n");
3810 }
3811
3812 if (res == 0)
3813 continue;
3814
3815 if (res == 1)
3816 return opt_result::failure_at (stmt_info_b->stmt,
3817 "not vectorized:"
3818 " compilation time alias: %G%G",
3819 stmt_info_a->stmt,
3820 stmt_info_b->stmt);
3821 }
3822
3823 /* dr_with_seg_len requires the alignment to apply to the segment length
3824 and access size, not just the start address. The access size can be
3825 smaller than the pointer alignment for grouped accesses and bitfield
3826 references; see PR115192 and PR116125 respectively. */
3827 align_a = std::min (align_a, least_bit_hwi (access_size_a));
3828 align_b = std::min (align_b, least_bit_hwi (access_size_b));
3829
3830 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3831 access_size_a, align_a);
3832 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3833 access_size_b, align_b);
3834 /* Canonicalize the order to be the one that's needed for accurate
3835 RAW, WAR and WAW flags, in cases where the data references are
3836 well-ordered. The order doesn't really matter otherwise,
3837 but we might as well be consistent. */
3838 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3839 std::swap (dr_a, dr_b);
3840
3841 dr_with_seg_len_pair_t dr_with_seg_len_pair
3842 (dr_a, dr_b, (preserves_scalar_order_p
3843 ? dr_with_seg_len_pair_t::WELL_ORDERED
3844 : dr_with_seg_len_pair_t::REORDERED));
3845
3846 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3847 }
3848
     /* Let the generic code merge and prune the collected checks.  */
3849 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3850
3851 unsigned int count = (comp_alias_ddrs.length ()
3852 + check_unequal_addrs.length ());
3853
     /* The very-cheap cost model refuses any runtime alias versioning.  */
3854 if (count
3855 && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
3856 == VECT_COST_MODEL_VERY_CHEAP))
3857 return opt_result::failure_at
3858 (vect_location, "would need a runtime alias check\n");
3859
3860 if (dump_enabled_p ())
3861 dump_printf_loc (MSG_NOTE, vect_location,
3862 "improved number of alias checks from %d to %d\n",
3863 may_alias_ddrs.length (), count);
     /* The cheap cost model tolerates only 60% of the usual limit.  */
3864 unsigned limit = param_vect_max_version_for_alias_checks;
3865 if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
3866 limit = param_vect_max_version_for_alias_checks * 6 / 10;
3867 if (count > limit)
3868 return opt_result::failure_at
3869 (vect_location,
3870 "number of versioning for alias run-time tests exceeds %d "
3871 "(--param vect-max-version-for-alias-checks)\n", limit);
3872
3873 return opt_result::success ();
3874 }
3875
3876 /* Check whether we can use an internal function for a gather load
3877 or scatter store. READ_P is true for loads and false for stores.
3878 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3879 the type of the memory elements being loaded or stored. OFFSET_TYPE
3880 is the type of the offset that is being applied to the invariant
3881 base address. SCALE is the amount by which the offset should
3882 be multiplied *after* it has been converted to address width.
3883
3884 Return true if the function is supported, storing the function id in
3885 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
3886
3887 bool
3888 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3889 tree vectype, tree memory_type, tree offset_type,
3890 int scale, internal_fn *ifn_out,
3891 tree *offset_vectype_out)
3892 {
3893 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3894 unsigned int element_bits = vector_element_bits (vectype);
3895 if (element_bits != memory_bits)
3896 /* For now the vector elements must be the same width as the
3897 memory elements. */
3898 return false;
3899
3900 /* Work out which function we need. */
     /* The masked variant can also implement an unmasked access, so it
	serves as a fallback; see the !masked_p test below.  */
3901 internal_fn ifn, alt_ifn;
3902 if (read_p)
3903 {
3904 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3905 alt_ifn = IFN_MASK_GATHER_LOAD;
3906 }
3907 else
3908 {
3909 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3910 alt_ifn = IFN_MASK_SCATTER_STORE;
3911 }
3912
     /* Try successively wider offset types until the target accepts one
	or widening can no longer help.  */
3913 for (;;)
3914 {
3915 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3916 if (!offset_vectype)
3917 return false;
3918
3919 /* Test whether the target supports this combination. */
3920 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3921 offset_vectype, scale))
3922 {
3923 *ifn_out = ifn;
3924 *offset_vectype_out = offset_vectype;
3925 return true;
3926 }
3927 else if (!masked_p
3928 && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
3929 memory_type,
3930 offset_vectype,
3931 scale))
3932 {
3933 *ifn_out = alt_ifn;
3934 *offset_vectype_out = offset_vectype;
3935 return true;
3936 }
3937
     /* Give up once the offset is at least pointer-wide and at least as
	wide as the vector elements; doubling further cannot help.  */
3938 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3939 && TYPE_PRECISION (offset_type) >= element_bits)
3940 return false;
3941
3942 offset_type = build_nonstandard_integer_type
3943 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3944 }
3945 }
3946
3947 /* STMT_INFO is a call to an internal gather load or scatter store function.
3948 Describe the operation in INFO. */
3949
3950 static void
3951 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3952 gather_scatter_info *info)
3953 {
3954 gcall *call = as_a <gcall *> (stmt_info->stmt);
3955 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3956 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3957
     /* Call arguments 0, 1 and 2 are the base address, the offset and
	the scale, respectively.  No target builtin is involved
	(info->decl stays null) and the offset vector type is left for
	later analysis.  */
3958 info->ifn = gimple_call_internal_fn (call);
3959 info->decl = NULL_TREE;
3960 info->base = gimple_call_arg (call, 0);
3961 info->offset = gimple_call_arg (call, 1);
3962 info->offset_dt = vect_unknown_def_type;
3963 info->offset_vectype = NULL_TREE;
3964 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3965 info->element_type = TREE_TYPE (vectype);
3966 info->memory_type = TREE_TYPE (DR_REF (dr));
3967 }
3968
3969 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3970 gather load or scatter store. Describe the operation in *INFO if so. */
3971
3972 bool
3973 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3974 gather_scatter_info *info)
3975 {
3976 HOST_WIDE_INT scale = 1;
3977 poly_int64 pbitpos, pbitsize;
3978 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3979 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3980 tree offtype = NULL_TREE;
3981 tree decl = NULL_TREE, base, off;
3982 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3983 tree memory_type = TREE_TYPE (DR_REF (dr));
3984 machine_mode pmode;
3985 int punsignedp, reversep, pvolatilep = 0;
3986 internal_fn ifn;
3987 tree offset_vectype;
3988 bool masked_p = false;
3989
3990 /* See whether this is already a call to a gather/scatter internal function.
3991 If not, see whether it's a masked load or store. */
3992 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3993 if (call && gimple_call_internal_p (call))
3994 {
3995 ifn = gimple_call_internal_fn (call);
3996 if (internal_gather_scatter_fn_p (ifn))
3997 {
3998 vect_describe_gather_scatter_call (stmt_info, info);
3999 return true;
4000 }
4001 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
4002 }
4003
4004 /* True if we should aim to use internal functions rather than
4005 built-in functions. */
4006 bool use_ifn_p = (DR_IS_READ (dr)
4007 ? supports_vec_gather_load_p (TYPE_MODE (vectype))
4008 : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
4009
4010 base = DR_REF (dr);
4011 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
4012 see if we can use the def stmt of the address. */
4013 if (masked_p
4014 && TREE_CODE (base) == MEM_REF
4015 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
4016 && integer_zerop (TREE_OPERAND (base, 1))
4017 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
4018 {
4019 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
4020 if (is_gimple_assign (def_stmt)
4021 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
4022 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4023 }
4024
4025 /* The gather and scatter builtins need address of the form
4026 loop_invariant + vector * {1, 2, 4, 8}
4027 or
4028 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4029 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4030 of loop invariants/SSA_NAMEs defined in the loop, with casts,
4031 multiplications and additions in it. To get a vector, we need
4032 a single SSA_NAME that will be defined in the loop and will
4033 contain everything that is not loop invariant and that can be
4034 vectorized. The following code attempts to find such a preexisting
4035 SSA_NAME OFF and put the loop invariants into a tree BASE
4036 that can be gimplified before the loop. */
4037 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4038 &punsignedp, &reversep, &pvolatilep);
4039 if (reversep)
4040 return false;
4041
4042 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4043
     /* Fold any constant MEM_REF offset into OFF and strip to the
	underlying address; otherwise take the address of BASE.  */
4044 if (TREE_CODE (base) == MEM_REF)
4045 {
4046 if (!integer_zerop (TREE_OPERAND (base, 1)))
4047 {
4048 if (off == NULL_TREE)
4049 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4050 else
4051 off = size_binop (PLUS_EXPR, off,
4052 fold_convert (sizetype, TREE_OPERAND (base, 1)));
4053 }
4054 base = TREE_OPERAND (base, 0);
4055 }
4056 else
4057 base = build_fold_addr_expr (base);
4058
4059 if (off == NULL_TREE)
4060 off = size_zero_node;
4061
4062 /* If base is not loop invariant, either off is 0, then we start with just
4063 the constant offset in the loop invariant BASE and continue with base
4064 as OFF, otherwise give up.
4065 We could handle that case by gimplifying the addition of base + off
4066 into some SSA_NAME and use that as off, but for now punt. */
4067 if (!expr_invariant_in_loop_p (loop, base))
4068 {
4069 if (!integer_zerop (off))
4070 return false;
4071 off = base;
4072 base = size_int (pbytepos);
4073 }
4074 /* Otherwise put base + constant offset into the loop invariant BASE
4075 and continue with OFF. */
4076 else
4077 {
4078 base = fold_convert (sizetype, base);
4079 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4080 }
4081
4082 /* OFF at this point may be either a SSA_NAME or some tree expression
4083 from get_inner_reference. Try to peel off loop invariants from it
4084 into BASE as long as possible. */
4085 STRIP_NOPS (off);
4086 while (offtype == NULL_TREE)
4087 {
4088 enum tree_code code;
4089 tree op0, op1, add = NULL_TREE;
4090
     /* Decompose OFF one step: either look through its defining
	assignment (for SSA names) or at the expression itself.  */
4091 if (TREE_CODE (off) == SSA_NAME)
4092 {
4093 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4094
4095 if (expr_invariant_in_loop_p (loop, off))
4096 return false;
4097
4098 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4099 break;
4100
4101 op0 = gimple_assign_rhs1 (def_stmt);
4102 code = gimple_assign_rhs_code (def_stmt);
4103 op1 = gimple_assign_rhs2 (def_stmt);
4104 }
4105 else
4106 {
4107 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4108 return false;
4109 code = TREE_CODE (off);
4110 extract_ops_from_tree (off, &code, &op0, &op1);
4111 }
4112 switch (code)
4113 {
4114 case POINTER_PLUS_EXPR:
4115 case PLUS_EXPR:
4116 if (expr_invariant_in_loop_p (loop, op0))
4117 {
4118 add = op0;
4119 off = op1;
4120 do_add:
     /* Fold the invariant addend into BASE, scaled by the scale
	already peeled off, and keep decomposing the other operand.  */
4121 add = fold_convert (sizetype, add);
4122 if (scale != 1)
4123 add = size_binop (MULT_EXPR, add, size_int (scale));
4124 base = size_binop (PLUS_EXPR, base, add);
4125 continue;
4126 }
4127 if (expr_invariant_in_loop_p (loop, op1))
4128 {
4129 add = op1;
4130 off = op0;
4131 goto do_add;
4132 }
4133 break;
4134 case MINUS_EXPR:
4135 if (expr_invariant_in_loop_p (loop, op1))
4136 {
4137 add = fold_convert (sizetype, op1);
4138 add = size_binop (MINUS_EXPR, size_zero_node, add);
4139 off = op0;
4140 goto do_add;
4141 }
4142 break;
4143 case MULT_EXPR:
4144 if (scale == 1 && tree_fits_shwi_p (op1))
4145 {
4146 int new_scale = tree_to_shwi (op1);
4147 /* Only treat this as a scaling operation if the target
4148 supports it for at least some offset type. */
4149 if (use_ifn_p
4150 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4151 masked_p, vectype, memory_type,
4152 signed_char_type_node,
4153 new_scale, &ifn,
4154 &offset_vectype)
4155 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4156 masked_p, vectype, memory_type,
4157 unsigned_char_type_node,
4158 new_scale, &ifn,
4159 &offset_vectype))
4160 break;
4161 scale = new_scale;
4162 off = op0;
4163 continue;
4164 }
4165 break;
4166 case SSA_NAME:
4167 off = op0;
4168 continue;
4169 CASE_CONVERT:
4170 if (!POINTER_TYPE_P (TREE_TYPE (op0))
4171 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4172 break;
4173
4174 /* Don't include the conversion if the target is happy with
4175 the current offset type. */
4176 if (use_ifn_p
4177 && !POINTER_TYPE_P (TREE_TYPE (off))
4178 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4179 masked_p, vectype, memory_type,
4180 TREE_TYPE (off), scale, &ifn,
4181 &offset_vectype))
4182 break;
4183
     /* A conversion that doesn't change the precision is a no-op
	for our purposes; look through it.  */
4184 if (TYPE_PRECISION (TREE_TYPE (op0))
4185 == TYPE_PRECISION (TREE_TYPE (off)))
4186 {
4187 off = op0;
4188 continue;
4189 }
4190
4191 /* Include the conversion if it is widening and we're using
4192 the IFN path or the target can handle the converted from
4193 offset or the current size is not already the same as the
4194 data vector element size. */
4195 if ((TYPE_PRECISION (TREE_TYPE (op0))
4196 < TYPE_PRECISION (TREE_TYPE (off)))
4197 && (use_ifn_p
4198 || (DR_IS_READ (dr)
4199 ? (targetm.vectorize.builtin_gather
4200 && targetm.vectorize.builtin_gather (vectype,
4201 TREE_TYPE (op0),
4202 scale))
4203 : (targetm.vectorize.builtin_scatter
4204 && targetm.vectorize.builtin_scatter (vectype,
4205 TREE_TYPE (op0),
4206 scale)))
4207 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4208 TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4209 {
4210 off = op0;
4211 offtype = TREE_TYPE (off);
4212 STRIP_NOPS (off);
4213 continue;
4214 }
4215 break;
4216 default:
4217 break;
4218 }
4219 break;
4220 }
4221
4222 /* If at the end OFF still isn't a SSA_NAME or isn't
4223 defined in the loop, punt. */
4224 if (TREE_CODE (off) != SSA_NAME
4225 || expr_invariant_in_loop_p (loop, off))
4226 return false;
4227
4228 if (offtype == NULL_TREE)
4229 offtype = TREE_TYPE (off);
4230
     /* Resolve either an internal function or a target builtin for the
	access; IFN_LAST with a null DECL means "no supported form",
	which the caller can still inspect via INFO.  */
4231 if (use_ifn_p)
4232 {
4233 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4234 vectype, memory_type, offtype, scale,
4235 &ifn, &offset_vectype))
4236 ifn = IFN_LAST;
4237 decl = NULL_TREE;
4238 }
4239 else
4240 {
4241 if (DR_IS_READ (dr))
4242 {
4243 if (targetm.vectorize.builtin_gather)
4244 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4245 }
4246 else
4247 {
4248 if (targetm.vectorize.builtin_scatter)
4249 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4250 }
4251 ifn = IFN_LAST;
4252 /* The offset vector type will be read from DECL when needed. */
4253 offset_vectype = NULL_TREE;
4254 }
4255
4256 info->ifn = ifn;
4257 info->decl = decl;
4258 info->base = base;
4259 info->offset = off;
4260 info->offset_dt = vect_unknown_def_type;
4261 info->offset_vectype = offset_vectype;
4262 info->scale = scale;
4263 info->element_type = TREE_TYPE (vectype);
4264 info->memory_type = memory_type;
4265 return true;
4266 }
4267
4268 /* Find the data references in STMT, analyze them with respect to LOOP and
4269 append them to DATAREFS. Return false if datarefs in this stmt cannot
4270 be handled. */
4271
4272 opt_result
4273 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4274 vec<data_reference_p> *datarefs,
4275 vec<int> *dataref_groups, int group_id)
4276 {
4277 /* We can ignore clobbers for dataref analysis - they are removed during
4278 loop vectorization and BB vectorization checks dependences with a
4279 stmt walk. */
4280 if (gimple_clobber_p (stmt))
4281 return opt_result::success ();
4282
4283 if (gimple_has_volatile_ops (stmt))
4284 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4285 stmt);
4286
4287 if (stmt_can_throw_internal (cfun, stmt))
4288 return opt_result::failure_at (stmt,
4289 "not vectorized:"
4290 " statement can throw an exception: %G",
4291 stmt);
4292
4293 auto_vec<data_reference_p, 2> refs;
4294 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4295 if (!res)
4296 return res;
4297
4298 if (refs.is_empty ())
4299 return opt_result::success ();
4300
     /* Only a single data reference per statement is supported; free the
	collected refs before bailing out.  */
4301 if (refs.length () > 1)
4302 {
4303 while (!refs.is_empty ())
4304 free_data_ref (refs.pop ());
4305 return opt_result::failure_at (stmt,
4306 "not vectorized: more than one "
4307 "data ref in stmt: %G", stmt);
4308 }
4309
     /* The only calls allowed to carry a data reference are masked
	loads and stores.  */
4310 data_reference_p dr = refs.pop ();
4311 if (gcall *call = dyn_cast <gcall *> (stmt))
4312 if (!gimple_call_internal_p (call)
4313 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4314 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4315 {
4316 free_data_ref (dr);
4317 return opt_result::failure_at (stmt,
4318 "not vectorized: dr in a call %G", stmt);
4319 }
4320
4321 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4322 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4323 {
4324 free_data_ref (dr);
4325 return opt_result::failure_at (stmt,
4326 "not vectorized:"
4327 " statement is bitfield access %G", stmt);
4328 }
4329
4330 if (DR_BASE_ADDRESS (dr)
4331 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4332 {
4333 free_data_ref (dr);
4334 return opt_result::failure_at (stmt,
4335 "not vectorized:"
4336 " base addr of dr is a constant\n");
4337 }
4338
4339 /* Check whether this may be a SIMD lane access and adjust the
4340 DR to make it easier for us to handle it. */
4341 if (loop
4342 && loop->simduid
4343 && (!DR_BASE_ADDRESS (dr)
4344 || !DR_OFFSET (dr)
4345 || !DR_INIT (dr)
4346 || !DR_STEP (dr)))
4347 {
     /* Re-analyze the reference relative to the innermost containing
	loop; a SIMD lane access looks fully analyzable there with a
	zero step.  */
4348 struct data_reference *newdr
4349 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4350 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4351 if (DR_BASE_ADDRESS (newdr)
4352 && DR_OFFSET (newdr)
4353 && DR_INIT (newdr)
4354 && DR_STEP (newdr)
4355 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4356 && integer_zerop (DR_STEP (newdr)))
4357 {
4358 tree base_address = DR_BASE_ADDRESS (newdr);
4359 tree off = DR_OFFSET (newdr);
     /* Default element step; refined from a MULT_EXPR factor below.  */
4360 tree step = ssize_int (1);
4361 if (integer_zerop (off)
4362 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4363 {
4364 off = TREE_OPERAND (base_address, 1);
4365 base_address = TREE_OPERAND (base_address, 0);
4366 }
4367 STRIP_NOPS (off);
4368 if (TREE_CODE (off) == MULT_EXPR
4369 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4370 {
4371 step = TREE_OPERAND (off, 1);
4372 off = TREE_OPERAND (off, 0);
4373 STRIP_NOPS (off);
4374 }
4375 if (CONVERT_EXPR_P (off)
4376 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4377 < TYPE_PRECISION (TREE_TYPE (off))))
4378 off = TREE_OPERAND (off, 0);
4379 if (TREE_CODE (off) == SSA_NAME)
4380 {
4381 gimple *def = SSA_NAME_DEF_STMT (off);
4382 /* Look through widening conversion. */
4383 if (is_gimple_assign (def)
4384 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4385 {
4386 tree rhs1 = gimple_assign_rhs1 (def);
4387 if (TREE_CODE (rhs1) == SSA_NAME
4388 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4389 && (TYPE_PRECISION (TREE_TYPE (off))
4390 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4391 def = SSA_NAME_DEF_STMT (rhs1);
4392 }
     /* The offset must ultimately come from a GOMP_SIMD_LANE call
	belonging to this loop's simduid.  */
4393 if (is_gimple_call (def)
4394 && gimple_call_internal_p (def)
4395 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4396 {
4397 tree arg = gimple_call_arg (def, 0);
4398 tree reft = TREE_TYPE (DR_REF (newdr));
4399 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4400 arg = SSA_NAME_VAR (arg);
4401 if (arg == loop->simduid
4402 /* For now. */
4403 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4404 {
4405 DR_BASE_ADDRESS (newdr) = base_address;
4406 DR_OFFSET (newdr) = ssize_int (0);
4407 DR_STEP (newdr) = step;
4408 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4409 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4410 /* Mark as simd-lane access. */
4411 tree arg2 = gimple_call_arg (def, 1);
4412 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4413 free_data_ref (dr);
4414 datarefs->safe_push (newdr);
4415 if (dataref_groups)
4416 dataref_groups->safe_push (group_id);
4417 return opt_result::success ();
4418 }
4419 }
4420 }
4421 }
     /* Not a SIMD lane access after all; discard the re-analysis.  */
4422 free_data_ref (newdr);
4423 }
4424
4425 datarefs->safe_push (dr);
4426 if (dataref_groups)
4427 dataref_groups->safe_push (group_id);
4428 return opt_result::success ();
4429 }
4430
4431 /* Function vect_analyze_data_refs.
4432
4433 Find all the data references in the loop or basic block.
4434
4435 The general structure of the analysis of data refs in the vectorizer is as
4436 follows:
4437 1- vect_analyze_data_refs(loop/bb): call
4438 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4439 in the loop/bb and their dependences.
4440 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4441 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4442 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4443
4444 */
4445
4446 opt_result
4447 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4448 {
4449 class loop *loop = NULL;
4450 unsigned int i;
4451 struct data_reference *dr;
4452 tree scalar_type;
4453
4454 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4455
4456 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4457 loop = LOOP_VINFO_LOOP (loop_vinfo);
4458
4459 /* Go through the data-refs, check that the analysis succeeded. Update
4460 pointer from stmt_vec_info struct to DR and vectype. */
4461
4462 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4463 FOR_EACH_VEC_ELT (datarefs, i, dr)
4464 {
4465 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4466 poly_uint64 vf;
4467
4468 gcc_assert (DR_REF (dr));
4469 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4470 gcc_assert (!stmt_info->dr_aux.dr);
4471 stmt_info->dr_aux.dr = dr;
4472 stmt_info->dr_aux.stmt = stmt_info;
4473
4474 /* Check that analysis of the data-ref succeeded. */
4475 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4476 || !DR_STEP (dr))
4477 {
4478 bool maybe_gather
4479 = DR_IS_READ (dr)
4480 && !TREE_THIS_VOLATILE (DR_REF (dr));
4481 bool maybe_scatter
4482 = DR_IS_WRITE (dr)
4483 && !TREE_THIS_VOLATILE (DR_REF (dr))
4484 && (targetm.vectorize.builtin_scatter != NULL
4485 || supports_vec_scatter_store_p ());
4486
4487 /* If target supports vector gather loads or scatter stores,
4488 see if they can't be used. */
4489 if (is_a <loop_vec_info> (vinfo)
4490 && !nested_in_vect_loop_p (loop, stmt_info))
4491 {
4492 if (maybe_gather || maybe_scatter)
4493 {
4494 if (maybe_gather)
4495 gatherscatter = GATHER;
4496 else
4497 gatherscatter = SCATTER;
4498 }
4499 }
4500
4501 if (gatherscatter == SG_NONE)
4502 {
4503 if (dump_enabled_p ())
4504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4505 "not vectorized: data ref analysis "
4506 "failed %G", stmt_info->stmt);
4507 if (is_a <bb_vec_info> (vinfo))
4508 {
4509 /* In BB vectorization the ref can still participate
4510 in dependence analysis, we just can't vectorize it. */
4511 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4512 continue;
4513 }
4514 return opt_result::failure_at (stmt_info->stmt,
4515 "not vectorized:"
4516 " data ref analysis failed: %G",
4517 stmt_info->stmt);
4518 }
4519 }
4520
4521 /* See if this was detected as SIMD lane access. */
4522 if (dr->aux == (void *)-1
4523 || dr->aux == (void *)-2
4524 || dr->aux == (void *)-3
4525 || dr->aux == (void *)-4)
4526 {
4527 if (nested_in_vect_loop_p (loop, stmt_info))
4528 return opt_result::failure_at (stmt_info->stmt,
4529 "not vectorized:"
4530 " data ref analysis failed: %G",
4531 stmt_info->stmt);
4532 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4533 = -(uintptr_t) dr->aux;
4534 }
4535
4536 tree base = get_base_address (DR_REF (dr));
4537 if (base && VAR_P (base) && DECL_NONALIASED (base))
4538 {
4539 if (dump_enabled_p ())
4540 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4541 "not vectorized: base object not addressable "
4542 "for stmt: %G", stmt_info->stmt);
4543 if (is_a <bb_vec_info> (vinfo))
4544 {
4545 /* In BB vectorization the ref can still participate
4546 in dependence analysis, we just can't vectorize it. */
4547 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4548 continue;
4549 }
4550 return opt_result::failure_at (stmt_info->stmt,
4551 "not vectorized: base object not"
4552 " addressable for stmt: %G",
4553 stmt_info->stmt);
4554 }
4555
4556 if (is_a <loop_vec_info> (vinfo)
4557 && DR_STEP (dr)
4558 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4559 {
4560 if (nested_in_vect_loop_p (loop, stmt_info))
4561 return opt_result::failure_at (stmt_info->stmt,
4562 "not vectorized: "
4563 "not suitable for strided load %G",
4564 stmt_info->stmt);
4565 STMT_VINFO_STRIDED_P (stmt_info) = true;
4566 }
4567
4568 /* Update DR field in stmt_vec_info struct. */
4569
4570 /* If the dataref is in an inner-loop of the loop that is considered for
4571 for vectorization, we also want to analyze the access relative to
4572 the outer-loop (DR contains information only relative to the
4573 inner-most enclosing loop). We do that by building a reference to the
4574 first location accessed by the inner-loop, and analyze it relative to
4575 the outer-loop. */
4576 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4577 {
4578 /* Build a reference to the first location accessed by the
4579 inner loop: *(BASE + INIT + OFFSET). By construction,
4580 this address must be invariant in the inner loop, so we
4581 can consider it as being used in the outer loop. */
4582 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4583 tree offset = unshare_expr (DR_OFFSET (dr));
4584 tree init = unshare_expr (DR_INIT (dr));
4585 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4586 init, offset);
4587 tree init_addr = fold_build_pointer_plus (base, init_offset);
4588 tree init_ref = build_fold_indirect_ref (init_addr);
4589
4590 if (dump_enabled_p ())
4591 dump_printf_loc (MSG_NOTE, vect_location,
4592 "analyze in outer loop: %T\n", init_ref);
4593
4594 opt_result res
4595 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4596 init_ref, loop, stmt_info->stmt);
4597 if (!res)
4598 /* dr_analyze_innermost already explained the failure. */
4599 return res;
4600
4601 if (dump_enabled_p ())
4602 dump_printf_loc (MSG_NOTE, vect_location,
4603 "\touter base_address: %T\n"
4604 "\touter offset from base address: %T\n"
4605 "\touter constant offset from base address: %T\n"
4606 "\touter step: %T\n"
4607 "\touter base alignment: %d\n\n"
4608 "\touter base misalignment: %d\n"
4609 "\touter offset alignment: %d\n"
4610 "\touter step alignment: %d\n",
4611 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4612 STMT_VINFO_DR_OFFSET (stmt_info),
4613 STMT_VINFO_DR_INIT (stmt_info),
4614 STMT_VINFO_DR_STEP (stmt_info),
4615 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4616 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4617 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4618 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4619 }
4620
4621 /* Set vectype for STMT. */
4622 scalar_type = TREE_TYPE (DR_REF (dr));
4623 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4624 if (!vectype)
4625 {
4626 if (dump_enabled_p ())
4627 {
4628 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4629 "not vectorized: no vectype for stmt: %G",
4630 stmt_info->stmt);
4631 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4632 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4633 scalar_type);
4634 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4635 }
4636
4637 if (is_a <bb_vec_info> (vinfo))
4638 {
4639 /* No vector type is fine, the ref can still participate
4640 in dependence analysis, we just can't vectorize it. */
4641 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4642 continue;
4643 }
4644 if (fatal)
4645 *fatal = false;
4646 return opt_result::failure_at (stmt_info->stmt,
4647 "not vectorized:"
4648 " no vectype for stmt: %G"
4649 " scalar_type: %T\n",
4650 stmt_info->stmt, scalar_type);
4651 }
4652 else
4653 {
4654 if (dump_enabled_p ())
4655 dump_printf_loc (MSG_NOTE, vect_location,
4656 "got vectype for stmt: %G%T\n",
4657 stmt_info->stmt, vectype);
4658 }
4659
4660 /* Adjust the minimal vectorization factor according to the
4661 vector type. */
4662 vf = TYPE_VECTOR_SUBPARTS (vectype);
4663 *min_vf = upper_bound (*min_vf, vf);
4664
4665 /* Leave the BB vectorizer to pick the vector type later, based on
4666 the final dataref group size and SLP node size. */
4667 if (is_a <loop_vec_info> (vinfo))
4668 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4669
4670 if (gatherscatter != SG_NONE)
4671 {
4672 gather_scatter_info gs_info;
4673 if (!vect_check_gather_scatter (stmt_info,
4674 as_a <loop_vec_info> (vinfo),
4675 &gs_info)
4676 || !get_vectype_for_scalar_type (vinfo,
4677 TREE_TYPE (gs_info.offset)))
4678 {
4679 if (fatal)
4680 *fatal = false;
4681 return opt_result::failure_at
4682 (stmt_info->stmt,
4683 (gatherscatter == GATHER)
4684 ? "not vectorized: not suitable for gather load %G"
4685 : "not vectorized: not suitable for scatter store %G",
4686 stmt_info->stmt);
4687 }
4688 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4689 }
4690 }
4691
4692 /* We used to stop processing and prune the list here. Verify we no
4693 longer need to. */
4694 gcc_assert (i == datarefs.length ());
4695
4696 return opt_result::success ();
4697 }
4698
4699
4700 /* Function vect_get_new_vect_var.
4701
4702 Returns a name for a new variable. The current naming scheme appends the
4703 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4704 the name of vectorizer generated variables, and appends that to NAME if
4705 provided. */
4706
4707 tree
4708 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4709 {
4710 const char *prefix;
4711 tree new_vect_var;
4712
4713 switch (var_kind)
4714 {
4715 case vect_simple_var:
4716 prefix = "vect";
4717 break;
4718 case vect_scalar_var:
4719 prefix = "stmp";
4720 break;
4721 case vect_mask_var:
4722 prefix = "mask";
4723 break;
4724 case vect_pointer_var:
4725 prefix = "vectp";
4726 break;
4727 default:
4728 gcc_unreachable ();
4729 }
4730
4731 if (name)
4732 {
4733 char* tmp = concat (prefix, "_", name, NULL);
4734 new_vect_var = create_tmp_reg (type, tmp);
4735 free (tmp);
4736 }
4737 else
4738 new_vect_var = create_tmp_reg (type, prefix);
4739
4740 return new_vect_var;
4741 }
4742
4743 /* Like vect_get_new_vect_var but return an SSA name. */
4744
4745 tree
4746 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4747 {
4748 const char *prefix;
4749 tree new_vect_var;
4750
4751 switch (var_kind)
4752 {
4753 case vect_simple_var:
4754 prefix = "vect";
4755 break;
4756 case vect_scalar_var:
4757 prefix = "stmp";
4758 break;
4759 case vect_pointer_var:
4760 prefix = "vectp";
4761 break;
4762 default:
4763 gcc_unreachable ();
4764 }
4765
4766 if (name)
4767 {
4768 char* tmp = concat (prefix, "_", name, NULL);
4769 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4770 free (tmp);
4771 }
4772 else
4773 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4774
4775 return new_vect_var;
4776 }
4777
4778 /* Duplicate points-to info on NAME from DR_INFO. */
4779
4780 static void
4781 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4782 {
4783 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4784 /* DR_PTR_INFO is for a base SSA name, not including constant or
4785 variable offsets in the ref so its alignment info does not apply. */
4786 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4787 }
4788
/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT_INFO: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is to be added to the initial address.
   LOOP:    Specify relative to which loop-nest should the address be computed.
            For example, when the dataref is in an inner-loop nested in an
	    outer-loop that is now being vectorized, LOOP can be either the
	    outer-loop, or the inner-loop.  The first memory location accessed
	    by the following dataref ('in' points to short):

		for (i=0; i<N; i++)
		   for (j=0; j<M; j++)
		     s += in[i+j]

	    is as follows:
	    if LOOP=i_loop:	&in		(relative to i_loop)
	    if LOOP=j_loop: 	&in+i*2B	(relative to j_loop)

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */

tree
vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
				      gimple_seq *new_stmt_list,
				      tree offset)
{
  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
  struct data_reference *dr = dr_info->dr;
  const char *base_name;
  tree addr_base;
  tree dest;
  gimple_seq seq = NULL;
  tree vect_ptr_type;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);

  tree data_ref_base = unshare_expr (drb->base_address);
  tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
  tree init = unshare_expr (drb->init);

  if (loop_vinfo)
    base_name = get_name (data_ref_base);
  else
    {
      /* For basic-block vectorization the address is taken directly from
	 the original reference below, so the offset components are not
	 needed and are zeroed here.  */
      base_offset = ssize_int (0);
      init = ssize_int (0);
      base_name = get_name (DR_REF (dr));
    }

  /* Create base_offset = BASE_OFFSET + INIT, computed in sizetype.  */
  base_offset = size_binop (PLUS_EXPR,
			    fold_convert (sizetype, base_offset),
			    fold_convert (sizetype, init));

  /* Add the extra OFFSET requested by the caller, if any.  */
  if (offset)
    {
      offset = fold_convert (sizetype, offset);
      base_offset = fold_build2 (PLUS_EXPR, sizetype,
				 base_offset, offset);
    }

  /* base + base_offset */
  if (loop_vinfo)
    addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
  else
    addr_base = build1 (ADDR_EXPR,
			build_pointer_type (TREE_TYPE (DR_REF (dr))),
			/* Strip zero offset components since we don't need
			   them and they can confuse late diagnostics if
			   we CSE them wrongly.  See PR106904 for example.  */
			unshare_expr (strip_zero_offset_components
				        (DR_REF (dr))));

  /* Gimplify the address into a fresh pointer variable DEST, collecting
     any defining statements in SEQ and appending them to NEW_STMT_LIST.  */
  vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
  dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
  addr_base = force_gimple_operand (addr_base, &seq, true, dest);
  gimple_seq_add_seq (new_stmt_list, seq);

  if (DR_PTR_INFO (dr)
      && TREE_CODE (addr_base) == SSA_NAME
      /* We should only duplicate pointer info to newly created SSA names.  */
      && SSA_NAME_VAR (addr_base) == dest)
    {
      gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
      vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);

  return addr_base;
}
4891
4892
/* Function vect_create_data_ref_ptr.

   Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
   location accessed in the loop by STMT_INFO, along with the def-use update
   chain to appropriately advance the pointer through the loop iterations.
   Also set aliasing information for the pointer.  This pointer is used by
   the callers to this function to create a memory reference expression for
   vector load/store access.

   Input:
   1. STMT_INFO: a stmt that references memory. Expected to be of the form
         GIMPLE_ASSIGN <name, data-ref> or
	 GIMPLE_ASSIGN <data-ref, name>.
   2. AGGR_TYPE: the type of the reference, which should be either a vector
        or an array.
   3. AT_LOOP: the loop where the vector memref is to be created.
   4. OFFSET (optional): a byte offset to be added to the initial address
	accessed by the data-ref in STMT_INFO.
   5. BSI: location where the new stmts are to be placed if there is no loop
   6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
        pointing to the initial address.
   7. IV_STEP (optional, defaults to NULL): the amount that should be added
	to the IV during each iteration of the loop.  NULL says to move
	by one copy of AGGR_TYPE up or down, depending on the step of the
	data reference.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial addressed accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

      v8hi *ap;
      ap = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init] + OFFSET;

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Return the pointer.  */

tree
vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
			  tree aggr_type, class loop *at_loop, tree offset,
			  tree *initial_address, gimple_stmt_iterator *gsi,
			  gimple **ptr_incr, bool only_init,
			  tree iv_step)
{
  const char *base_name;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  class loop *loop = NULL;
  bool nested_in_vect_loop = false;
  class loop *containing_loop = NULL;
  tree aggr_ptr_type;
  tree aggr_ptr;
  tree new_temp;
  gimple_seq new_stmt_list = NULL;
  edge pe = NULL;
  basic_block new_bb;
  tree aggr_ptr_init;
  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
  struct data_reference *dr = dr_info->dr;
  tree aptr;
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  tree indx_before_incr, indx_after_incr;
  gimple *incr;
  bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);

  /* When IV_STEP is not supplied the step is derived from the size of
     AGGR_TYPE below, so AGGR_TYPE must then have a usable size.  */
  gcc_assert (iv_step != NULL_TREE
	      || TREE_CODE (aggr_type) == ARRAY_TYPE
	      || TREE_CODE (aggr_type) == VECTOR_TYPE);

  if (loop_vinfo)
    {
      loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
      containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
      pe = loop_preheader_edge (loop);
    }
  else
    {
      /* Basic-block vectorization: there is no loop to advance the
	 pointer in, so only the initial value is created.  */
      gcc_assert (bb_vinfo);
      only_init = true;
      *ptr_incr = NULL;
    }

  /* Create an expression for the first address accessed by this load
     in LOOP.  */
  base_name = get_name (DR_BASE_ADDRESS (dr));

  if (dump_enabled_p ())
    {
      tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
      dump_printf_loc (MSG_NOTE, vect_location,
		       "create %s-pointer variable to type: %T",
		       get_tree_code_name (TREE_CODE (aggr_type)),
		       aggr_type);
      if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
	dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
      else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
	dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
      else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
	dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
      else
	dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
      dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
    }

  /* (1) Create the new aggregate-pointer variable.
     Vector and array types inherit the alias set of their component
     type by default so we need to use a ref-all pointer if the data
     reference does not conflict with the created aggregated data
     reference because it is not addressable.  */
  bool need_ref_all = false;
  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
			      get_alias_set (DR_REF (dr))))
    need_ref_all = true;
  /* Likewise for any of the data references in the stmt group.  */
  else if (DR_GROUP_SIZE (stmt_info) > 1)
    {
      stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
      do
	{
	  struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
	  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
				      get_alias_set (DR_REF (sdr))))
	    {
	      need_ref_all = true;
	      break;
	    }
	  sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
	}
      while (sinfo);
    }
  aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
					       need_ref_all);
  aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);


  /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
     vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
     def-use update cycles for the pointer: one relative to the outer-loop
     (LOOP), which is what steps (3) and (4) below do.  The other is relative
     to the inner-loop (which is the inner-most loop containing the dataref),
     and this is done be step (5) below.

     When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
     inner-most loop, and so steps (3),(4) work the same, and step (5) is
     redundant.  Steps (3),(4) create the following:

	vp0 = &base_addr;
	LOOP:	vp1 = phi(vp0,vp2)
		...
		...
		vp2 = vp1 + step
		goto LOOP

     If there is an inner-loop nested in loop, then step (5) will also be
     applied, and an additional update in the inner-loop will be created:

	vp0 = &base_addr;
	LOOP:   vp1 = phi(vp0,vp2)
		...
        inner:     vp3 = phi(vp1,vp4)
	           vp4 = vp3 + inner_step
	           if () goto inner
		...
		vp2 = vp1 + step
		if () goto LOOP   */

  /* (2) Calculate the initial address of the aggregate-pointer, and set
     the aggregate-pointer to point to it before the loop.  */

  /* Create: (&(base[init_val]+offset) in the loop preheader.  */

  new_temp = vect_create_addr_base_for_vector_ref (vinfo,
						   stmt_info, &new_stmt_list,
						   offset);
  if (new_stmt_list)
    {
      if (pe)
	{
	  /* Insert on the preheader edge; this must not require
	     splitting the edge into a new basic block.  */
	  new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
	  gcc_assert (!new_bb);
	}
      else
	gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
    }

  *initial_address = new_temp;
  aggr_ptr_init = new_temp;

  /* (3) Handle the updating of the aggregate-pointer inside the loop.
     This is needed when ONLY_INIT is false, and also when AT_LOOP is the
     inner-loop nested in LOOP (during outer-loop vectorization).  */

  /* No update in loop is required.  */
  if (only_init && (!loop_vinfo || at_loop == loop))
    aptr = aggr_ptr_init;
  else
    {
      /* Accesses to invariant addresses should be handled specially
	 by the caller.  */
      tree step = vect_dr_behavior (vinfo, dr_info)->step;
      gcc_assert (!integer_zerop (step));

      if (iv_step == NULL_TREE)
	{
	  /* The step of the aggregate pointer is the type size,
	     negated for downward accesses.  */
	  iv_step = TYPE_SIZE_UNIT (aggr_type);
	  if (tree_int_cst_sgn (step) == -1)
	    iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
	}

      standard_iv_increment_position (loop, &incr_gsi, &insert_after);

      create_iv (aggr_ptr_init,
		 fold_convert (aggr_ptr_type, iv_step),
		 aggr_ptr, loop, &incr_gsi, insert_after,
		 &indx_before_incr, &indx_after_incr);
      incr = gsi_stmt (incr_gsi);

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
	{
	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
	}
      if (ptr_incr)
	*ptr_incr = incr;

      aptr = indx_before_incr;
    }

  if (!nested_in_vect_loop || only_init)
    return aptr;


  /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
     nested in LOOP, if exists.  */

  /* Given the early return above, we only get here with !ONLY_INIT in a
     nested loop, so the else arm below cannot be reached.  */
  gcc_assert (nested_in_vect_loop);
  if (!only_init)
    {
      standard_iv_increment_position (containing_loop, &incr_gsi,
				      &insert_after);
      create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
		 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
		 &indx_after_incr);
      incr = gsi_stmt (incr_gsi);

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
	{
	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
	}
      if (ptr_incr)
	*ptr_incr = incr;

      return indx_before_incr;
    }
  else
    gcc_unreachable ();
}
5169
5170
5171 /* Function bump_vector_ptr
5172
5173 Increment a pointer (to a vector type) by vector-size. If requested,
5174 i.e. if PTR-INCR is given, then also connect the new increment stmt
5175 to the existing def-use update-chain of the pointer, by modifying
5176 the PTR_INCR as illustrated below:
5177
5178 The pointer def-use update-chain before this function:
5179 DATAREF_PTR = phi (p_0, p_2)
5180 ....
5181 PTR_INCR: p_2 = DATAREF_PTR + step
5182
5183 The pointer def-use update-chain after this function:
5184 DATAREF_PTR = phi (p_0, p_2)
5185 ....
5186 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5187 ....
5188 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5189
5190 Input:
5191 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5192 in the loop.
5193 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5194 the loop. The increment amount across iterations is expected
5195 to be vector_size.
5196 BSI - location where the new update stmt is to be placed.
5197 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5198 BUMP - optional. The offset by which to bump the pointer. If not given,
5199 the offset is assumed to be vector_size.
5200
5201 Output: Return NEW_DATAREF_PTR as illustrated above.
5202
5203 */
5204
5205 tree
5206 bump_vector_ptr (vec_info *vinfo,
5207 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5208 stmt_vec_info stmt_info, tree bump)
5209 {
5210 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5211 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5212 tree update = TYPE_SIZE_UNIT (vectype);
5213 gimple *incr_stmt;
5214 ssa_op_iter iter;
5215 use_operand_p use_p;
5216 tree new_dataref_ptr;
5217
5218 if (bump)
5219 update = bump;
5220
5221 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5222 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5223 else
5224 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5225 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5226 dataref_ptr, update);
5227 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5228 /* Fold the increment, avoiding excessive chains use-def chains of
5229 those, leading to compile-time issues for passes until the next
5230 forwprop pass which would do this as well. */
5231 gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5232 if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5233 {
5234 incr_stmt = gsi_stmt (fold_gsi);
5235 update_stmt (incr_stmt);
5236 }
5237
5238 /* Copy the points-to information if it exists. */
5239 if (DR_PTR_INFO (dr))
5240 {
5241 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5242 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5243 }
5244
5245 if (!ptr_incr)
5246 return new_dataref_ptr;
5247
5248 /* Update the vector-pointer's cross-iteration increment. */
5249 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5250 {
5251 tree use = USE_FROM_PTR (use_p);
5252
5253 if (use == dataref_ptr)
5254 SET_USE (use_p, new_dataref_ptr);
5255 else
5256 gcc_assert (operand_equal_p (use, update, 0));
5257 }
5258
5259 return new_dataref_ptr;
5260 }
5261
5262
5263 /* Copy memory reference info such as base/clique from the SRC reference
5264 to the DEST MEM_REF. */
5265
5266 void
5267 vect_copy_ref_info (tree dest, tree src)
5268 {
5269 if (TREE_CODE (dest) != MEM_REF)
5270 return;
5271
5272 tree src_base = src;
5273 while (handled_component_p (src_base))
5274 src_base = TREE_OPERAND (src_base, 0);
5275 if (TREE_CODE (src_base) != MEM_REF
5276 && TREE_CODE (src_base) != TARGET_MEM_REF)
5277 return;
5278
5279 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5280 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5281 }
5282
5283
5284 /* Function vect_create_destination_var.
5285
5286 Create a new temporary of type VECTYPE. */
5287
5288 tree
5289 vect_create_destination_var (tree scalar_dest, tree vectype)
5290 {
5291 tree vec_dest;
5292 const char *name;
5293 char *new_name;
5294 tree type;
5295 enum vect_var_kind kind;
5296
5297 kind = vectype
5298 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5299 ? vect_mask_var
5300 : vect_simple_var
5301 : vect_scalar_var;
5302 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5303
5304 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5305
5306 name = get_name (scalar_dest);
5307 if (name)
5308 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5309 else
5310 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5311 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5312 free (new_name);
5313
5314 return vec_dest;
5315 }
5316
5317 /* Function vect_grouped_store_supported.
5318
5319 Returns TRUE if interleave high and interleave low permutations
5320 are supported, and FALSE otherwise. */
5321
5322 bool
5323 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5324 {
5325 machine_mode mode = TYPE_MODE (vectype);
5326
5327 /* vect_permute_store_chain requires the group size to be equal to 3 or
5328 be a power of two. */
5329 if (count != 3 && exact_log2 (count) == -1)
5330 {
5331 if (dump_enabled_p ())
5332 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5333 "the size of the group of accesses"
5334 " is not a power of 2 or not eqaul to 3\n");
5335 return false;
5336 }
5337
5338 /* Check that the permutation is supported. */
5339 if (VECTOR_MODE_P (mode))
5340 {
5341 unsigned int i;
5342 if (count == 3)
5343 {
5344 unsigned int j0 = 0, j1 = 0, j2 = 0;
5345 unsigned int i, j;
5346
5347 unsigned int nelt;
5348 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5349 {
5350 if (dump_enabled_p ())
5351 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5352 "cannot handle groups of 3 stores for"
5353 " variable-length vectors\n");
5354 return false;
5355 }
5356
5357 vec_perm_builder sel (nelt, nelt, 1);
5358 sel.quick_grow (nelt);
5359 vec_perm_indices indices;
5360 for (j = 0; j < 3; j++)
5361 {
5362 int nelt0 = ((3 - j) * nelt) % 3;
5363 int nelt1 = ((3 - j) * nelt + 1) % 3;
5364 int nelt2 = ((3 - j) * nelt + 2) % 3;
5365 for (i = 0; i < nelt; i++)
5366 {
5367 if (3 * i + nelt0 < nelt)
5368 sel[3 * i + nelt0] = j0++;
5369 if (3 * i + nelt1 < nelt)
5370 sel[3 * i + nelt1] = nelt + j1++;
5371 if (3 * i + nelt2 < nelt)
5372 sel[3 * i + nelt2] = 0;
5373 }
5374 indices.new_vector (sel, 2, nelt);
5375 if (!can_vec_perm_const_p (mode, indices))
5376 {
5377 if (dump_enabled_p ())
5378 dump_printf (MSG_MISSED_OPTIMIZATION,
5379 "permutation op not supported by target.\n");
5380 return false;
5381 }
5382
5383 for (i = 0; i < nelt; i++)
5384 {
5385 if (3 * i + nelt0 < nelt)
5386 sel[3 * i + nelt0] = 3 * i + nelt0;
5387 if (3 * i + nelt1 < nelt)
5388 sel[3 * i + nelt1] = 3 * i + nelt1;
5389 if (3 * i + nelt2 < nelt)
5390 sel[3 * i + nelt2] = nelt + j2++;
5391 }
5392 indices.new_vector (sel, 2, nelt);
5393 if (!can_vec_perm_const_p (mode, indices))
5394 {
5395 if (dump_enabled_p ())
5396 dump_printf (MSG_MISSED_OPTIMIZATION,
5397 "permutation op not supported by target.\n");
5398 return false;
5399 }
5400 }
5401 return true;
5402 }
5403 else
5404 {
5405 /* If length is not equal to 3 then only power of 2 is supported. */
5406 gcc_assert (pow2p_hwi (count));
5407 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5408
5409 /* The encoding has 2 interleaved stepped patterns. */
5410 vec_perm_builder sel (nelt, 2, 3);
5411 sel.quick_grow (6);
5412 for (i = 0; i < 3; i++)
5413 {
5414 sel[i * 2] = i;
5415 sel[i * 2 + 1] = i + nelt;
5416 }
5417 vec_perm_indices indices (sel, 2, nelt);
5418 if (can_vec_perm_const_p (mode, indices))
5419 {
5420 for (i = 0; i < 6; i++)
5421 sel[i] += exact_div (nelt, 2);
5422 indices.new_vector (sel, 2, nelt);
5423 if (can_vec_perm_const_p (mode, indices))
5424 return true;
5425 }
5426 }
5427 }
5428
5429 if (dump_enabled_p ())
5430 dump_printf (MSG_MISSED_OPTIMIZATION,
5431 "permutation op not supported by target.\n");
5432 return false;
5433 }
5434
5435
5436 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5437 type VECTYPE. MASKED_P says whether the masked form is needed. */
5438
5439 bool
5440 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5441 bool masked_p)
5442 {
5443 if (masked_p)
5444 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5445 vec_mask_store_lanes_optab,
5446 vectype, count);
5447 else
5448 return vect_lanes_optab_supported_p ("vec_store_lanes",
5449 vec_store_lanes_optab,
5450 vectype, count);
5451 }
5452
5453
5454 /* Function vect_permute_store_chain.
5455
5456 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5457 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5458 the data correctly for the stores. Return the final references for stores
5459 in RESULT_CHAIN.
5460
5461 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5462 The input is 4 vectors each containing 8 elements. We assign a number to
5463 each element, the input sequence is:
5464
5465 1st vec: 0 1 2 3 4 5 6 7
5466 2nd vec: 8 9 10 11 12 13 14 15
5467 3rd vec: 16 17 18 19 20 21 22 23
5468 4th vec: 24 25 26 27 28 29 30 31
5469
5470 The output sequence should be:
5471
5472 1st vec: 0 8 16 24 1 9 17 25
5473 2nd vec: 2 10 18 26 3 11 19 27
   3rd vec:  4 12 20 28  5 13 21 29
5475 4th vec: 6 14 22 30 7 15 23 31
5476
5477 i.e., we interleave the contents of the four vectors in their order.
5478
5479 We use interleave_high/low instructions to create such output. The input of
5480 each interleave_high/low operation is two vectors:
5481 1st vec 2nd vec
5482 0 1 2 3 4 5 6 7
5483 the even elements of the result vector are obtained left-to-right from the
5484 high/low elements of the first vector. The odd elements of the result are
5485 obtained left-to-right from the high/low elements of the second vector.
5486 The output of interleave_high will be: 0 4 1 5
5487 and of interleave_low: 2 6 3 7
5488
5489
5490 The permutation is done in log LENGTH stages. In each stage interleave_high
5491 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5492 where the first argument is taken from the first half of DR_CHAIN and the
5493 second argument from it's second half.
5494 In our example,
5495
5496 I1: interleave_high (1st vec, 3rd vec)
5497 I2: interleave_low (1st vec, 3rd vec)
5498 I3: interleave_high (2nd vec, 4th vec)
5499 I4: interleave_low (2nd vec, 4th vec)
5500
5501 The output for the first stage is:
5502
5503 I1: 0 16 1 17 2 18 3 19
5504 I2: 4 20 5 21 6 22 7 23
5505 I3: 8 24 9 25 10 26 11 27
5506 I4: 12 28 13 29 14 30 15 31
5507
5508 The output of the second stage, i.e. the final result is:
5509
5510 I1: 0 8 16 24 1 9 17 25
5511 I2: 2 10 18 26 3 11 19 27
   I3:  4 12 20 28  5 13 21 29
5513 I4: 6 14 22 30 7 15 23 31. */
5514
void
vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
			  unsigned int length,
			  stmt_vec_info stmt_info,
			  gimple_stmt_iterator *gsi,
			  vec<tree> *result_chain)
{
  tree vect1, vect2, high, low;
  gimple *perm_stmt;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree perm_mask_low, perm_mask_high;
  tree data_ref;
  tree perm3_mask_low, perm3_mask_high;
  unsigned int i, j, n, log_length = exact_log2 (length);

  /* Initialize RESULT_CHAIN from DR_CHAIN; entries are overwritten
     with the permuted vectors below.  */
  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
	  length * sizeof (tree));

  if (length == 3)
    {
      /* vect_grouped_store_supported ensures that this is constant.  */
      unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
      unsigned int j0 = 0, j1 = 0, j2 = 0;

      vec_perm_builder sel (nelt, nelt, 1);
      sel.quick_grow (nelt);
      vec_perm_indices indices;
      /* For each of the three output vectors, combine the inputs with
	 two permutations (built below as perm3_mask_low/high).  */
      for (j = 0; j < 3; j++)
	{
	  /* Residues selecting which lane of each output triple comes
	     from which source for this J.  */
	  int nelt0 = ((3 - j) * nelt) % 3;
	  int nelt1 = ((3 - j) * nelt + 1) % 3;
	  int nelt2 = ((3 - j) * nelt + 2) % 3;

	  for (i = 0; i < nelt; i++)
	    {
	      if (3 * i + nelt0 < nelt)
		sel[3 * i + nelt0] = j0++;
	      if (3 * i + nelt1 < nelt)
		sel[3 * i + nelt1] = nelt + j1++;
	      if (3 * i + nelt2 < nelt)
		/* Don't-care lane; the second permutation below
		   supplies this element.  */
		sel[3 * i + nelt2] = 0;
	    }
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);

	  for (i = 0; i < nelt; i++)
	    {
	      /* Keep the lanes produced by the first permutation and
		 fill the remaining ones from the third input.  */
	      if (3 * i + nelt0 < nelt)
		sel[3 * i + nelt0] = 3 * i + nelt0;
	      if (3 * i + nelt1 < nelt)
		sel[3 * i + nelt1] = 3 * i + nelt1;
	      if (3 * i + nelt2 < nelt)
		sel[3 * i + nelt2] = nelt + j2++;
	    }
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);

	  vect1 = dr_chain[0];
	  vect2 = dr_chain[1];

	  /* Create interleaving stmt:
	     low = VEC_PERM_EXPR <vect1, vect2,
				  {j, nelt, *, j + 1, nelt + j + 1, *,
				   j + 2, nelt + j + 2, *, ...}>  */
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
					   vect2, perm3_mask_low);
	  vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);

	  vect1 = data_ref;
	  vect2 = dr_chain[2];
	  /* Create interleaving stmt:
	     low = VEC_PERM_EXPR <vect1, vect2,
				  {0, 1, nelt + j, 3, 4, nelt + j + 1,
				   6, 7, nelt + j + 2, ...}>  */
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
					   vect2, perm3_mask_high);
	  vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
	  (*result_chain)[j] = data_ref;
	}
    }
  else
    {
      /* If length is not equal to 3 then only power of 2 is supported.  */
      gcc_assert (pow2p_hwi (length));

      /* The encoding has 2 interleaved stepped patterns.  */
      poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
      vec_perm_builder sel (nelt, 2, 3);
      sel.quick_grow (6);
      for (i = 0; i < 3; i++)
	{
	  sel[i * 2] = i;
	  sel[i * 2 + 1] = i + nelt;
	}
	vec_perm_indices indices (sel, 2, nelt);
	perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);

	for (i = 0; i < 6; i++)
	  sel[i] += exact_div (nelt, 2);
	indices.new_vector (sel, 2, nelt);
	perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);

	/* log2(LENGTH) interleave stages; each stage interleaves the
	   first half of DR_CHAIN with the second half.  */
	for (i = 0, n = log_length; i < n; i++)
	  {
	    for (j = 0; j < length/2; j++)
	      {
		vect1 = dr_chain[j];
		vect2 = dr_chain[j+length/2];

		/* Create interleaving stmt:
		   high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
							...}>  */
		high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
		perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
						 vect2, perm_mask_high);
		vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
		(*result_chain)[2*j] = high;

		/* Create interleaving stmt:
		   low = VEC_PERM_EXPR <vect1, vect2,
					{nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
					 ...}>  */
		low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
		perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
						 vect2, perm_mask_low);
		vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
		(*result_chain)[2*j+1] = low;
	      }
	    /* Feed this stage's output into the next stage.  */
	    memcpy (dr_chain.address (), result_chain->address (),
		    length * sizeof (tree));
	  }
    }
}
5651
5652 /* Function vect_setup_realignment
5653
5654 This function is called when vectorizing an unaligned load using
5655 the dr_explicit_realign[_optimized] scheme.
5656 This function generates the following code at the loop prolog:
5657
5658 p = initial_addr;
5659 x msq_init = *(floor(p)); # prolog load
5660 realignment_token = call target_builtin;
5661 loop:
5662 x msq = phi (msq_init, ---)
5663
5664 The stmts marked with x are generated only for the case of
5665 dr_explicit_realign_optimized.
5666
5667 The code above sets up a new (vector) pointer, pointing to the first
5668 location accessed by STMT_INFO, and a "floor-aligned" load using that
5669 pointer. It also generates code to compute the "realignment-token"
5670 (if the relevant target hook was defined), and creates a phi-node at the
5671 loop-header bb whose arguments are the result of the prolog-load (created
5672 by this function) and the result of a load that takes place in the loop
5673 (to be created by the caller to this function).
5674
5675 For the case of dr_explicit_realign_optimized:
5676 The caller to this function uses the phi-result (msq) to create the
5677 realignment code inside the loop, and sets up the missing phi argument,
5678 as follows:
5679 loop:
5680 msq = phi (msq_init, lsq)
5681 lsq = *(floor(p')); # load in loop
5682 result = realign_load (msq, lsq, realignment_token);
5683
5684 For the case of dr_explicit_realign:
5685 loop:
5686 msq = *(floor(p)); # load in loop
5687 p' = p + (VS-1);
5688 lsq = *(floor(p')); # load in loop
5689 result = realign_load (msq, lsq, realignment_token);
5690
5691 Input:
5692 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5693 a memory location that may be unaligned.
5694 BSI - place where new code is to be inserted.
5695 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5696 is used.
5697
5698 Output:
5699 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5700 target hook, if defined.
5701 Return value - the result of the loop-header phi node. */
5702
tree
vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
			gimple_stmt_iterator *gsi, tree *realignment_token,
			enum dr_alignment_support alignment_support_scheme,
			tree init_addr,
			class loop **at_loop)
{
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
  struct data_reference *dr = dr_info->dr;
  class loop *loop = NULL;
  edge pe = NULL;
  tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
  tree vec_dest;
  gimple *inc;
  tree ptr;
  tree data_ref;
  basic_block new_bb;
  tree msq_init = NULL_TREE;
  tree new_temp;
  gphi *phi_stmt;
  tree msq = NULL_TREE;
  gimple_seq stmts = NULL;
  bool compute_in_loop = false;
  bool nested_in_vect_loop = false;
  class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
  class loop *loop_for_initial_load = NULL;

  /* Only loop vectorization has a LOOP; for basic-block vectorization
     LOOP stays NULL and everything is emitted at GSI.  */
  if (loop_vinfo)
    {
      loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
    }

  /* The caller must have chosen one of the two explicit realignment
     schemes; the implicit (hardware) scheme is handled elsewhere.  */
  gcc_assert (alignment_support_scheme == dr_explicit_realign
	      || alignment_support_scheme == dr_explicit_realign_optimized);

  /* We need to generate three things:
     1. the misalignment computation
     2. the extra vector load (for the optimized realignment scheme).
     3. the phi node for the two vectors from which the realignment is
      done (for the optimized realignment scheme).  */

  /* 1. Determine where to generate the misalignment computation.

     If INIT_ADDR is NULL_TREE, this indicates that the misalignment
     calculation will be generated by this function, outside the loop (in the
     preheader).  Otherwise, INIT_ADDR had already been computed for us by the
     caller, inside the loop.

     Background: If the misalignment remains fixed throughout the iterations of
     the loop, then both realignment schemes are applicable, and also the
     misalignment computation can be done outside LOOP.  This is because we are
     vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
     are a multiple of VS (the Vector Size), and therefore the misalignment in
     different vectorized LOOP iterations is always the same.
     The problem arises only if the memory access is in an inner-loop nested
     inside LOOP, which is now being vectorized using outer-loop vectorization.
     This is the only case when the misalignment of the memory access may not
     remain fixed throughout the iterations of the inner-loop (as explained in
     detail in vect_supportable_dr_alignment).  In this case, not only is the
     optimized realignment scheme not applicable, but also the misalignment
     computation (and generation of the realignment token that is passed to
     REALIGN_LOAD) have to be done inside the loop.

     In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
     or not, which in turn determines if the misalignment is computed inside
     the inner-loop, or outside LOOP.  */

  if (init_addr != NULL_TREE || !loop_vinfo)
    {
      compute_in_loop = true;
      gcc_assert (alignment_support_scheme == dr_explicit_realign);
    }


  /* 2. Determine where to generate the extra vector load.

     For the optimized realignment scheme, instead of generating two vector
     loads in each iteration, we generate a single extra vector load in the
     preheader of the loop, and in each iteration reuse the result of the
     vector load from the previous iteration.  In case the memory access is in
     an inner-loop nested inside LOOP, which is now being vectorized using
     outer-loop vectorization, we need to determine whether this initial vector
     load should be generated at the preheader of the inner-loop, or can be
     generated at the preheader of LOOP.  If the memory access has no evolution
     in LOOP, it can be generated in the preheader of LOOP.  Otherwise, it has
     to be generated inside LOOP (in the preheader of the inner-loop).  */

  if (nested_in_vect_loop)
    {
      /* A zero outer-loop step means the address is invariant in LOOP,
	 so the initial load may be hoisted to LOOP's preheader.  */
      tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
      bool invariant_in_outerloop =
	(tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
      loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
    }
  else
    loop_for_initial_load = loop;
  if (at_loop)
    *at_loop = loop_for_initial_load;

  if (loop_for_initial_load)
    pe = loop_preheader_edge (loop_for_initial_load);

  /* 3. For the case of the optimized realignment, create the first vector
      load at the loop preheader.  */

  if (alignment_support_scheme == dr_explicit_realign_optimized)
    {
      /* Create msq_init = *(floor(p1)) in the loop preheader  */
      gassign *new_stmt;

      gcc_assert (!compute_in_loop);
      vec_dest = vect_create_destination_var (scalar_dest, vectype);
      ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
				      loop_for_initial_load, NULL_TREE,
				      &init_addr, NULL, &inc, true);
      if (TREE_CODE (ptr) == SSA_NAME)
	new_temp = copy_ssa_name (ptr);
      else
	new_temp = make_ssa_name (TREE_TYPE (ptr));
      poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
      tree type = TREE_TYPE (ptr);
      /* Floor-align the pointer: new_temp = ptr & -align, where -align is
	 built as (0 - align) in the pointer's type.  */
      new_stmt = gimple_build_assign
		   (new_temp, BIT_AND_EXPR, ptr,
		    fold_build2 (MINUS_EXPR, type,
				 build_int_cst (type, 0),
				 build_int_cst (type, align)));
      /* Inserting on the preheader edge must not create a new block.  */
      new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
      gcc_assert (!new_bb);
      /* Load the vector through the floor-aligned pointer, keeping the
	 alias information of the original scalar reference.  */
      data_ref
	= build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
		  build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
      vect_copy_ref_info (data_ref, DR_REF (dr));
      new_stmt = gimple_build_assign (vec_dest, data_ref);
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_assign_set_lhs (new_stmt, new_temp);
      if (pe)
	{
	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
	  gcc_assert (!new_bb);
	}
      else
	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);

      msq_init = gimple_assign_lhs (new_stmt);
    }

  /* 4. Create realignment token using a target builtin, if available.
      It is done either inside the containing loop, or before LOOP (as
      determined above).  */

  if (targetm.vectorize.builtin_mask_for_load)
    {
      gcall *new_stmt;
      tree builtin_decl;

      /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
      if (!init_addr)
	{
	  /* Generate the INIT_ADDR computation outside LOOP.  */
	  init_addr = vect_create_addr_base_for_vector_ref (vinfo,
							    stmt_info, &stmts,
							    NULL_TREE);
	  if (loop)
	    {
	      pe = loop_preheader_edge (loop);
	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
	      gcc_assert (!new_bb);
	    }
	  else
	    gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
	}

      builtin_decl = targetm.vectorize.builtin_mask_for_load ();
      new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
      vec_dest =
	vect_create_destination_var (scalar_dest,
				     gimple_call_return_type (new_stmt));
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_call_set_lhs (new_stmt, new_temp);

      if (compute_in_loop)
	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
      else
	{
	  /* Generate the misalignment computation outside LOOP.  */
	  pe = loop_preheader_edge (loop);
	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
	  gcc_assert (!new_bb);
	}

      *realignment_token = gimple_call_lhs (new_stmt);

      /* The result of the CALL_EXPR to this builtin is determined from
	 the value of the parameter and no global variables are touched
	 which makes the builtin a "const" function.  Requiring the
	 builtin to have the "const" attribute makes it unnecessary
	 to call mark_call_clobbered.  */
      gcc_assert (TREE_READONLY (builtin_decl));
    }

  /* For the non-optimized scheme nothing more is needed; the caller
     emits both loads inside the loop.  */
  if (alignment_support_scheme == dr_explicit_realign)
    return msq;

  gcc_assert (!compute_in_loop);
  gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);


  /* 5. Create msq = phi <msq_init, lsq> in loop  */

  /* Only the preheader argument (msq_init) is added here; the caller
     fills in the loop-carried argument (lsq) later.  */
  pe = loop_preheader_edge (containing_loop);
  vec_dest = vect_create_destination_var (scalar_dest, vectype);
  msq = make_ssa_name (vec_dest);
  phi_stmt = create_phi_node (msq, containing_loop->header);
  add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);

  return msq;
}
5923
5924
5925 /* Function vect_grouped_load_supported.
5926
5927 COUNT is the size of the load group (the number of statements plus the
5928 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5929 only one statement, with a gap of COUNT - 1.
5930
5931 Returns true if a suitable permute exists. */
5932
5933 bool
5934 vect_grouped_load_supported (tree vectype, bool single_element_p,
5935 unsigned HOST_WIDE_INT count)
5936 {
5937 machine_mode mode = TYPE_MODE (vectype);
5938
5939 /* If this is single-element interleaving with an element distance
5940 that leaves unused vector loads around punt - we at least create
5941 very sub-optimal code in that case (and blow up memory,
5942 see PR65518). */
5943 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5944 {
5945 if (dump_enabled_p ())
5946 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5947 "single-element interleaving not supported "
5948 "for not adjacent vector loads\n");
5949 return false;
5950 }
5951
5952 /* vect_permute_load_chain requires the group size to be equal to 3 or
5953 be a power of two. */
5954 if (count != 3 && exact_log2 (count) == -1)
5955 {
5956 if (dump_enabled_p ())
5957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5958 "the size of the group of accesses"
5959 " is not a power of 2 or not equal to 3\n");
5960 return false;
5961 }
5962
5963 /* Check that the permutation is supported. */
5964 if (VECTOR_MODE_P (mode))
5965 {
5966 unsigned int i, j;
5967 if (count == 3)
5968 {
5969 unsigned int nelt;
5970 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5971 {
5972 if (dump_enabled_p ())
5973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5974 "cannot handle groups of 3 loads for"
5975 " variable-length vectors\n");
5976 return false;
5977 }
5978
5979 vec_perm_builder sel (nelt, nelt, 1);
5980 sel.quick_grow (nelt);
5981 vec_perm_indices indices;
5982 unsigned int k;
5983 for (k = 0; k < 3; k++)
5984 {
5985 for (i = 0; i < nelt; i++)
5986 if (3 * i + k < 2 * nelt)
5987 sel[i] = 3 * i + k;
5988 else
5989 sel[i] = 0;
5990 indices.new_vector (sel, 2, nelt);
5991 if (!can_vec_perm_const_p (mode, indices))
5992 {
5993 if (dump_enabled_p ())
5994 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5995 "shuffle of 3 loads is not supported by"
5996 " target\n");
5997 return false;
5998 }
5999 for (i = 0, j = 0; i < nelt; i++)
6000 if (3 * i + k < 2 * nelt)
6001 sel[i] = i;
6002 else
6003 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6004 indices.new_vector (sel, 2, nelt);
6005 if (!can_vec_perm_const_p (mode, indices))
6006 {
6007 if (dump_enabled_p ())
6008 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6009 "shuffle of 3 loads is not supported by"
6010 " target\n");
6011 return false;
6012 }
6013 }
6014 return true;
6015 }
6016 else
6017 {
6018 /* If length is not equal to 3 then only power of 2 is supported. */
6019 gcc_assert (pow2p_hwi (count));
6020 poly_uint64 nelt = GET_MODE_NUNITS (mode);
6021
6022 /* The encoding has a single stepped pattern. */
6023 vec_perm_builder sel (nelt, 1, 3);
6024 sel.quick_grow (3);
6025 for (i = 0; i < 3; i++)
6026 sel[i] = i * 2;
6027 vec_perm_indices indices (sel, 2, nelt);
6028 if (can_vec_perm_const_p (mode, indices))
6029 {
6030 for (i = 0; i < 3; i++)
6031 sel[i] = i * 2 + 1;
6032 indices.new_vector (sel, 2, nelt);
6033 if (can_vec_perm_const_p (mode, indices))
6034 return true;
6035 }
6036 }
6037 }
6038
6039 if (dump_enabled_p ())
6040 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6041 "extract even/odd not supported by target\n");
6042 return false;
6043 }
6044
6045 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
6046 type VECTYPE. MASKED_P says whether the masked form is needed. */
6047
6048 bool
6049 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6050 bool masked_p)
6051 {
6052 if (masked_p)
6053 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6054 vec_mask_load_lanes_optab,
6055 vectype, count);
6056 else
6057 return vect_lanes_optab_supported_p ("vec_load_lanes",
6058 vec_load_lanes_optab,
6059 vectype, count);
6060 }
6061
6062 /* Function vect_permute_load_chain.
6063
6064 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6065 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6066 the input data correctly. Return the final references for loads in
6067 RESULT_CHAIN.
6068
6069 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6070 The input is 4 vectors each containing 8 elements. We assign a number to each
6071 element, the input sequence is:
6072
6073 1st vec: 0 1 2 3 4 5 6 7
6074 2nd vec: 8 9 10 11 12 13 14 15
6075 3rd vec: 16 17 18 19 20 21 22 23
6076 4th vec: 24 25 26 27 28 29 30 31
6077
6078 The output sequence should be:
6079
6080 1st vec: 0 4 8 12 16 20 24 28
6081 2nd vec: 1 5 9 13 17 21 25 29
6082 3rd vec: 2 6 10 14 18 22 26 30
6083 4th vec: 3 7 11 15 19 23 27 31
6084
6085 i.e., the first output vector should contain the first elements of each
6086 interleaving group, etc.
6087
6088 We use extract_even/odd instructions to create such output. The input of
6089 each extract_even/odd operation is two vectors
6090 1st vec 2nd vec
6091 0 1 2 3 4 5 6 7
6092
6093 and the output is the vector of extracted even/odd elements. The output of
6094 extract_even will be: 0 2 4 6
6095 and of extract_odd: 1 3 5 7
6096
6097
6098 The permutation is done in log LENGTH stages. In each stage extract_even
6099 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6100 their order. In our example,
6101
6102 E1: extract_even (1st vec, 2nd vec)
6103 E2: extract_odd (1st vec, 2nd vec)
6104 E3: extract_even (3rd vec, 4th vec)
6105 E4: extract_odd (3rd vec, 4th vec)
6106
6107 The output for the first stage will be:
6108
6109 E1: 0 2 4 6 8 10 12 14
6110 E2: 1 3 5 7 9 11 13 15
6111 E3: 16 18 20 22 24 26 28 30
6112 E4: 17 19 21 23 25 27 29 31
6113
6114 In order to proceed and create the correct sequence for the next stage (or
6115 for the correct output, if the second stage is the last one, as in our
6116 example), we first put the output of extract_even operation and then the
6117 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6118 The input for the second stage is:
6119
6120 1st vec (E1): 0 2 4 6 8 10 12 14
6121 2nd vec (E3): 16 18 20 22 24 26 28 30
6122 3rd vec (E2): 1 3 5 7 9 11 13 15
6123 4th vec (E4): 17 19 21 23 25 27 29 31
6124
6125 The output of the second stage:
6126
6127 E1: 0 4 8 12 16 20 24 28
6128 E2: 2 6 10 14 18 22 26 30
6129 E3: 1 5 9 13 17 21 25 29
6130 E4: 3 7 11 15 19 23 27 31
6131
6132 And RESULT_CHAIN after reordering:
6133
6134 1st vec (E1): 0 4 8 12 16 20 24 28
6135 2nd vec (E3): 1 5 9 13 17 21 25 29
6136 3rd vec (E2): 2 6 10 14 18 22 26 30
6137 4th vec (E4): 3 7 11 15 19 23 27 31. */
6138
static void
vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
			 unsigned int length,
			 stmt_vec_info stmt_info,
			 gimple_stmt_iterator *gsi,
			 vec<tree> *result_chain)
{
  tree data_ref, first_vect, second_vect;
  tree perm_mask_even, perm_mask_odd;
  tree perm3_mask_low, perm3_mask_high;
  gimple *perm_stmt;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  unsigned int i, j, log_length = exact_log2 (length);

  /* Seed RESULT_CHAIN with the input vectors; the stages below read
     DR_CHAIN and write the permuted vectors into RESULT_CHAIN.  */
  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
	  length * sizeof (tree));

  if (length == 3)
    {
      /* vect_grouped_load_supported ensures that this is constant.  */
      unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
      unsigned int k;

      vec_perm_builder sel (nelt, nelt, 1);
      sel.quick_grow (nelt);
      vec_perm_indices indices;
      /* K selects which output vector (elements K, K+3, K+6, ...) is
	 produced on this iteration.  */
      for (k = 0; k < 3; k++)
	{
	  /* Low mask: pick elements K, K+3, ... that live in the first
	     two input vectors; the remaining lanes are don't-care (0).  */
	  for (i = 0; i < nelt; i++)
	    if (3 * i + k < 2 * nelt)
	      sel[i] = 3 * i + k;
	    else
	      sel[i] = 0;
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);

	  /* High mask: keep the lanes gathered so far in place and fill
	     the rest from the third input vector.  */
	  for (i = 0, j = 0; i < nelt; i++)
	    if (3 * i + k < 2 * nelt)
	      sel[i] = i;
	    else
	      sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);

	  first_vect = dr_chain[0];
	  second_vect = dr_chain[1];

	  /* Create interleaving stmt (low part of):
	     low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
							    ...}>  */
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
					   second_vect, perm3_mask_low);
	  vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);

	  /* Create interleaving stmt (high part of):
	     high = VEC_PERM_EXPR <low, third_vect, perm3_mask_high>,
	     merging in the elements still missing from the third vector.  */
	  first_vect = data_ref;
	  second_vect = dr_chain[2];
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
					   second_vect, perm3_mask_high);
	  vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
	  (*result_chain)[k] = data_ref;
	}
    }
  else
    {
      /* If length is not equal to 3 then only power of 2 is supported.  */
      gcc_assert (pow2p_hwi (length));

      /* The encoding has a single stepped pattern.  */
      poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
      vec_perm_builder sel (nelt, 1, 3);
      sel.quick_grow (3);
      /* Extract-even mask: { 0, 2, 4, ... }.  */
      for (i = 0; i < 3; ++i)
	sel[i] = i * 2;
      vec_perm_indices indices (sel, 2, nelt);
      perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);

      /* Extract-odd mask: { 1, 3, 5, ... }.  */
      for (i = 0; i < 3; ++i)
	sel[i] = i * 2 + 1;
      indices.new_vector (sel, 2, nelt);
      perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);

      /* log2(LENGTH) stages; after each stage the partial results are
	 copied back into DR_CHAIN as input for the next stage (see the
	 function comment for a worked example).  */
      for (i = 0; i < log_length; i++)
	{
	  for (j = 0; j < length; j += 2)
	    {
	      first_vect = dr_chain[j];
	      second_vect = dr_chain[j+1];

	      /* data_ref = permute_even (first_data_ref, second_data_ref);  */
	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       first_vect, second_vect,
					       perm_mask_even);
	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
	      (*result_chain)[j/2] = data_ref;

	      /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       first_vect, second_vect,
					       perm_mask_odd);
	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
	      (*result_chain)[j/2+length/2] = data_ref;
	    }
	  memcpy (dr_chain.address (), result_chain->address (),
		  length * sizeof (tree));
	}
    }
}
6254
6255 /* Function vect_shift_permute_load_chain.
6256
6257 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6258 sequence of stmts to reorder the input data accordingly.
6259 Return the final references for loads in RESULT_CHAIN.
   Return true if it succeeded, false otherwise.
6261
6262 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6263 The input is 3 vectors each containing 8 elements. We assign a
6264 number to each element, the input sequence is:
6265
6266 1st vec: 0 1 2 3 4 5 6 7
6267 2nd vec: 8 9 10 11 12 13 14 15
6268 3rd vec: 16 17 18 19 20 21 22 23
6269
6270 The output sequence should be:
6271
6272 1st vec: 0 3 6 9 12 15 18 21
6273 2nd vec: 1 4 7 10 13 16 19 22
6274 3rd vec: 2 5 8 11 14 17 20 23
6275
6276 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6277
6278 First we shuffle all 3 vectors to get correct elements order:
6279
6280 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6281 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6282 3rd vec: (16 19 22) (17 20 23) (18 21)
6283
6284 Next we unite and shift vector 3 times:
6285
6286 1st step:
6287 shift right by 6 the concatenation of:
6288 "1st vec" and "2nd vec"
6289 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6290 "2nd vec" and "3rd vec"
6291 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6292 "3rd vec" and "1st vec"
6293 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6294 | New vectors |
6295
6296 So that now new vectors are:
6297
6298 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6299 2nd vec: (10 13) (16 19 22) (17 20 23)
6300 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6301
6302 2nd step:
6303 shift right by 5 the concatenation of:
6304 "1st vec" and "3rd vec"
6305 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6306 "2nd vec" and "1st vec"
6307 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6308 "3rd vec" and "2nd vec"
6309 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6310 | New vectors |
6311
6312 So that now new vectors are:
6313
6314 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6315 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6316 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6317
6318 3rd step:
6319 shift right by 5 the concatenation of:
6320 "1st vec" and "1st vec"
6321 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6322 shift right by 3 the concatenation of:
6323 "2nd vec" and "2nd vec"
6324 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6325 | New vectors |
6326
6327 So that now all vectors are READY:
6328 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6329 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6330 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6331
6332 This algorithm is faster than one in vect_permute_load_chain if:
     1. "shift of a concatenation" is faster than general permutation.
6334 This is usually so.
6335 2. The TARGET machine can't execute vector instructions in parallel.
6336 This is because each step of the algorithm depends on previous.
6337 The algorithm in vect_permute_load_chain is much more parallel.
6338
6339 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6340 */
6341
6342 static bool
6343 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6344 unsigned int length,
6345 stmt_vec_info stmt_info,
6346 gimple_stmt_iterator *gsi,
6347 vec<tree> *result_chain)
6348 {
6349 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6350 tree perm2_mask1, perm2_mask2, perm3_mask;
6351 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6352 gimple *perm_stmt;
6353
6354 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6355 unsigned int i;
6356 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6357
6358 unsigned HOST_WIDE_INT nelt, vf;
6359 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6360 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6361 /* Not supported for variable-length vectors. */
6362 return false;
6363
6364 vec_perm_builder sel (nelt, nelt, 1);
6365 sel.quick_grow (nelt);
6366
6367 result_chain->quick_grow (length);
6368 memcpy (result_chain->address (), dr_chain.address (),
6369 length * sizeof (tree));
6370
6371 if (pow2p_hwi (length) && vf > 4)
6372 {
6373 unsigned int j, log_length = exact_log2 (length);
6374 for (i = 0; i < nelt / 2; ++i)
6375 sel[i] = i * 2;
6376 for (i = 0; i < nelt / 2; ++i)
6377 sel[nelt / 2 + i] = i * 2 + 1;
6378 vec_perm_indices indices (sel, 2, nelt);
6379 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6380 {
6381 if (dump_enabled_p ())
6382 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6383 "shuffle of 2 fields structure is not \
6384 supported by target\n");
6385 return false;
6386 }
6387 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6388
6389 for (i = 0; i < nelt / 2; ++i)
6390 sel[i] = i * 2 + 1;
6391 for (i = 0; i < nelt / 2; ++i)
6392 sel[nelt / 2 + i] = i * 2;
6393 indices.new_vector (sel, 2, nelt);
6394 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6395 {
6396 if (dump_enabled_p ())
6397 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6398 "shuffle of 2 fields structure is not \
6399 supported by target\n");
6400 return false;
6401 }
6402 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6403
6404 /* Generating permutation constant to shift all elements.
6405 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6406 for (i = 0; i < nelt; i++)
6407 sel[i] = nelt / 2 + i;
6408 indices.new_vector (sel, 2, nelt);
6409 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6410 {
6411 if (dump_enabled_p ())
6412 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6413 "shift permutation is not supported by target\n");
6414 return false;
6415 }
6416 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6417
6418 /* Generating permutation constant to select vector from 2.
6419 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6420 for (i = 0; i < nelt / 2; i++)
6421 sel[i] = i;
6422 for (i = nelt / 2; i < nelt; i++)
6423 sel[i] = nelt + i;
6424 indices.new_vector (sel, 2, nelt);
6425 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6426 {
6427 if (dump_enabled_p ())
6428 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6429 "select is not supported by target\n");
6430 return false;
6431 }
6432 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6433
6434 for (i = 0; i < log_length; i++)
6435 {
6436 for (j = 0; j < length; j += 2)
6437 {
6438 first_vect = dr_chain[j];
6439 second_vect = dr_chain[j + 1];
6440
6441 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6442 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6443 first_vect, first_vect,
6444 perm2_mask1);
6445 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6446 vect[0] = data_ref;
6447
6448 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6449 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6450 second_vect, second_vect,
6451 perm2_mask2);
6452 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6453 vect[1] = data_ref;
6454
6455 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6456 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6457 vect[0], vect[1], shift1_mask);
6458 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6459 (*result_chain)[j/2 + length/2] = data_ref;
6460
6461 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6462 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6463 vect[0], vect[1], select_mask);
6464 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6465 (*result_chain)[j/2] = data_ref;
6466 }
6467 memcpy (dr_chain.address (), result_chain->address (),
6468 length * sizeof (tree));
6469 }
6470 return true;
6471 }
6472 if (length == 3 && vf > 2)
6473 {
6474 unsigned int k = 0, l = 0;
6475
      /* Generating permutation constant to get all elements in right order.
6477 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6478 for (i = 0; i < nelt; i++)
6479 {
6480 if (3 * k + (l % 3) >= nelt)
6481 {
6482 k = 0;
6483 l += (3 - (nelt % 3));
6484 }
6485 sel[i] = 3 * k + (l % 3);
6486 k++;
6487 }
6488 vec_perm_indices indices (sel, 2, nelt);
6489 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6490 {
6491 if (dump_enabled_p ())
6492 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6493 "shuffle of 3 fields structure is not \
6494 supported by target\n");
6495 return false;
6496 }
6497 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6498
6499 /* Generating permutation constant to shift all elements.
6500 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6501 for (i = 0; i < nelt; i++)
6502 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6503 indices.new_vector (sel, 2, nelt);
6504 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6505 {
6506 if (dump_enabled_p ())
6507 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6508 "shift permutation is not supported by target\n");
6509 return false;
6510 }
6511 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6512
6513 /* Generating permutation constant to shift all elements.
6514 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6515 for (i = 0; i < nelt; i++)
6516 sel[i] = 2 * (nelt / 3) + 1 + i;
6517 indices.new_vector (sel, 2, nelt);
6518 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6519 {
6520 if (dump_enabled_p ())
6521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6522 "shift permutation is not supported by target\n");
6523 return false;
6524 }
6525 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6526
6527 /* Generating permutation constant to shift all elements.
6528 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6529 for (i = 0; i < nelt; i++)
6530 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6531 indices.new_vector (sel, 2, nelt);
6532 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6533 {
6534 if (dump_enabled_p ())
6535 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6536 "shift permutation is not supported by target\n");
6537 return false;
6538 }
6539 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6540
6541 /* Generating permutation constant to shift all elements.
6542 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6543 for (i = 0; i < nelt; i++)
6544 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6545 indices.new_vector (sel, 2, nelt);
6546 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6547 {
6548 if (dump_enabled_p ())
6549 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6550 "shift permutation is not supported by target\n");
6551 return false;
6552 }
6553 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6554
6555 for (k = 0; k < 3; k++)
6556 {
6557 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6558 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6559 dr_chain[k], dr_chain[k],
6560 perm3_mask);
6561 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6562 vect[k] = data_ref;
6563 }
6564
6565 for (k = 0; k < 3; k++)
6566 {
6567 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6568 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6569 vect[k % 3], vect[(k + 1) % 3],
6570 shift1_mask);
6571 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6572 vect_shift[k] = data_ref;
6573 }
6574
6575 for (k = 0; k < 3; k++)
6576 {
6577 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6578 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6579 vect_shift[(4 - k) % 3],
6580 vect_shift[(3 - k) % 3],
6581 shift2_mask);
6582 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6583 vect[k] = data_ref;
6584 }
6585
6586 (*result_chain)[3 - (nelt % 3)] = vect[2];
6587
6588 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6589 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6590 vect[0], shift3_mask);
6591 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6592 (*result_chain)[nelt % 3] = data_ref;
6593
6594 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6595 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6596 vect[1], shift4_mask);
6597 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6598 (*result_chain)[0] = data_ref;
6599 return true;
6600 }
6601 return false;
6602 }
6603
6604 /* Function vect_transform_grouped_load.
6605
6606 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6607 to perform their permutation and ascribe the result vectorized statements to
6608 the scalar statements.
6609 */
6610
6611 void
6612 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6613 vec<tree> dr_chain,
6614 int size, gimple_stmt_iterator *gsi)
6615 {
6616 machine_mode mode;
6617 vec<tree> result_chain = vNULL;
6618
6619 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6620 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6621 vectors, that are ready for vector computation. */
6622 result_chain.create (size);
6623
6624 /* If reassociation width for vector type is 2 or greater target machine can
6625 execute 2 or more vector instructions in parallel. Otherwise try to
6626 get chain for loads group using vect_shift_permute_load_chain. */
6627 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6628 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6629 || pow2p_hwi (size)
6630 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6631 gsi, &result_chain))
6632 vect_permute_load_chain (vinfo, dr_chain,
6633 size, stmt_info, gsi, &result_chain);
6634 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6635 result_chain.release ();
6636 }
6637
6638 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6639 generated as part of the vectorization of STMT_INFO. Assign the statement
6640 for each vector to the associated scalar statement. */
6641
6642 void
6643 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6644 vec<tree> result_chain)
6645 {
6646 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6647 unsigned int i, gap_count;
6648 tree tmp_data_ref;
6649
6650 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6651 Since we scan the chain starting from it's first node, their order
6652 corresponds the order of data-refs in RESULT_CHAIN. */
6653 stmt_vec_info next_stmt_info = first_stmt_info;
6654 gap_count = 1;
6655 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6656 {
6657 if (!next_stmt_info)
6658 break;
6659
6660 /* Skip the gaps. Loads created for the gaps will be removed by dead
6661 code elimination pass later. No need to check for the first stmt in
6662 the group, since it always exists.
6663 DR_GROUP_GAP is the number of steps in elements from the previous
6664 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6665 correspond to the gaps. */
6666 if (next_stmt_info != first_stmt_info
6667 && gap_count < DR_GROUP_GAP (next_stmt_info))
6668 {
6669 gap_count++;
6670 continue;
6671 }
6672
6673 /* ??? The following needs cleanup after the removal of
6674 DR_GROUP_SAME_DR_STMT. */
6675 if (next_stmt_info)
6676 {
6677 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6678 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6679 copies, and we put the new vector statement last. */
6680 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6681
6682 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6683 gap_count = 1;
6684 }
6685 }
6686 }
6687
/* Function vect_can_force_dr_alignment_p.

   Returns whether the alignment of a DECL can be forced to be aligned
   on ALIGNMENT bit boundary.  */
6692
6693 bool
6694 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6695 {
6696 if (!VAR_P (decl))
6697 return false;
6698
6699 if (decl_in_symtab_p (decl)
6700 && !symtab_node::get (decl)->can_increase_alignment_p ())
6701 return false;
6702
6703 if (TREE_STATIC (decl))
6704 return (known_le (alignment,
6705 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6706 else
6707 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6708 }
6709
/* Return whether the data reference DR_INFO is supported with respect to its
   alignment.  VECTYPE is the vector type used for the access and
   MISALIGNMENT the known misalignment in bytes of the access, or
   DR_MISALIGNMENT_UNKNOWN if the misalignment is not known.  */
6715
enum dr_alignment_support
vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
			       tree vectype, int misalignment)
{
  data_reference *dr = dr_info->dr;
  stmt_vec_info stmt_info = dr_info->stmt;
  machine_mode mode = TYPE_MODE (vectype);
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  class loop *vect_loop = NULL;
  bool nested_in_vect_loop = false;

  /* A known misalignment of zero means the access is fully aligned.  */
  if (misalignment == 0)
    return dr_aligned;

  /* For now assume all conditional loads/stores support unaligned
     access without any special code.  */
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    if (gimple_call_internal_p (stmt)
	&& (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
	    || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
      return dr_unaligned_supported;

  if (loop_vinfo)
    {
      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
    }

  /* Possibly unaligned access.  */

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD).  There are two variants to the
     explicit realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout the
     execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that it will remain the same throughout
     the execution of the inner-loop.  This is because the inner-loop advances
     with the original scalar step (and not in steps of VS).  If the inner-loop
     step happens to be a multiple of VS, then the misalignment remains fixed
     and we can use the optimized realignment scheme.  For example:

      for (i=0; i<N; i++)
        for (j=0; j<M; j++)
          s += a[i+j];

     When vectorizing the i-loop in the above example, the step between
     consecutive vector loads is 1, and so the misalignment does not remain
     fixed across the execution of the inner-loop, and the realignment cannot
     be optimized (as illustrated in the following pseudo vectorized loop):

      for (i=0; i<N; i+=4)
        for (j=0; j<M; j++){
          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
                         // (assuming that we start from an aligned address).
          }

     We therefore have to use the unoptimized realignment scheme:

      for (i=0; i<N; i+=4)
          for (j=k; j<M; j+=4)
          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
                           // that the misalignment of the initial address is
                           // 0).

     The loop can then be vectorized as follows:

      for (k=0; k<4; k++){
        rt = get_realignment_token (&vp[k]);
        for (i=0; i<N; i+=4){
          v1 = vp[i+k];
          for (j=k; j<M; j+=4){
            v2 = vp[i+j+VS-1];
            va = REALIGN_LOAD <v1,v2,rt>;
            vs += va;
            v1 = v2;
          }
        }
    } */

  if (DR_IS_READ (dr))
    {
      /* The explicit realignment schemes are only usable for loads, and
	 only when the target provides vec_realign_load and (if it defines
	 the hook at all) a mask-for-load builtin.  */
      if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
	  && (!targetm.vectorize.builtin_mask_for_load
	      || targetm.vectorize.builtin_mask_for_load ()))
	{
	  /* If we are doing SLP then the accesses need not have the
	     same alignment, instead it depends on the SLP group size.  */
	  if (loop_vinfo
	      && STMT_SLP_TYPE (stmt_info)
	      && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
			      * (DR_GROUP_SIZE
				 (DR_GROUP_FIRST_ELEMENT (stmt_info)))),
			      TYPE_VECTOR_SUBPARTS (vectype)))
	    ;
	  else if (!loop_vinfo
		   || (nested_in_vect_loop
		       && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
				    GET_MODE_SIZE (TYPE_MODE (vectype)))))
	    return dr_explicit_realign;
	  else
	    return dr_explicit_realign_optimized;
	}
    }

  /* Finally ask the target whether it supports misaligned vector accesses
     of this mode and type directly; a statically unknown misalignment of
     a not-size-aligned reference is flagged as "packed".  */
  bool is_packed = false;
  tree type = TREE_TYPE (DR_REF (dr));
  if (misalignment == DR_MISALIGNMENT_UNKNOWN)
    is_packed = not_size_aligned (DR_REF (dr));
  if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
						     is_packed))
    return dr_unaligned_supported;

  /* Unsupported.  */
  return dr_unaligned_unsupported;
}
6843