1 1.1 mrg /* Loop interchange. 2 1.1.1.4 mrg Copyright (C) 2017-2022 Free Software Foundation, Inc. 3 1.1 mrg Contributed by ARM Ltd. 4 1.1 mrg 5 1.1 mrg This file is part of GCC. 6 1.1 mrg 7 1.1 mrg GCC is free software; you can redistribute it and/or modify it 8 1.1 mrg under the terms of the GNU General Public License as published by the 9 1.1 mrg Free Software Foundation; either version 3, or (at your option) any 10 1.1 mrg later version. 11 1.1 mrg 12 1.1 mrg GCC is distributed in the hope that it will be useful, but WITHOUT 13 1.1 mrg ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 1.1 mrg FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15 1.1 mrg for more details. 16 1.1 mrg 17 1.1 mrg You should have received a copy of the GNU General Public License 18 1.1 mrg along with GCC; see the file COPYING3. If not see 19 1.1 mrg <http://www.gnu.org/licenses/>. */ 20 1.1 mrg 21 1.1 mrg #include "config.h" 22 1.1 mrg #include "system.h" 23 1.1 mrg #include "coretypes.h" 24 1.1 mrg #include "backend.h" 25 1.1 mrg #include "is-a.h" 26 1.1 mrg #include "tree.h" 27 1.1 mrg #include "gimple.h" 28 1.1 mrg #include "tree-pass.h" 29 1.1 mrg #include "ssa.h" 30 1.1 mrg #include "gimple-pretty-print.h" 31 1.1 mrg #include "fold-const.h" 32 1.1 mrg #include "gimplify.h" 33 1.1 mrg #include "gimple-iterator.h" 34 1.1 mrg #include "gimplify-me.h" 35 1.1 mrg #include "cfgloop.h" 36 1.1 mrg #include "tree-ssa.h" 37 1.1 mrg #include "tree-scalar-evolution.h" 38 1.1 mrg #include "tree-ssa-loop-manip.h" 39 1.1 mrg #include "tree-ssa-loop-niter.h" 40 1.1 mrg #include "tree-ssa-loop-ivopts.h" 41 1.1 mrg #include "tree-ssa-dce.h" 42 1.1 mrg #include "tree-data-ref.h" 43 1.1 mrg #include "tree-vectorizer.h" 44 1.1 mrg 45 1.1 mrg /* This pass performs loop interchange: for example, the loop nest 46 1.1 mrg 47 1.1 mrg for (int j = 0; j < N; j++) 48 1.1 mrg for (int k = 0; k < N; k++) 49 1.1 mrg for (int i = 0; i < N; i++) 50 1.1 mrg c[i][j] = 
c[i][j] + a[i][k]*b[k][j]; 51 1.1 mrg 52 1.1 mrg is transformed to 53 1.1 mrg 54 1.1 mrg for (int i = 0; i < N; i++) 55 1.1 mrg for (int j = 0; j < N; j++) 56 1.1 mrg for (int k = 0; k < N; k++) 57 1.1 mrg c[i][j] = c[i][j] + a[i][k]*b[k][j]; 58 1.1 mrg 59 1.1 mrg This pass implements loop interchange in the following steps: 60 1.1 mrg 61 1.1 mrg 1) Find perfect loop nest for each innermost loop and compute data 62 1.1 mrg dependence relations for it. For above example, loop nest is 63 1.1 mrg <loop_j, loop_k, loop_i>. 64 1.1 mrg 2) From innermost to outermost loop, this pass tries to interchange 65 1.1 mrg each loop pair. For above case, it firstly tries to interchange 66 1.1 mrg <loop_k, loop_i> and loop nest becomes <loop_j, loop_i, loop_k>. 67 1.1 mrg Then it tries to interchange <loop_j, loop_i> and loop nest becomes 68 1.1 mrg <loop_i, loop_j, loop_k>. The overall effect is to move innermost 69 1.1 mrg loop to the outermost position. For loop pair <loop_i, loop_j> 70 1.1 mrg to be interchanged, we: 71 1.1 mrg 3) Check if data dependence relations are valid for loop interchange. 72 1.1 mrg 4) Check if both loops can be interchanged in terms of transformation. 73 1.1 mrg 5) Check if interchanging the two loops is profitable. 74 1.1 mrg 6) Interchange the two loops by mapping induction variables. 75 1.1 mrg 76 1.1 mrg This pass also handles reductions in loop nest. So far we only support 77 1.1 mrg simple reduction of inner loop and double reduction of the loop nest. */ 78 1.1 mrg 79 1.1 mrg /* Maximum number of stmts in each loop that should be interchanged. */ 80 1.1.1.3 mrg #define MAX_NUM_STMT (param_loop_interchange_max_num_stmts) 81 1.1 mrg /* Maximum number of data references in loop nest. */ 82 1.1.1.3 mrg #define MAX_DATAREFS (param_loop_max_datarefs_for_datadeps) 83 1.1 mrg 84 1.1 mrg /* Comparison ratio of access stride between inner/outer loops to be 85 1.1 mrg interchanged. 
This is the minimum stride ratio for loop interchange 86 1.1 mrg to be profitable. */ 87 1.1.1.3 mrg #define OUTER_STRIDE_RATIO (param_loop_interchange_stride_ratio) 88 1.1 mrg /* The same as above, but we require higher ratio for interchanging the 89 1.1 mrg innermost two loops. */ 90 1.1 mrg #define INNER_STRIDE_RATIO ((OUTER_STRIDE_RATIO) + 1) 91 1.1 mrg 92 1.1 mrg /* Comparison ratio of stmt cost between inner/outer loops. Loops won't 93 1.1 mrg be interchanged if outer loop has too many stmts. */ 94 1.1 mrg #define STMT_COST_RATIO (3) 95 1.1 mrg 96 1.1 mrg /* Vector of strides that DR accesses in each level loop of a loop nest. */ 97 1.1 mrg #define DR_ACCESS_STRIDE(dr) ((vec<tree> *) dr->aux) 98 1.1 mrg 99 1.1 mrg /* Structure recording loop induction variable. */ 100 1.1 mrg typedef struct induction 101 1.1 mrg { 102 1.1 mrg /* IV itself. */ 103 1.1 mrg tree var; 104 1.1 mrg /* IV's initializing value, which is the init arg of the IV PHI node. */ 105 1.1 mrg tree init_val; 106 1.1 mrg /* IV's initializing expr, which is (the expanded result of) init_val. */ 107 1.1 mrg tree init_expr; 108 1.1 mrg /* IV's step. */ 109 1.1 mrg tree step; 110 1.1 mrg } *induction_p; 111 1.1 mrg 112 1.1 mrg /* Enum type for loop reduction variable. */ 113 1.1 mrg enum reduction_type 114 1.1 mrg { 115 1.1 mrg UNKNOWN_RTYPE = 0, 116 1.1 mrg SIMPLE_RTYPE, 117 1.1 mrg DOUBLE_RTYPE 118 1.1 mrg }; 119 1.1 mrg 120 1.1 mrg /* Structure recording loop reduction variable. */ 121 1.1 mrg typedef struct reduction 122 1.1 mrg { 123 1.1 mrg /* Reduction itself. */ 124 1.1 mrg tree var; 125 1.1 mrg /* PHI node defining reduction variable. */ 126 1.1 mrg gphi *phi; 127 1.1 mrg /* Init and next variables of the reduction. */ 128 1.1 mrg tree init; 129 1.1 mrg tree next; 130 1.1 mrg /* Lcssa PHI node if reduction is used outside of its definition loop. */ 131 1.1 mrg gphi *lcssa_phi; 132 1.1 mrg /* Stmts defining init and next. 
*/ 133 1.1 mrg gimple *producer; 134 1.1 mrg gimple *consumer; 135 1.1 mrg /* If init is loaded from memory, this is the loading memory reference. */ 136 1.1 mrg tree init_ref; 137 1.1 mrg /* If reduction is finally stored to memory, this is the stored memory 138 1.1 mrg reference. */ 139 1.1 mrg tree fini_ref; 140 1.1 mrg enum reduction_type type; 141 1.1 mrg } *reduction_p; 142 1.1 mrg 143 1.1 mrg 144 1.1 mrg /* Dump reduction RE. */ 145 1.1 mrg 146 1.1 mrg static void 147 1.1 mrg dump_reduction (reduction_p re) 148 1.1 mrg { 149 1.1 mrg if (re->type == SIMPLE_RTYPE) 150 1.1 mrg fprintf (dump_file, " Simple reduction: "); 151 1.1 mrg else if (re->type == DOUBLE_RTYPE) 152 1.1 mrg fprintf (dump_file, " Double reduction: "); 153 1.1 mrg else 154 1.1 mrg fprintf (dump_file, " Unknown reduction: "); 155 1.1 mrg 156 1.1 mrg print_gimple_stmt (dump_file, re->phi, 0); 157 1.1 mrg } 158 1.1 mrg 159 1.1 mrg /* Dump LOOP's induction IV. */ 160 1.1 mrg static void 161 1.1.1.3 mrg dump_induction (class loop *loop, induction_p iv) 162 1.1 mrg { 163 1.1 mrg fprintf (dump_file, " Induction: "); 164 1.1 mrg print_generic_expr (dump_file, iv->var, TDF_SLIM); 165 1.1 mrg fprintf (dump_file, " = {"); 166 1.1 mrg print_generic_expr (dump_file, iv->init_expr, TDF_SLIM); 167 1.1 mrg fprintf (dump_file, ", "); 168 1.1 mrg print_generic_expr (dump_file, iv->step, TDF_SLIM); 169 1.1 mrg fprintf (dump_file, "}_%d\n", loop->num); 170 1.1 mrg } 171 1.1 mrg 172 1.1 mrg /* Loop candidate for interchange. 
*/ 173 1.1 mrg 174 1.1.1.3 mrg class loop_cand 175 1.1 mrg { 176 1.1.1.3 mrg public: 177 1.1.1.3 mrg loop_cand (class loop *, class loop *); 178 1.1 mrg ~loop_cand (); 179 1.1 mrg 180 1.1 mrg reduction_p find_reduction_by_stmt (gimple *); 181 1.1 mrg void classify_simple_reduction (reduction_p); 182 1.1 mrg bool analyze_iloop_reduction_var (tree); 183 1.1 mrg bool analyze_oloop_reduction_var (loop_cand *, tree); 184 1.1 mrg bool analyze_induction_var (tree, tree); 185 1.1 mrg bool analyze_carried_vars (loop_cand *); 186 1.1 mrg bool analyze_lcssa_phis (void); 187 1.1 mrg bool can_interchange_p (loop_cand *); 188 1.1 mrg void undo_simple_reduction (reduction_p, bitmap); 189 1.1 mrg 190 1.1 mrg /* The loop itself. */ 191 1.1.1.3 mrg class loop *m_loop; 192 1.1 mrg /* The outer loop for interchange. It equals to loop if this loop cand 193 1.1 mrg itself represents the outer loop. */ 194 1.1.1.3 mrg class loop *m_outer; 195 1.1 mrg /* Vector of induction variables in loop. */ 196 1.1 mrg vec<induction_p> m_inductions; 197 1.1 mrg /* Vector of reduction variables in loop. */ 198 1.1 mrg vec<reduction_p> m_reductions; 199 1.1 mrg /* Lcssa PHI nodes of this loop. */ 200 1.1 mrg vec<gphi *> m_lcssa_nodes; 201 1.1 mrg /* Single exit edge of this loop. */ 202 1.1 mrg edge m_exit; 203 1.1 mrg /* Basic blocks of this loop. */ 204 1.1 mrg basic_block *m_bbs; 205 1.1 mrg /* Number of stmts of this loop. Inner loops' stmts are not included. */ 206 1.1 mrg int m_num_stmts; 207 1.1 mrg /* Number of constant initialized simple reduction. */ 208 1.1 mrg int m_const_init_reduc; 209 1.1 mrg }; 210 1.1 mrg 211 1.1 mrg /* Constructor. 
*/ 212 1.1 mrg 213 1.1.1.3 mrg loop_cand::loop_cand (class loop *loop, class loop *outer) 214 1.1 mrg : m_loop (loop), m_outer (outer), m_exit (single_exit (loop)), 215 1.1 mrg m_bbs (get_loop_body (loop)), m_num_stmts (0), m_const_init_reduc (0) 216 1.1 mrg { 217 1.1 mrg m_inductions.create (3); 218 1.1 mrg m_reductions.create (3); 219 1.1 mrg m_lcssa_nodes.create (3); 220 1.1 mrg } 221 1.1 mrg 222 1.1 mrg /* Destructor. */ 223 1.1 mrg 224 1.1 mrg loop_cand::~loop_cand () 225 1.1 mrg { 226 1.1 mrg induction_p iv; 227 1.1 mrg for (unsigned i = 0; m_inductions.iterate (i, &iv); ++i) 228 1.1 mrg free (iv); 229 1.1 mrg 230 1.1 mrg reduction_p re; 231 1.1 mrg for (unsigned i = 0; m_reductions.iterate (i, &re); ++i) 232 1.1 mrg free (re); 233 1.1 mrg 234 1.1 mrg m_inductions.release (); 235 1.1 mrg m_reductions.release (); 236 1.1 mrg m_lcssa_nodes.release (); 237 1.1 mrg free (m_bbs); 238 1.1 mrg } 239 1.1 mrg 240 1.1 mrg /* Return single use stmt of VAR in LOOP, otherwise return NULL. */ 241 1.1 mrg 242 1.1 mrg static gimple * 243 1.1.1.3 mrg single_use_in_loop (tree var, class loop *loop) 244 1.1 mrg { 245 1.1 mrg gimple *stmt, *res = NULL; 246 1.1 mrg use_operand_p use_p; 247 1.1 mrg imm_use_iterator iterator; 248 1.1 mrg 249 1.1 mrg FOR_EACH_IMM_USE_FAST (use_p, iterator, var) 250 1.1 mrg { 251 1.1 mrg stmt = USE_STMT (use_p); 252 1.1 mrg if (is_gimple_debug (stmt)) 253 1.1 mrg continue; 254 1.1 mrg 255 1.1 mrg if (!flow_bb_inside_loop_p (loop, gimple_bb (stmt))) 256 1.1 mrg continue; 257 1.1 mrg 258 1.1 mrg if (res) 259 1.1 mrg return NULL; 260 1.1 mrg 261 1.1 mrg res = stmt; 262 1.1 mrg } 263 1.1 mrg return res; 264 1.1 mrg } 265 1.1 mrg 266 1.1 mrg /* Return true if E is unsupported in loop interchange, i.e, E is a complex 267 1.1 mrg edge or part of irreducible loop. 
*/ 268 1.1 mrg 269 1.1 mrg static inline bool 270 1.1 mrg unsupported_edge (edge e) 271 1.1 mrg { 272 1.1 mrg return (e->flags & (EDGE_COMPLEX | EDGE_IRREDUCIBLE_LOOP)); 273 1.1 mrg } 274 1.1 mrg 275 1.1 mrg /* Return the reduction if STMT is one of its lcssa PHI, producer or consumer 276 1.1 mrg stmt. */ 277 1.1 mrg 278 1.1 mrg reduction_p 279 1.1 mrg loop_cand::find_reduction_by_stmt (gimple *stmt) 280 1.1 mrg { 281 1.1 mrg gphi *phi = dyn_cast <gphi *> (stmt); 282 1.1 mrg reduction_p re; 283 1.1 mrg 284 1.1 mrg for (unsigned i = 0; m_reductions.iterate (i, &re); ++i) 285 1.1 mrg if ((phi != NULL && phi == re->lcssa_phi) 286 1.1 mrg || (stmt == re->producer || stmt == re->consumer)) 287 1.1 mrg return re; 288 1.1 mrg 289 1.1 mrg return NULL; 290 1.1 mrg } 291 1.1 mrg 292 1.1 mrg /* Return true if current loop_cand be interchanged. ILOOP is not NULL if 293 1.1 mrg current loop_cand is outer loop in loop nest. */ 294 1.1 mrg 295 1.1 mrg bool 296 1.1 mrg loop_cand::can_interchange_p (loop_cand *iloop) 297 1.1 mrg { 298 1.1 mrg /* For now we only support at most one reduction. */ 299 1.1 mrg unsigned allowed_reduction_num = 1; 300 1.1 mrg 301 1.1 mrg /* Only support reduction if the loop nest to be interchanged is the 302 1.1 mrg innermostin two loops. */ 303 1.1 mrg if ((iloop == NULL && m_loop->inner != NULL) 304 1.1 mrg || (iloop != NULL && iloop->m_loop->inner != NULL)) 305 1.1 mrg allowed_reduction_num = 0; 306 1.1 mrg 307 1.1 mrg if (m_reductions.length () > allowed_reduction_num 308 1.1 mrg || (m_reductions.length () == 1 309 1.1 mrg && m_reductions[0]->type == UNKNOWN_RTYPE)) 310 1.1 mrg return false; 311 1.1 mrg 312 1.1 mrg /* Only support lcssa PHI node which is for reduction. */ 313 1.1 mrg if (m_lcssa_nodes.length () > allowed_reduction_num) 314 1.1 mrg return false; 315 1.1 mrg 316 1.1 mrg /* Check if basic block has any unsupported operation. Note basic blocks 317 1.1 mrg of inner loops are not checked here. 
*/ 318 1.1 mrg for (unsigned i = 0; i < m_loop->num_nodes; i++) 319 1.1 mrg { 320 1.1 mrg basic_block bb = m_bbs[i]; 321 1.1 mrg gphi_iterator psi; 322 1.1 mrg gimple_stmt_iterator gsi; 323 1.1 mrg 324 1.1 mrg /* Skip basic blocks of inner loops. */ 325 1.1 mrg if (bb->loop_father != m_loop) 326 1.1 mrg continue; 327 1.1 mrg 328 1.1 mrg for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) 329 1.1 mrg { 330 1.1 mrg gimple *stmt = gsi_stmt (gsi); 331 1.1 mrg if (is_gimple_debug (stmt)) 332 1.1 mrg continue; 333 1.1 mrg 334 1.1 mrg if (gimple_has_side_effects (stmt)) 335 1.1 mrg return false; 336 1.1 mrg 337 1.1 mrg m_num_stmts++; 338 1.1 mrg if (gcall *call = dyn_cast <gcall *> (stmt)) 339 1.1 mrg { 340 1.1 mrg /* In basic block of outer loop, the call should be cheap since 341 1.1 mrg it will be moved to inner loop. */ 342 1.1 mrg if (iloop != NULL 343 1.1 mrg && !gimple_inexpensive_call_p (call)) 344 1.1 mrg return false; 345 1.1 mrg continue; 346 1.1 mrg } 347 1.1 mrg 348 1.1 mrg if (!iloop || !gimple_vuse (stmt)) 349 1.1 mrg continue; 350 1.1 mrg 351 1.1 mrg /* Support stmt accessing memory in outer loop only if it is for 352 1.1 mrg inner loop's reduction. */ 353 1.1 mrg if (iloop->find_reduction_by_stmt (stmt)) 354 1.1 mrg continue; 355 1.1 mrg 356 1.1 mrg tree lhs; 357 1.1 mrg /* Support loop invariant memory reference if it's only used once by 358 1.1 mrg inner loop. */ 359 1.1 mrg /* ??? How's this checking for invariantness? */ 360 1.1 mrg if (gimple_assign_single_p (stmt) 361 1.1 mrg && (lhs = gimple_assign_lhs (stmt)) != NULL_TREE 362 1.1 mrg && TREE_CODE (lhs) == SSA_NAME 363 1.1 mrg && single_use_in_loop (lhs, iloop->m_loop)) 364 1.1 mrg continue; 365 1.1 mrg 366 1.1 mrg return false; 367 1.1 mrg } 368 1.1 mrg /* Check if loop has too many stmts. 
*/ 369 1.1 mrg if (m_num_stmts > MAX_NUM_STMT) 370 1.1 mrg return false; 371 1.1 mrg 372 1.1 mrg /* Allow PHI nodes in any basic block of inner loop, PHI nodes in outer 373 1.1 mrg loop's header, or PHI nodes in dest bb of inner loop's exit edge. */ 374 1.1 mrg if (!iloop || bb == m_loop->header 375 1.1 mrg || bb == iloop->m_exit->dest) 376 1.1 mrg continue; 377 1.1 mrg 378 1.1 mrg /* Don't allow any other PHI nodes. */ 379 1.1 mrg for (psi = gsi_start_phis (bb); !gsi_end_p (psi); gsi_next (&psi)) 380 1.1 mrg if (!virtual_operand_p (PHI_RESULT (psi.phi ()))) 381 1.1 mrg return false; 382 1.1 mrg } 383 1.1 mrg 384 1.1 mrg return true; 385 1.1 mrg } 386 1.1 mrg 387 1.1 mrg /* Programmers and optimizers (like loop store motion) may optimize code: 388 1.1 mrg 389 1.1 mrg for (int i = 0; i < N; i++) 390 1.1 mrg for (int j = 0; j < N; j++) 391 1.1 mrg a[i] += b[j][i] * c[j][i]; 392 1.1 mrg 393 1.1 mrg into reduction: 394 1.1 mrg 395 1.1 mrg for (int i = 0; i < N; i++) 396 1.1 mrg { 397 1.1 mrg // producer. Note sum can be intitialized to a constant. 398 1.1 mrg int sum = a[i]; 399 1.1 mrg for (int j = 0; j < N; j++) 400 1.1 mrg { 401 1.1 mrg sum += b[j][i] * c[j][i]; 402 1.1 mrg } 403 1.1 mrg // consumer. 404 1.1 mrg a[i] = sum; 405 1.1 mrg } 406 1.1 mrg 407 1.1 mrg The result code can't be interchanged without undoing the optimization. 408 1.1 mrg This function classifies this kind reduction and records information so 409 1.1 mrg that we can undo the store motion during interchange. */ 410 1.1 mrg 411 1.1 mrg void 412 1.1 mrg loop_cand::classify_simple_reduction (reduction_p re) 413 1.1 mrg { 414 1.1 mrg gimple *producer, *consumer; 415 1.1 mrg 416 1.1 mrg /* Check init variable of reduction and how it is initialized. 
*/ 417 1.1 mrg if (TREE_CODE (re->init) == SSA_NAME) 418 1.1 mrg { 419 1.1 mrg producer = SSA_NAME_DEF_STMT (re->init); 420 1.1 mrg re->producer = producer; 421 1.1 mrg basic_block bb = gimple_bb (producer); 422 1.1 mrg if (!bb || bb->loop_father != m_outer) 423 1.1 mrg return; 424 1.1 mrg 425 1.1 mrg if (!gimple_assign_load_p (producer)) 426 1.1 mrg return; 427 1.1 mrg 428 1.1 mrg re->init_ref = gimple_assign_rhs1 (producer); 429 1.1 mrg } 430 1.1 mrg else if (CONSTANT_CLASS_P (re->init)) 431 1.1 mrg m_const_init_reduc++; 432 1.1 mrg else 433 1.1 mrg return; 434 1.1 mrg 435 1.1 mrg /* Check how reduction variable is used. */ 436 1.1 mrg consumer = single_use_in_loop (PHI_RESULT (re->lcssa_phi), m_outer); 437 1.1 mrg if (!consumer 438 1.1 mrg || !gimple_store_p (consumer)) 439 1.1 mrg return; 440 1.1 mrg 441 1.1 mrg re->fini_ref = gimple_get_lhs (consumer); 442 1.1 mrg re->consumer = consumer; 443 1.1 mrg 444 1.1 mrg /* Simple reduction with constant initializer. */ 445 1.1 mrg if (!re->init_ref) 446 1.1 mrg { 447 1.1 mrg gcc_assert (CONSTANT_CLASS_P (re->init)); 448 1.1 mrg re->init_ref = unshare_expr (re->fini_ref); 449 1.1 mrg } 450 1.1 mrg 451 1.1 mrg /* Require memory references in producer and consumer are the same so 452 1.1 mrg that we can undo reduction during interchange. */ 453 1.1 mrg if (re->init_ref && !operand_equal_p (re->init_ref, re->fini_ref, 0)) 454 1.1 mrg return; 455 1.1 mrg 456 1.1 mrg re->type = SIMPLE_RTYPE; 457 1.1 mrg } 458 1.1 mrg 459 1.1 mrg /* Analyze reduction variable VAR for inner loop of the loop nest to be 460 1.1 mrg interchanged. Return true if analysis succeeds. 
*/

bool
loop_cand::analyze_iloop_reduction_var (tree var)
{
  gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (var));
  gphi *lcssa_phi = NULL, *use_phi;
  tree init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (m_loop));
  tree next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (m_loop));
  reduction_p re;
  gimple *stmt, *next_def, *single_use = NULL;
  use_operand_p use_p;
  imm_use_iterator iterator;

  if (TREE_CODE (next) != SSA_NAME)
    return false;

  next_def = SSA_NAME_DEF_STMT (next);
  basic_block bb = gimple_bb (next_def);
  if (!bb || !flow_bb_inside_loop_p (m_loop, bb))
    return false;

  /* In restricted reduction, the var is (and must be) used in defining
     the updated var.  The process can be depicted as below:

		  var ;; = PHI<init, next>
		   |
		   |
		   v
      +---------------------+
      | reduction operators | <-- other operands
      +---------------------+
		   |
		   |
		   v
		  next

     In terms of loop interchange, we don't change how NEXT is computed based
     on VAR and OTHER OPERANDS.  In case of double reduction in loop nest
     to be interchanged, we don't change it at all.  In the case of simple
     reduction in inner loop, we only make change how VAR/NEXT is loaded or
     stored.  With these conditions, we can relax restrictions on reduction
     in a way that reduction operation is seen as black box.  In general,
     we can ignore reassociation of reduction operator; we can handle fake
     reductions in which VAR is not even used to compute NEXT.  */
  if (! single_imm_use (var, &use_p, &single_use)
      || ! flow_bb_inside_loop_p (m_loop, gimple_bb (single_use)))
    return false;

  /* Check the reduction operation.  We require a left-associative operation.
     For FP math we also need to be allowed to associate operations.  */
  if (gassign *ass = dyn_cast <gassign *> (single_use))
    {
      enum tree_code code = gimple_assign_rhs_code (ass);
      if (! (associative_tree_code (code)
	     || (code == MINUS_EXPR
		 && use_p->use == gimple_assign_rhs1_ptr (ass)))
	  || (FLOAT_TYPE_P (TREE_TYPE (var))
	      && ! flag_associative_math))
	return false;
    }
  else
    return false;

  /* Handle and verify a series of stmts feeding the reduction op.  */
  if (single_use != next_def
      && !check_reduction_path (dump_user_location_t (), m_loop, phi, next,
				gimple_assign_rhs_code (single_use)))
    return false;

  /* Only support cases in which INIT is used in inner loop.  */
  if (TREE_CODE (init) == SSA_NAME)
    FOR_EACH_IMM_USE_FAST (use_p, iterator, init)
      {
	stmt = USE_STMT (use_p);
	if (is_gimple_debug (stmt))
	  continue;

	if (!flow_bb_inside_loop_p (m_loop, gimple_bb (stmt)))
	  return false;
      }

  FOR_EACH_IMM_USE_FAST (use_p, iterator, next)
    {
      stmt = USE_STMT (use_p);
      if (is_gimple_debug (stmt))
	continue;

      /* Or else it's used in PHI itself.  */
      use_phi = dyn_cast <gphi *> (stmt);
      if (use_phi == phi)
	continue;

      /* The only other use we accept is the single lcssa PHI in the
	 destination of the exit edge.  */
      if (use_phi != NULL
	  && lcssa_phi == NULL
	  && gimple_bb (stmt) == m_exit->dest
	  && PHI_ARG_DEF_FROM_EDGE (use_phi, m_exit) == next)
	lcssa_phi = use_phi;
      else
	return false;
    }
  if (!lcssa_phi)
    return false;

  re = XCNEW (struct reduction);
  re->var = var;
  re->init = init;
  re->next = next;
  re->phi = phi;
  re->lcssa_phi = lcssa_phi;

  classify_simple_reduction (re);

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_reduction (re);

  m_reductions.safe_push (re);
  return true;
}

/* Analyze reduction variable VAR for outer loop of the loop nest to be
   interchanged.  ILOOP is not NULL and points to inner loop.  For the
   moment, we only support double reduction for outer loop, like:

     for (int i = 0; i < n; i++)
       {
	 int sum = 0;

	 for (int j = 0; j < n; j++)	// outer loop
	   for (int k = 0; k < n; k++)	// inner loop
	     sum += a[i][k]*b[k][j];

	 s[i] = sum;
       }

   Note the innermost two loops are the loop nest to be interchanged.
   Return true if analysis succeeds.
*/ 597 1.1 mrg 598 1.1 mrg bool 599 1.1 mrg loop_cand::analyze_oloop_reduction_var (loop_cand *iloop, tree var) 600 1.1 mrg { 601 1.1 mrg gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (var)); 602 1.1 mrg gphi *lcssa_phi = NULL, *use_phi; 603 1.1 mrg tree init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (m_loop)); 604 1.1 mrg tree next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (m_loop)); 605 1.1 mrg reduction_p re; 606 1.1 mrg gimple *stmt, *next_def; 607 1.1 mrg use_operand_p use_p; 608 1.1 mrg imm_use_iterator iterator; 609 1.1 mrg 610 1.1 mrg if (TREE_CODE (next) != SSA_NAME) 611 1.1 mrg return false; 612 1.1 mrg 613 1.1 mrg next_def = SSA_NAME_DEF_STMT (next); 614 1.1 mrg basic_block bb = gimple_bb (next_def); 615 1.1 mrg if (!bb || !flow_bb_inside_loop_p (m_loop, bb)) 616 1.1 mrg return false; 617 1.1 mrg 618 1.1 mrg /* Find inner loop's simple reduction that uses var as initializer. */ 619 1.1 mrg reduction_p inner_re = NULL; 620 1.1 mrg for (unsigned i = 0; iloop->m_reductions.iterate (i, &inner_re); ++i) 621 1.1 mrg if (inner_re->init == var || operand_equal_p (inner_re->init, var, 0)) 622 1.1 mrg break; 623 1.1 mrg 624 1.1 mrg if (inner_re == NULL 625 1.1 mrg || inner_re->type != UNKNOWN_RTYPE 626 1.1 mrg || inner_re->producer != phi) 627 1.1 mrg return false; 628 1.1 mrg 629 1.1 mrg /* In case of double reduction, outer loop's reduction should be updated 630 1.1 mrg by inner loop's simple reduction. */ 631 1.1 mrg if (next_def != inner_re->lcssa_phi) 632 1.1 mrg return false; 633 1.1 mrg 634 1.1 mrg /* Outer loop's reduction should only be used to initialize inner loop's 635 1.1 mrg simple reduction. */ 636 1.1 mrg if (! single_imm_use (var, &use_p, &stmt) 637 1.1 mrg || stmt != inner_re->phi) 638 1.1 mrg return false; 639 1.1 mrg 640 1.1 mrg /* Check this reduction is correctly used outside of loop via lcssa phi. 
*/ 641 1.1 mrg FOR_EACH_IMM_USE_FAST (use_p, iterator, next) 642 1.1 mrg { 643 1.1 mrg stmt = USE_STMT (use_p); 644 1.1 mrg if (is_gimple_debug (stmt)) 645 1.1 mrg continue; 646 1.1 mrg 647 1.1 mrg /* Or else it's used in PHI itself. */ 648 1.1 mrg use_phi = dyn_cast <gphi *> (stmt); 649 1.1 mrg if (use_phi == phi) 650 1.1 mrg continue; 651 1.1 mrg 652 1.1 mrg if (lcssa_phi == NULL 653 1.1 mrg && use_phi != NULL 654 1.1 mrg && gimple_bb (stmt) == m_exit->dest 655 1.1 mrg && PHI_ARG_DEF_FROM_EDGE (use_phi, m_exit) == next) 656 1.1 mrg lcssa_phi = use_phi; 657 1.1 mrg else 658 1.1 mrg return false; 659 1.1 mrg } 660 1.1 mrg if (!lcssa_phi) 661 1.1 mrg return false; 662 1.1 mrg 663 1.1 mrg re = XCNEW (struct reduction); 664 1.1 mrg re->var = var; 665 1.1 mrg re->init = init; 666 1.1 mrg re->next = next; 667 1.1 mrg re->phi = phi; 668 1.1 mrg re->lcssa_phi = lcssa_phi; 669 1.1 mrg re->type = DOUBLE_RTYPE; 670 1.1 mrg inner_re->type = DOUBLE_RTYPE; 671 1.1 mrg 672 1.1 mrg if (dump_file && (dump_flags & TDF_DETAILS)) 673 1.1 mrg dump_reduction (re); 674 1.1 mrg 675 1.1 mrg m_reductions.safe_push (re); 676 1.1 mrg return true; 677 1.1 mrg } 678 1.1 mrg 679 1.1 mrg /* Return true if VAR is induction variable of current loop whose scev is 680 1.1 mrg specified by CHREC. */ 681 1.1 mrg 682 1.1 mrg bool 683 1.1 mrg loop_cand::analyze_induction_var (tree var, tree chrec) 684 1.1 mrg { 685 1.1 mrg gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (var)); 686 1.1 mrg tree init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (m_loop)); 687 1.1 mrg 688 1.1 mrg /* Var is loop invariant, though it's unlikely to happen. */ 689 1.1 mrg if (tree_does_not_contain_chrecs (chrec)) 690 1.1 mrg { 691 1.1 mrg /* Punt on floating point invariants if honoring signed zeros, 692 1.1 mrg representing that as + 0.0 would change the result if init 693 1.1 mrg is -0.0. Similarly for SNaNs it can raise exception. 
*/ 694 1.1 mrg if (HONOR_SIGNED_ZEROS (chrec) || HONOR_SNANS (chrec)) 695 1.1 mrg return false; 696 1.1 mrg struct induction *iv = XCNEW (struct induction); 697 1.1 mrg iv->var = var; 698 1.1 mrg iv->init_val = init; 699 1.1 mrg iv->init_expr = chrec; 700 1.1 mrg iv->step = build_zero_cst (TREE_TYPE (chrec)); 701 1.1 mrg m_inductions.safe_push (iv); 702 1.1 mrg return true; 703 1.1 mrg } 704 1.1 mrg 705 1.1 mrg if (TREE_CODE (chrec) != POLYNOMIAL_CHREC 706 1.1 mrg || CHREC_VARIABLE (chrec) != (unsigned) m_loop->num 707 1.1 mrg || tree_contains_chrecs (CHREC_LEFT (chrec), NULL) 708 1.1 mrg || tree_contains_chrecs (CHREC_RIGHT (chrec), NULL)) 709 1.1 mrg return false; 710 1.1 mrg 711 1.1 mrg struct induction *iv = XCNEW (struct induction); 712 1.1 mrg iv->var = var; 713 1.1 mrg iv->init_val = init; 714 1.1 mrg iv->init_expr = CHREC_LEFT (chrec); 715 1.1 mrg iv->step = CHREC_RIGHT (chrec); 716 1.1 mrg 717 1.1 mrg if (dump_file && (dump_flags & TDF_DETAILS)) 718 1.1 mrg dump_induction (m_loop, iv); 719 1.1 mrg 720 1.1 mrg m_inductions.safe_push (iv); 721 1.1 mrg return true; 722 1.1 mrg } 723 1.1 mrg 724 1.1 mrg /* Return true if all loop carried variables defined in loop header can 725 1.1 mrg be successfully analyzed. 
*/ 726 1.1 mrg 727 1.1 mrg bool 728 1.1 mrg loop_cand::analyze_carried_vars (loop_cand *iloop) 729 1.1 mrg { 730 1.1 mrg edge e = loop_preheader_edge (m_outer); 731 1.1 mrg gphi_iterator gsi; 732 1.1 mrg 733 1.1 mrg if (dump_file && (dump_flags & TDF_DETAILS)) 734 1.1 mrg fprintf (dump_file, "\nLoop(%d) carried vars:\n", m_loop->num); 735 1.1 mrg 736 1.1 mrg for (gsi = gsi_start_phis (m_loop->header); !gsi_end_p (gsi); gsi_next (&gsi)) 737 1.1 mrg { 738 1.1 mrg gphi *phi = gsi.phi (); 739 1.1 mrg 740 1.1 mrg tree var = PHI_RESULT (phi); 741 1.1 mrg if (virtual_operand_p (var)) 742 1.1 mrg continue; 743 1.1 mrg 744 1.1 mrg tree chrec = analyze_scalar_evolution (m_loop, var); 745 1.1 mrg chrec = instantiate_scev (e, m_loop, chrec); 746 1.1 mrg 747 1.1 mrg /* Analyze var as reduction variable. */ 748 1.1 mrg if (chrec_contains_undetermined (chrec) 749 1.1 mrg || chrec_contains_symbols_defined_in_loop (chrec, m_outer->num)) 750 1.1 mrg { 751 1.1 mrg if (iloop && !analyze_oloop_reduction_var (iloop, var)) 752 1.1 mrg return false; 753 1.1 mrg if (!iloop && !analyze_iloop_reduction_var (var)) 754 1.1 mrg return false; 755 1.1 mrg } 756 1.1 mrg /* Analyze var as induction variable. */ 757 1.1 mrg else if (!analyze_induction_var (var, chrec)) 758 1.1 mrg return false; 759 1.1 mrg } 760 1.1 mrg 761 1.1 mrg return true; 762 1.1 mrg } 763 1.1 mrg 764 1.1 mrg /* Return TRUE if loop closed PHI nodes can be analyzed successfully. */ 765 1.1 mrg 766 1.1 mrg bool 767 1.1 mrg loop_cand::analyze_lcssa_phis (void) 768 1.1 mrg { 769 1.1 mrg gphi_iterator gsi; 770 1.1 mrg for (gsi = gsi_start_phis (m_exit->dest); !gsi_end_p (gsi); gsi_next (&gsi)) 771 1.1 mrg { 772 1.1 mrg gphi *phi = gsi.phi (); 773 1.1 mrg 774 1.1 mrg if (virtual_operand_p (PHI_RESULT (phi))) 775 1.1 mrg continue; 776 1.1 mrg 777 1.1 mrg /* TODO: We only support lcssa phi for reduction for now. 
*/ 778 1.1 mrg if (!find_reduction_by_stmt (phi)) 779 1.1 mrg return false; 780 1.1 mrg } 781 1.1 mrg 782 1.1 mrg return true; 783 1.1 mrg } 784 1.1 mrg 785 1.1 mrg /* CONSUMER is a stmt in BB storing reduction result into memory object. 786 1.1 mrg When the reduction is intialized from constant value, we need to add 787 1.1 mrg a stmt loading from the memory object to target basic block in inner 788 1.1 mrg loop during undoing the reduction. Problem is that memory reference 789 1.1 mrg may use ssa variables not dominating the target basic block. This 790 1.1 mrg function finds all stmts on which CONSUMER depends in basic block BB, 791 1.1 mrg records and returns them via STMTS. */ 792 1.1 mrg 793 1.1 mrg static void 794 1.1 mrg find_deps_in_bb_for_stmt (gimple_seq *stmts, basic_block bb, gimple *consumer) 795 1.1 mrg { 796 1.1 mrg auto_vec<gimple *, 4> worklist; 797 1.1 mrg use_operand_p use_p; 798 1.1 mrg ssa_op_iter iter; 799 1.1 mrg gimple *stmt, *def_stmt; 800 1.1 mrg gimple_stmt_iterator gsi; 801 1.1 mrg 802 1.1 mrg /* First clear flag for stmts in bb. */ 803 1.1 mrg for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) 804 1.1 mrg gimple_set_plf (gsi_stmt (gsi), GF_PLF_1, false); 805 1.1 mrg 806 1.1 mrg /* DFS search all depended stmts in bb and mark flag for these stmts. 
*/ 807 1.1 mrg worklist.safe_push (consumer); 808 1.1 mrg while (!worklist.is_empty ()) 809 1.1 mrg { 810 1.1 mrg stmt = worklist.pop (); 811 1.1 mrg FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE) 812 1.1 mrg { 813 1.1 mrg def_stmt = SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p)); 814 1.1 mrg 815 1.1 mrg if (is_a <gphi *> (def_stmt) 816 1.1 mrg || gimple_bb (def_stmt) != bb 817 1.1 mrg || gimple_plf (def_stmt, GF_PLF_1)) 818 1.1 mrg continue; 819 1.1 mrg 820 1.1 mrg worklist.safe_push (def_stmt); 821 1.1 mrg } 822 1.1 mrg gimple_set_plf (stmt, GF_PLF_1, true); 823 1.1 mrg } 824 1.1 mrg for (gsi = gsi_start_nondebug_bb (bb); 825 1.1 mrg !gsi_end_p (gsi) && (stmt = gsi_stmt (gsi)) != consumer;) 826 1.1 mrg { 827 1.1 mrg /* Move dep stmts to sequence STMTS. */ 828 1.1 mrg if (gimple_plf (stmt, GF_PLF_1)) 829 1.1 mrg { 830 1.1 mrg gsi_remove (&gsi, false); 831 1.1 mrg gimple_seq_add_stmt_without_update (stmts, stmt); 832 1.1 mrg } 833 1.1 mrg else 834 1.1 mrg gsi_next_nondebug (&gsi); 835 1.1 mrg } 836 1.1 mrg } 837 1.1 mrg 838 1.1 mrg /* User can write, optimizers can generate simple reduction RE for inner 839 1.1 mrg loop. In order to make interchange valid, we have to undo reduction by 840 1.1 mrg moving producer and consumer stmts into the inner loop. For example, 841 1.1 mrg below code: 842 1.1 mrg 843 1.1 mrg init = MEM_REF[idx]; //producer 844 1.1 mrg loop: 845 1.1 mrg var = phi<init, next> 846 1.1 mrg next = var op ... 847 1.1 mrg reduc_sum = phi<next> 848 1.1 mrg MEM_REF[idx] = reduc_sum //consumer 849 1.1 mrg 850 1.1 mrg is transformed into: 851 1.1 mrg 852 1.1 mrg loop: 853 1.1 mrg new_var = MEM_REF[idx]; //producer after moving 854 1.1 mrg next = new_var op ... 
       MEM_REF[idx] = next;		//consumer after moving

   Note if the reduction variable is initialized to constant, like:

     var = phi<0.0, next>

   we compute new_var as below:

     loop:
       tmp = MEM_REF[idx];
       new_var = !first_iteration ? tmp : 0.0;

   so that the initial const is used in the first iteration of loop.  Also
   record ssa variables for dead code elimination in DCE_SEEDS.  */

void
loop_cand::undo_simple_reduction (reduction_p re, bitmap dce_seeds)
{
  gimple *stmt;
  gimple_stmt_iterator from, to = gsi_after_labels (m_loop->header);
  gimple_seq stmts = NULL;
  tree new_var;

  /* Prepare the initialization stmts and insert it to inner loop.  */
  if (re->producer != NULL)
    {
      /* The load already exists: strip its virtual operand, detach it
	 from its current position and queue it for re-insertion in the
	 inner loop header.  */
      gimple_set_vuse (re->producer, NULL_TREE);
      update_stmt (re->producer);
      from = gsi_for_stmt (re->producer);
      gsi_remove (&from, false);
      gimple_seq_add_stmt_without_update (&stmts, re->producer);
      new_var = re->init;
    }
  else
    {
      /* Find all stmts on which expression "MEM_REF[idx]" depends.  */
      find_deps_in_bb_for_stmt (&stmts, gimple_bb (re->consumer), re->consumer);
      /* Because we generate new stmt loading from the MEM_REF to TMP.  */
      tree cond, tmp = copy_ssa_name (re->var);
      stmt = gimple_build_assign (tmp, re->init_ref);
      gimple_seq_add_stmt_without_update (&stmts, stmt);

      /* Init new_var to MEM_REF or CONST depending on if it is the first
	 iteration.  The first induction variable of this loop is used to
	 detect the first iteration (iv->var == iv->init_val).  */
      induction_p iv = m_inductions[0];
      cond = fold_build2 (NE_EXPR, boolean_type_node, iv->var, iv->init_val);
      new_var = copy_ssa_name (re->var);
      stmt = gimple_build_assign (new_var, COND_EXPR, cond, tmp, re->init);
      gimple_seq_add_stmt_without_update (&stmts, stmt);
    }
  gsi_insert_seq_before (&to, stmts, GSI_SAME_STMT);

  /* Replace all uses of reduction var with new variable.  */
  use_operand_p use_p;
  imm_use_iterator iterator;
  FOR_EACH_IMM_USE_STMT (stmt, iterator, re->var)
    {
      FOR_EACH_IMM_USE_ON_STMT (use_p, iterator)
	SET_USE (use_p, new_var);

      update_stmt (stmt);
    }

  /* Move consumer stmt into inner loop, just after reduction next's def.
     Virtual operands are dropped; the pass recomputes/updates virtual
     SSA form afterwards.  */
  unlink_stmt_vdef (re->consumer);
  release_ssa_name (gimple_vdef (re->consumer));
  gimple_set_vdef (re->consumer, NULL_TREE);
  gimple_set_vuse (re->consumer, NULL_TREE);
  gimple_assign_set_rhs1 (re->consumer, re->next);
  update_stmt (re->consumer);
  from = gsi_for_stmt (re->consumer);
  to = gsi_for_stmt (SSA_NAME_DEF_STMT (re->next));
  gsi_move_after (&from, &to);

  /* Mark the reduction variables for DCE.  */
  bitmap_set_bit (dce_seeds, SSA_NAME_VERSION (re->var));
  bitmap_set_bit (dce_seeds, SSA_NAME_VERSION (PHI_RESULT (re->lcssa_phi)));
}

/* Free DATAREFS and its auxiliary memory.
   */

static void
free_data_refs_with_aux (vec<data_reference_p> datarefs)
{
  data_reference_p dr;
  for (unsigned i = 0; datarefs.iterate (i, &dr); ++i)
    if (dr->aux != NULL)
      {
	/* dr->aux, when set, is the heap-allocated access-stride vector;
	   release its elements then the vector object itself.  */
	DR_ACCESS_STRIDE (dr)->release ();
	delete (vec<tree> *) dr->aux;
      }

  free_data_refs (datarefs);
}

/* Class for loop interchange transformation.  */

class tree_loop_interchange
{
public:
  tree_loop_interchange (vec<class loop *> loop_nest)
    : m_loop_nest (loop_nest), m_niters_iv_var (NULL_TREE),
      m_dce_seeds (BITMAP_ALLOC (NULL)) { }
  ~tree_loop_interchange () { BITMAP_FREE (m_dce_seeds); }
  bool interchange (vec<data_reference_p>, vec<ddr_p>);

private:
  void update_data_info (unsigned, unsigned, vec<data_reference_p>, vec<ddr_p>);
  bool valid_data_dependences (unsigned, unsigned, vec<ddr_p>);
  void interchange_loops (loop_cand &, loop_cand &);
  void map_inductions_to_loop (loop_cand &, loop_cand &);
  void move_code_to_inner_loop (class loop *, class loop *, basic_block *);

  /* The whole loop nest in which interchange is ongoing.  */
  vec<class loop *> m_loop_nest;
  /* We create new IV which is only used in loop's exit condition check.
     In case of 3-level loop nest interchange, when we interchange the
     innermost two loops, new IV created in the middle level loop does
     not need to be preserved in interchanging the outermost two loops
     later.  We record the IV so that it can be skipped.  */
  tree m_niters_iv_var;
  /* Bitmap of seed variables for dead code elimination after interchange.  */
  bitmap m_dce_seeds;
};

/* Update data refs' access stride and dependence information after loop
   interchange.  I_IDX/O_IDX gives indices of interchanged loops in loop
   nest.  DATAREFS are data references.  DDRS are data dependences.  */

void
tree_loop_interchange::update_data_info (unsigned i_idx, unsigned o_idx,
					 vec<data_reference_p> datarefs,
					 vec<ddr_p> ddrs)
{
  struct data_reference *dr;
  struct data_dependence_relation *ddr;

  /* Update strides of data references.  Swapping the stride entries at
     the interchanged levels keeps DR_ACCESS_STRIDE consistent with the
     new loop order.  */
  for (unsigned i = 0; datarefs.iterate (i, &dr); ++i)
    {
      vec<tree> *stride = DR_ACCESS_STRIDE (dr);
      gcc_assert (stride->length () > i_idx);
      std::swap ((*stride)[i_idx], (*stride)[o_idx]);
    }
  /* Update data dependences: interchange swaps the corresponding entries
     of every distance vector as well.  */
  for (unsigned i = 0; ddrs.iterate (i, &ddr); ++i)
    if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
      {
	for (unsigned j = 0; j < DDR_NUM_DIST_VECTS (ddr); ++j)
	  {
	    lambda_vector dist_vect = DDR_DIST_VECT (ddr, j);
	    std::swap (dist_vect[i_idx], dist_vect[o_idx]);
	  }
      }
}

/* Check data dependence relations, return TRUE if it's valid to interchange
   two loops specified by I_IDX/O_IDX.  Theoretically, interchanging the two
   loops is valid only if dist vector, after interchanging, doesn't have '>'
   as the leftmost non-'=' direction.  Practically, this function has been
   conservative here by not checking some valid cases.
   */

bool
tree_loop_interchange::valid_data_dependences (unsigned i_idx, unsigned o_idx,
					       vec<ddr_p> ddrs)
{
  struct data_dependence_relation *ddr;

  for (unsigned i = 0; ddrs.iterate (i, &ddr); ++i)
    {
      /* Skip no-dependence case.  */
      if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
	continue;

      for (unsigned j = 0; j < DDR_NUM_DIST_VECTS (ddr); ++j)
	{
	  lambda_vector dist_vect = DDR_DIST_VECT (ddr, j);
	  unsigned level = dependence_level (dist_vect, m_loop_nest.length ());

	  /* If there is no carried dependence.  */
	  if (level == 0)
	    continue;

	  /* dependence_level returns a 1-based level; convert to the
	     0-based loop-nest index.  */
	  level --;

	  /* If dependence is not carried by any loop in between the two
	     loops [oloop, iloop] to interchange.  */
	  if (level < o_idx || level > i_idx)
	    continue;

	  /* Be conservative, skip case if either direction at i_idx/o_idx
	     levels is not '=' or '<'.  DDR_REVERSED_P flips the sign
	     convention of the stored distances.  */
	  if ((!DDR_REVERSED_P (ddr) && dist_vect[i_idx] < 0)
	      || (DDR_REVERSED_P (ddr) && dist_vect[i_idx] > 0)
	      || (!DDR_REVERSED_P (ddr) && dist_vect[o_idx] < 0)
	      || (DDR_REVERSED_P (ddr) && dist_vect[o_idx] > 0))
	    return false;
	}
    }

  return true;
}

/* Interchange two loops specified by ILOOP and OLOOP.  */

void
tree_loop_interchange::interchange_loops (loop_cand &iloop, loop_cand &oloop)
{
  reduction_p re;
  gimple_stmt_iterator gsi;
  tree i_niters, o_niters, var_after;

  /* Undo inner loop's simple reduction.  */
  for (unsigned i = 0; iloop.m_reductions.iterate (i, &re); ++i)
    if (re->type != DOUBLE_RTYPE)
      {
	if (re->producer)
	  reset_debug_uses (re->producer);

	iloop.undo_simple_reduction (re, m_dce_seeds);
      }

  /* Only need to reset debug uses for double reduction.  */
  for (unsigned i = 0; oloop.m_reductions.iterate (i, &re); ++i)
    {
      gcc_assert (re->type == DOUBLE_RTYPE);
      reset_debug_uses (SSA_NAME_DEF_STMT (re->var));
      reset_debug_uses (SSA_NAME_DEF_STMT (re->next));
    }

  /* Prepare niters for both loops.  Both counts are instantiated at the
     preheader of the outermost loop of the nest so that the generated
     expressions are invariant in the whole nest.  */
  class loop *loop_nest = m_loop_nest[0];
  edge instantiate_below = loop_preheader_edge (loop_nest);
  gsi = gsi_last_bb (loop_preheader_edge (loop_nest)->src);
  i_niters = number_of_latch_executions (iloop.m_loop);
  i_niters = analyze_scalar_evolution (loop_outer (iloop.m_loop), i_niters);
  i_niters = instantiate_scev (instantiate_below, loop_outer (iloop.m_loop),
			       i_niters);
  i_niters = force_gimple_operand_gsi (&gsi, unshare_expr (i_niters), true,
				       NULL_TREE, false, GSI_CONTINUE_LINKING);
  o_niters = number_of_latch_executions (oloop.m_loop);
  if (oloop.m_loop != loop_nest)
    {
      o_niters = analyze_scalar_evolution (loop_outer (oloop.m_loop), o_niters);
      o_niters = instantiate_scev (instantiate_below, loop_outer (oloop.m_loop),
				   o_niters);
    }
  o_niters = force_gimple_operand_gsi (&gsi, unshare_expr (o_niters), true,
				       NULL_TREE, false, GSI_CONTINUE_LINKING);

  /* Move src's code to tgt loop.  This is necessary when src is the outer
     loop and tgt is the inner loop.  */
  move_code_to_inner_loop (oloop.m_loop, iloop.m_loop, oloop.m_bbs);

  /* Map outer loop's IV to inner loop, and vice versa.  */
  map_inductions_to_loop (oloop, iloop);
  map_inductions_to_loop (iloop, oloop);

  /* Create canonical IV for both loops.  Note canonical IV for outer/inner
     loop is actually from inner/outer loop.  Also we record the new IV
     created for the outer loop so that it can be skipped in later loop
     interchange.  */
  create_canonical_iv (oloop.m_loop, oloop.m_exit,
		       i_niters, &m_niters_iv_var, &var_after);
  bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_after));
  create_canonical_iv (iloop.m_loop, iloop.m_exit,
		       o_niters, NULL, &var_after);
  bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_after));

  /* Scrap niters estimation of interchanged loops.  */
  iloop.m_loop->any_upper_bound = false;
  iloop.m_loop->any_likely_upper_bound = false;
  free_numbers_of_iterations_estimates (iloop.m_loop);
  oloop.m_loop->any_upper_bound = false;
  oloop.m_loop->any_likely_upper_bound = false;
  free_numbers_of_iterations_estimates (oloop.m_loop);

  /* Clear all cached scev information.  This is expensive but shouldn't be
     a problem given we interchange in very limited times.  */
  scev_reset_htab ();

  /* ??? The association between the loop data structure and the
     CFG changed, so what was loop N at the source level is now
     loop M.  We should think of retaining the association or breaking
     it fully by creating a new loop instead of re-using the "wrong" one.  */
}

/* Map induction variables of SRC loop to TGT loop.
   The function firstly
   creates the same IV of SRC loop in TGT loop, then deletes the original
   IV and re-initialize it using the newly created IV.  For example, loop
   nest:

     for (i = 0; i < N; i++)
       for (j = 0; j < M; j++)
	 {
	   //use of i;
	   //use of j;
	 }

   will be transformed into:

     for (jj = 0; jj < M; jj++)
       for (ii = 0; ii < N; ii++)
	 {
	   //use of ii;
	   //use of jj;
	 }

   after loop interchange.  */

void
tree_loop_interchange::map_inductions_to_loop (loop_cand &src, loop_cand &tgt)
{
  induction_p iv;
  edge e = tgt.m_exit;
  gimple_stmt_iterator incr_pos = gsi_last_bb (e->src), gsi;

  /* Map source loop's IV to target loop.  */
  for (unsigned i = 0; src.m_inductions.iterate (i, &iv); ++i)
    {
      gimple *use_stmt, *stmt = SSA_NAME_DEF_STMT (iv->var);
      gcc_assert (is_a <gphi *> (stmt));

      use_operand_p use_p;
      /* Only map original IV to target loop.  The niters IV created by a
	 previous interchange (m_niters_iv_var) is skipped here.  */
      if (m_niters_iv_var != iv->var)
	{
	  /* Map the IV by creating the same one in target loop.  */
	  tree var_before, var_after;
	  tree base = unshare_expr (iv->init_expr);
	  tree step = unshare_expr (iv->step);
	  create_iv (base, step, SSA_NAME_VAR (iv->var),
		     tgt.m_loop, &incr_pos, false, &var_before, &var_after);
	  bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_before));
	  bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_after));

	  /* Replace uses of the original IV var with newly created IV var.  */
	  imm_use_iterator imm_iter;
	  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, iv->var)
	    {
	      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
		SET_USE (use_p, var_before);

	      update_stmt (use_stmt);
	    }
	}

      /* Mark all uses for DCE.  */
      ssa_op_iter op_iter;
      FOR_EACH_PHI_OR_STMT_USE (use_p, stmt, op_iter, SSA_OP_USE)
	{
	  tree use = USE_FROM_PTR (use_p);
	  if (TREE_CODE (use) == SSA_NAME
	      && ! SSA_NAME_IS_DEFAULT_DEF (use))
	    bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (use));
	}

      /* Delete definition of the original IV in the source loop.  */
      gsi = gsi_for_stmt (stmt);
      remove_phi_node (&gsi, true);
    }
}

/* Move stmts of outer loop to inner loop.  */

void
tree_loop_interchange::move_code_to_inner_loop (class loop *outer,
						class loop *inner,
						basic_block *outer_bbs)
{
  basic_block oloop_exit_bb = single_exit (outer)->src;
  gimple_stmt_iterator gsi, to;

  for (unsigned i = 0; i < outer->num_nodes; i++)
    {
      basic_block bb = outer_bbs[i];

      /* Skip basic blocks of inner loop.  */
      if (flow_bb_inside_loop_p (inner, bb))
	continue;

      /* Move code from header/latch to header/latch.  */
      if (bb == outer->header)
	to = gsi_after_labels (inner->header);
      else if (bb == outer->latch)
	to = gsi_after_labels (inner->latch);
      else
	/* Otherwise, simply move to exit->src.  */
	to = gsi_last_bb (single_exit (inner)->src);

      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
	{
	  gimple *stmt = gsi_stmt (gsi);

	  /* Keep the outer loop's exit condition in place.  */
	  if (oloop_exit_bb == bb
	      && stmt == gsi_stmt (gsi_last_bb (oloop_exit_bb)))
	    {
	      gsi_next (&gsi);
	      continue;
	    }

	  /* Strip virtual operands before moving; virtual SSA form is
	     repaired afterwards.  */
	  if (gimple_vdef (stmt))
	    {
	      unlink_stmt_vdef (stmt);
	      release_ssa_name (gimple_vdef (stmt));
	      gimple_set_vdef (stmt, NULL_TREE);
	    }
	  if (gimple_vuse (stmt))
	    {
	      gimple_set_vuse (stmt, NULL_TREE);
	      update_stmt (stmt);
	    }

	  reset_debug_uses (stmt);
	  gsi_move_before (&gsi, &to);
	}
    }
}

/* Given data reference DR in LOOP_NEST, the function computes DR's access
   stride at each level of loop from innermost LOOP to outer.  On success,
   it saves access stride at each level loop in a vector which is pointed
   by DR->aux.
   For example:

     int arr[100][100][100];
     for (i = 0; i < 100; i++)        ;(DR->aux)strides[0] = 40000
       for (j = 100; j > 0; j--)      ;(DR->aux)strides[1] = 400
	 for (k = 0; k < 100; k++)    ;(DR->aux)strides[2] = 4
	   arr[i][j - 1][k] = 0;  */

static void
compute_access_stride (class loop *&loop_nest, class loop *loop,
		       data_reference_p dr)
{
  /* dr->aux always gets a (possibly empty) stride vector; on any failure
     below we simply return with fewer strides recorded.  */
  vec<tree> *strides = new vec<tree> ();
  dr->aux = strides;

  basic_block bb = gimple_bb (DR_STMT (dr));
  if (!flow_bb_inside_loop_p (loop_nest, bb))
    return;
  /* A reference hoisted out of inner loops is invariant there: record
     zero strides for the levels between BB's loop and LOOP.  */
  while (!flow_bb_inside_loop_p (loop, bb))
    {
      strides->safe_push (build_int_cst (sizetype, 0));
      loop = loop_outer (loop);
    }
  gcc_assert (loop == bb->loop_father);

  tree ref = DR_REF (dr);
  if (TREE_CODE (ref) == COMPONENT_REF
      && DECL_BIT_FIELD (TREE_OPERAND (ref, 1)))
    {
      /* We can't take address of bitfields.  If the bitfield is at constant
	 offset from the start of the struct, just use address of the
	 struct, for analysis of the strides that shouldn't matter.  */
      if (!TREE_OPERAND (ref, 2)
	  || TREE_CODE (TREE_OPERAND (ref, 2)) == INTEGER_CST)
	ref = TREE_OPERAND (ref, 0);
      /* Otherwise, if we have a bit field representative, use that.  */
      else if (DECL_BIT_FIELD_REPRESENTATIVE (TREE_OPERAND (ref, 1))
	       != NULL_TREE)
	{
	  tree repr = DECL_BIT_FIELD_REPRESENTATIVE (TREE_OPERAND (ref, 1));
	  ref = build3 (COMPONENT_REF, TREE_TYPE (repr), TREE_OPERAND (ref, 0),
			repr, TREE_OPERAND (ref, 2));
	}
      /* Otherwise punt.  */
      else
	return;
    }
  tree scev_base = build_fold_addr_expr (ref);
  tree scev = analyze_scalar_evolution (loop, scev_base);
  if (chrec_contains_undetermined (scev))
    return;

  /* Instantiate the address evolution relative to LOOP_NEST; if that
     fails, retry with successively smaller (inner) nests, shrinking the
     caller's LOOP_NEST by reference.  */
  tree orig_scev = scev;
  do
    {
      scev = instantiate_scev (loop_preheader_edge (loop_nest),
			       loop, orig_scev);
      if (! chrec_contains_undetermined (scev))
	break;

      /* If we couldn't instantiate for the desired nest, shrink it.  */
      if (loop_nest == loop)
	return;
      loop_nest = loop_nest->inner;
    } while (1);

  /* Peel the polynomial chrec from innermost outwards, pushing each
     level's step as the stride and zero for levels the address is
     invariant in.  */
  tree sl = scev;
  class loop *expected = loop;
  while (TREE_CODE (sl) == POLYNOMIAL_CHREC)
    {
      class loop *sl_loop = get_chrec_loop (sl);
      while (sl_loop != expected)
	{
	  strides->safe_push (size_int (0));
	  expected = loop_outer (expected);
	}
      strides->safe_push (CHREC_RIGHT (sl));
      sl = CHREC_LEFT (sl);
      expected = loop_outer (expected);
    }
  /* If the remaining base contains no chrecs the address is invariant in
     all outer levels up to LOOP_NEST; fill those with zero strides.  */
  if (! tree_contains_chrecs (sl, NULL))
    while (expected != loop_outer (loop_nest))
      {
	strides->safe_push (size_int (0));
	expected = loop_outer (expected);
      }
}

/* Given loop nest LOOP_NEST with innermost LOOP, the function computes
   access strides with respect to each level loop for all data refs in
   DATAREFS from inner loop to outer loop.
On success, it returns the 1368 1.1 mrg outermost loop that access strides can be computed successfully for 1369 1.1 mrg all data references. If access strides cannot be computed at least 1370 1.1 mrg for two levels of loop for any data reference, it returns NULL. */ 1371 1.1 mrg 1372 1.1.1.3 mrg static class loop * 1373 1.1.1.3 mrg compute_access_strides (class loop *loop_nest, class loop *loop, 1374 1.1 mrg vec<data_reference_p> datarefs) 1375 1.1 mrg { 1376 1.1 mrg unsigned i, j, num_loops = (unsigned) -1; 1377 1.1 mrg data_reference_p dr; 1378 1.1 mrg vec<tree> *stride; 1379 1.1 mrg 1380 1.1.1.4 mrg class loop *interesting_loop_nest = loop_nest; 1381 1.1 mrg for (i = 0; datarefs.iterate (i, &dr); ++i) 1382 1.1 mrg { 1383 1.1.1.4 mrg compute_access_stride (interesting_loop_nest, loop, dr); 1384 1.1 mrg stride = DR_ACCESS_STRIDE (dr); 1385 1.1 mrg if (stride->length () < num_loops) 1386 1.1 mrg { 1387 1.1 mrg num_loops = stride->length (); 1388 1.1 mrg if (num_loops < 2) 1389 1.1 mrg return NULL; 1390 1.1 mrg } 1391 1.1 mrg } 1392 1.1 mrg 1393 1.1 mrg for (i = 0; datarefs.iterate (i, &dr); ++i) 1394 1.1 mrg { 1395 1.1 mrg stride = DR_ACCESS_STRIDE (dr); 1396 1.1 mrg if (stride->length () > num_loops) 1397 1.1 mrg stride->truncate (num_loops); 1398 1.1 mrg 1399 1.1 mrg for (j = 0; j < (num_loops >> 1); ++j) 1400 1.1 mrg std::swap ((*stride)[j], (*stride)[num_loops - j - 1]); 1401 1.1 mrg } 1402 1.1 mrg 1403 1.1 mrg loop = superloop_at_depth (loop, loop_depth (loop) + 1 - num_loops); 1404 1.1 mrg gcc_assert (loop_nest == loop || flow_loop_nested_p (loop_nest, loop)); 1405 1.1 mrg return loop; 1406 1.1 mrg } 1407 1.1 mrg 1408 1.1 mrg /* Prune access strides for data references in DATAREFS by removing strides 1409 1.1 mrg of loops that isn't in current LOOP_NEST. 
*/ 1410 1.1 mrg 1411 1.1 mrg static void 1412 1.1.1.3 mrg prune_access_strides_not_in_loop (class loop *loop_nest, 1413 1.1.1.3 mrg class loop *innermost, 1414 1.1 mrg vec<data_reference_p> datarefs) 1415 1.1 mrg { 1416 1.1 mrg data_reference_p dr; 1417 1.1 mrg unsigned num_loops = loop_depth (innermost) - loop_depth (loop_nest) + 1; 1418 1.1 mrg gcc_assert (num_loops > 1); 1419 1.1 mrg 1420 1.1 mrg /* Block remove strides of loops that is not in current loop nest. */ 1421 1.1 mrg for (unsigned i = 0; datarefs.iterate (i, &dr); ++i) 1422 1.1 mrg { 1423 1.1 mrg vec<tree> *stride = DR_ACCESS_STRIDE (dr); 1424 1.1 mrg if (stride->length () > num_loops) 1425 1.1 mrg stride->block_remove (0, stride->length () - num_loops); 1426 1.1 mrg } 1427 1.1 mrg } 1428 1.1 mrg 1429 1.1 mrg /* Dump access strides for all DATAREFS. */ 1430 1.1 mrg 1431 1.1 mrg static void 1432 1.1 mrg dump_access_strides (vec<data_reference_p> datarefs) 1433 1.1 mrg { 1434 1.1 mrg data_reference_p dr; 1435 1.1 mrg fprintf (dump_file, "Access Strides for DRs:\n"); 1436 1.1 mrg for (unsigned i = 0; datarefs.iterate (i, &dr); ++i) 1437 1.1 mrg { 1438 1.1 mrg fprintf (dump_file, " "); 1439 1.1 mrg print_generic_expr (dump_file, DR_REF (dr), TDF_SLIM); 1440 1.1 mrg fprintf (dump_file, ":\t\t<"); 1441 1.1 mrg 1442 1.1 mrg vec<tree> *stride = DR_ACCESS_STRIDE (dr); 1443 1.1 mrg unsigned num_loops = stride->length (); 1444 1.1 mrg for (unsigned j = 0; j < num_loops; ++j) 1445 1.1 mrg { 1446 1.1 mrg print_generic_expr (dump_file, (*stride)[j], TDF_SLIM); 1447 1.1 mrg fprintf (dump_file, "%s", (j < num_loops - 1) ? ",\t" : ">\n"); 1448 1.1 mrg } 1449 1.1 mrg } 1450 1.1 mrg } 1451 1.1 mrg 1452 1.1 mrg /* Return true if it's profitable to interchange two loops whose index 1453 1.1 mrg in whole loop nest vector are I_IDX/O_IDX respectively. The function 1454 1.1 mrg computes and compares three types information from all DATAREFS: 1455 1.1 mrg 1) Access stride for loop I_IDX and O_IDX. 
1456 1.1 mrg 2) Number of invariant memory references with respect to I_IDX before 1457 1.1 mrg and after loop interchange. 1458 1.1 mrg 3) Flags indicating if all memory references access sequential memory 1459 1.1 mrg in ILOOP, before and after loop interchange. 1460 1.1 mrg If INNMOST_LOOP_P is true, the two loops for interchanging are the two 1461 1.1 mrg innermost loops in loop nest. This function also dumps information if 1462 1.1 mrg DUMP_INFO_P is true. */ 1463 1.1 mrg 1464 1.1 mrg static bool 1465 1.1 mrg should_interchange_loops (unsigned i_idx, unsigned o_idx, 1466 1.1 mrg vec<data_reference_p> datarefs, 1467 1.1 mrg unsigned i_stmt_cost, unsigned o_stmt_cost, 1468 1.1 mrg bool innermost_loops_p, bool dump_info_p = true) 1469 1.1 mrg { 1470 1.1 mrg unsigned HOST_WIDE_INT ratio; 1471 1.1 mrg unsigned i, j, num_old_inv_drs = 0, num_new_inv_drs = 0; 1472 1.1 mrg struct data_reference *dr; 1473 1.1 mrg bool all_seq_dr_before_p = true, all_seq_dr_after_p = true; 1474 1.1 mrg widest_int iloop_strides = 0, oloop_strides = 0; 1475 1.1 mrg unsigned num_unresolved_drs = 0; 1476 1.1 mrg unsigned num_resolved_ok_drs = 0; 1477 1.1 mrg unsigned num_resolved_not_ok_drs = 0; 1478 1.1 mrg 1479 1.1 mrg if (dump_info_p && dump_file && (dump_flags & TDF_DETAILS)) 1480 1.1 mrg fprintf (dump_file, "\nData ref strides:\n\tmem_ref:\t\tiloop\toloop\n"); 1481 1.1 mrg 1482 1.1 mrg for (i = 0; datarefs.iterate (i, &dr); ++i) 1483 1.1 mrg { 1484 1.1 mrg vec<tree> *stride = DR_ACCESS_STRIDE (dr); 1485 1.1 mrg tree iloop_stride = (*stride)[i_idx], oloop_stride = (*stride)[o_idx]; 1486 1.1 mrg 1487 1.1 mrg bool subloop_stride_p = false; 1488 1.1 mrg /* Data ref can't be invariant or sequential access at current loop if 1489 1.1 mrg its address changes with respect to any subloops. 
*/ 1490 1.1 mrg for (j = i_idx + 1; j < stride->length (); ++j) 1491 1.1 mrg if (!integer_zerop ((*stride)[j])) 1492 1.1 mrg { 1493 1.1 mrg subloop_stride_p = true; 1494 1.1 mrg break; 1495 1.1 mrg } 1496 1.1 mrg 1497 1.1 mrg if (integer_zerop (iloop_stride)) 1498 1.1 mrg { 1499 1.1 mrg if (!subloop_stride_p) 1500 1.1 mrg num_old_inv_drs++; 1501 1.1 mrg } 1502 1.1 mrg if (integer_zerop (oloop_stride)) 1503 1.1 mrg { 1504 1.1 mrg if (!subloop_stride_p) 1505 1.1 mrg num_new_inv_drs++; 1506 1.1 mrg } 1507 1.1 mrg 1508 1.1 mrg if (TREE_CODE (iloop_stride) == INTEGER_CST 1509 1.1 mrg && TREE_CODE (oloop_stride) == INTEGER_CST) 1510 1.1 mrg { 1511 1.1 mrg iloop_strides = wi::add (iloop_strides, wi::to_widest (iloop_stride)); 1512 1.1 mrg oloop_strides = wi::add (oloop_strides, wi::to_widest (oloop_stride)); 1513 1.1 mrg } 1514 1.1 mrg else if (multiple_of_p (TREE_TYPE (iloop_stride), 1515 1.1 mrg iloop_stride, oloop_stride)) 1516 1.1 mrg num_resolved_ok_drs++; 1517 1.1 mrg else if (multiple_of_p (TREE_TYPE (iloop_stride), 1518 1.1 mrg oloop_stride, iloop_stride)) 1519 1.1 mrg num_resolved_not_ok_drs++; 1520 1.1 mrg else 1521 1.1 mrg num_unresolved_drs++; 1522 1.1 mrg 1523 1.1 mrg /* Data ref can't be sequential access if its address changes in sub 1524 1.1 mrg loop. */ 1525 1.1 mrg if (subloop_stride_p) 1526 1.1 mrg { 1527 1.1 mrg all_seq_dr_before_p = false; 1528 1.1 mrg all_seq_dr_after_p = false; 1529 1.1 mrg continue; 1530 1.1 mrg } 1531 1.1 mrg /* Track if all data references are sequential accesses before/after loop 1532 1.1 mrg interchange. Note invariant is considered sequential here. */ 1533 1.1 mrg tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))); 1534 1.1 mrg if (all_seq_dr_before_p 1535 1.1 mrg && ! (integer_zerop (iloop_stride) 1536 1.1 mrg || operand_equal_p (access_size, iloop_stride, 0))) 1537 1.1 mrg all_seq_dr_before_p = false; 1538 1.1 mrg if (all_seq_dr_after_p 1539 1.1 mrg && ! 
(integer_zerop (oloop_stride) 1540 1.1 mrg || operand_equal_p (access_size, oloop_stride, 0))) 1541 1.1 mrg all_seq_dr_after_p = false; 1542 1.1 mrg } 1543 1.1 mrg 1544 1.1 mrg if (dump_info_p && dump_file && (dump_flags & TDF_DETAILS)) 1545 1.1 mrg { 1546 1.1 mrg fprintf (dump_file, "\toverall:\t\t"); 1547 1.1 mrg print_decu (iloop_strides, dump_file); 1548 1.1 mrg fprintf (dump_file, "\t"); 1549 1.1 mrg print_decu (oloop_strides, dump_file); 1550 1.1 mrg fprintf (dump_file, "\n"); 1551 1.1 mrg 1552 1.1 mrg fprintf (dump_file, "Invariant data ref: before(%d), after(%d)\n", 1553 1.1 mrg num_old_inv_drs, num_new_inv_drs); 1554 1.1 mrg fprintf (dump_file, "All consecutive stride: before(%s), after(%s)\n", 1555 1.1 mrg all_seq_dr_before_p ? "true" : "false", 1556 1.1 mrg all_seq_dr_after_p ? "true" : "false"); 1557 1.1 mrg fprintf (dump_file, "OK to interchage with variable strides: %d\n", 1558 1.1 mrg num_resolved_ok_drs); 1559 1.1 mrg fprintf (dump_file, "Not OK to interchage with variable strides: %d\n", 1560 1.1 mrg num_resolved_not_ok_drs); 1561 1.1 mrg fprintf (dump_file, "Variable strides we cannot decide: %d\n", 1562 1.1 mrg num_unresolved_drs); 1563 1.1 mrg fprintf (dump_file, "Stmt cost of inner loop: %d\n", i_stmt_cost); 1564 1.1 mrg fprintf (dump_file, "Stmt cost of outer loop: %d\n", o_stmt_cost); 1565 1.1 mrg } 1566 1.1 mrg 1567 1.1 mrg if (num_unresolved_drs != 0 || num_resolved_not_ok_drs != 0) 1568 1.1 mrg return false; 1569 1.1 mrg 1570 1.1 mrg /* Stmts of outer loop will be moved to inner loop. If there are two many 1571 1.1 mrg such stmts, it could make inner loop costly. Here we compare stmt cost 1572 1.1 mrg between outer and inner loops. 
*/
  if (i_stmt_cost && o_stmt_cost
      && num_old_inv_drs + o_stmt_cost > num_new_inv_drs
      && o_stmt_cost * STMT_COST_RATIO > i_stmt_cost)
    return false;

  /* We use different stride comparison ratio for interchanging innermost
     two loops or not.  The idea is to be conservative in interchange for
     the innermost loops.  */
  ratio = innermost_loops_p ? INNER_STRIDE_RATIO : OUTER_STRIDE_RATIO;
  /* Do interchange if it gives better data locality behavior.  */
  if (wi::gtu_p (iloop_strides, wi::mul (oloop_strides, ratio)))
    return true;
  if (wi::gtu_p (iloop_strides, oloop_strides))
    {
      /* Or it creates more invariant memory references.  */
      if ((!all_seq_dr_before_p || all_seq_dr_after_p)
	  && num_new_inv_drs > num_old_inv_drs)
	return true;
      /* Or it makes all memory references sequential.  */
      if (num_new_inv_drs >= num_old_inv_drs
	  && !all_seq_dr_before_p && all_seq_dr_after_p)
	return true;
    }

  return false;
}

/* Try to interchange inner loop of a loop nest to outer level.  DATAREFS
   holds all data references in the nest; DDRS holds the data dependence
   relations computed for them.  Return true if at least one loop pair
   was actually interchanged.  */

bool
tree_loop_interchange::interchange (vec<data_reference_p> datarefs,
				    vec<ddr_p> ddrs)
{
  /* Capture the dump location before any transformation rewrites the
     nest.  */
  dump_user_location_t loc = find_loop_location (m_loop_nest[0]);
  bool changed_p = false;
  /* In each iteration we try to interchange I-th loop with (I+1)-th loop.
     The overall effect is to push inner loop to outermost level in whole
     loop nest.  */
  for (unsigned i = m_loop_nest.length (); i > 1; --i)
    {
      unsigned i_idx = i - 1, o_idx = i - 2;

      /* Check validity for loop interchange.  Any failure below stops the
	 whole outward movement, not just this pair.  */
      if (!valid_data_dependences (i_idx, o_idx, ddrs))
	break;

      loop_cand iloop (m_loop_nest[i_idx], m_loop_nest[o_idx]);
      loop_cand oloop (m_loop_nest[o_idx], m_loop_nest[o_idx]);

      /* Check if we can do transformation for loop interchange.  */
      if (!iloop.analyze_carried_vars (NULL)
	  || !iloop.analyze_lcssa_phis ()
	  || !oloop.analyze_carried_vars (&iloop)
	  || !oloop.analyze_lcssa_phis ()
	  || !iloop.can_interchange_p (NULL)
	  || !oloop.can_interchange_p (&iloop))
	break;

      /* Outer loop's stmts will be moved to inner loop during interchange.
	 If there are many of them, it may make inner loop very costly.  We
	 need to check number of outer loop's stmts in profit cost model of
	 interchange.  */
      int stmt_cost = oloop.m_num_stmts;
      /* Count out the exit checking stmt of outer loop.  */
      stmt_cost --;
      /* Count out IV's increasing stmt, IVOPTs takes care of it.  */
      stmt_cost -= oloop.m_inductions.length ();
      /* Count in the additional load and cond_expr stmts caused by inner
	 loop's constant initialized reduction.  */
      stmt_cost += iloop.m_const_init_reduc * 2;
      if (stmt_cost < 0)
	stmt_cost = 0;

      /* Check profitability for loop interchange.  */
      if (should_interchange_loops (i_idx, o_idx, datarefs,
				    (unsigned) iloop.m_num_stmts,
				    (unsigned) stmt_cost,
				    iloop.m_loop->inner == NULL))
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file,
		     "Loop_pair<outer:%d, inner:%d> is interchanged\n\n",
		     oloop.m_loop->num, iloop.m_loop->num);

	  changed_p = true;
	  interchange_loops (iloop, oloop);
	  /* No need to update if there is no further loop interchange.  */
	  if (o_idx > 0)
	    update_data_info (i_idx, o_idx, datarefs, ddrs);
	}
      else
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file,
		     "Loop_pair<outer:%d, inner:%d> is not interchanged\n\n",
		     oloop.m_loop->num, iloop.m_loop->num);
	}
    }
  /* Clean up statements made dead by the interchange rewrites.  */
  simple_dce_from_worklist (m_dce_seeds);

  if (changed_p && dump_enabled_p ())
    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
		     "loops interchanged in loop nest\n");

  return changed_p;
}


/* Loop interchange pass.
*/

namespace {

const pass_data pass_data_linterchange =
{
  GIMPLE_PASS, /* type */
  "linterchange", /* name */
  OPTGROUP_LOOP, /* optinfo_flags */
  TV_LINTERCHANGE, /* tv_id */
  PROP_cfg, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};

/* Pass wrapper exposing loop interchange to the pass manager; enabled by
   flag_loop_interchange (-floop-interchange).  */

class pass_linterchange : public gimple_opt_pass
{
public:
  pass_linterchange (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_linterchange, ctxt)
  {}

  /* opt_pass methods: */
  opt_pass * clone () { return new pass_linterchange (m_ctxt); }
  virtual bool gate (function *) { return flag_loop_interchange; }
  virtual unsigned int execute (function *);

}; // class pass_linterchange


/* Return true if LOOP has proper form for interchange.  We check three
   conditions in the function:
     1) In general, a loop can be interchanged only if it doesn't have
	basic blocks other than header, exit and latch besides possible
	inner loop nest.  This basically restricts loop interchange to
	below form loop nests:

	      header<---+
		|       |
		v       |
	   INNER_LOOP   |
		|       |
		v       |
	      exit--->latch

     2) Data reference in basic block that executes in different times
	than loop head/exit is not allowed.
     3) Record the innermost outer loop that doesn't form rectangle loop
	nest with LOOP.
*/

static bool
proper_loop_form_for_interchange (class loop *loop, class loop **min_outer)
{
  edge e0, e1, exit;

  /* Don't interchange if loop has unsupported information for the moment.  */
  if (loop->safelen > 0
      || loop->constraints != 0
      || loop->can_be_parallel
      || loop->dont_vectorize
      || loop->force_vectorize
      || loop->in_oacc_kernels_region
      || loop->orig_loop_num != 0
      || loop->simduid != NULL_TREE)
    return false;

  /* Don't interchange if outer loop has basic block other than header, exit
     and latch.  */
  if (loop->inner != NULL
      && loop->num_nodes != loop->inner->num_nodes + 3)
    return false;

  /* Require a single exit that dominates the latch.  */
  if ((exit = single_dom_exit (loop)) == NULL)
    return false;

  /* Check control flow on loop header/exit blocks.  Either the header is
     also the exiting block, or header and exiting block are connected by
     a single straight-line edge chain.  */
  if (loop->header == exit->src
      && (EDGE_COUNT (loop->header->preds) != 2
	  || EDGE_COUNT (loop->header->succs) != 2))
    return false;
  else if (loop->header != exit->src
	   && (EDGE_COUNT (loop->header->preds) != 2
	       || !single_succ_p (loop->header)
	       || unsupported_edge (single_succ_edge (loop->header))
	       || EDGE_COUNT (exit->src->succs) != 2
	       || !single_pred_p (exit->src)
	       || unsupported_edge (single_pred_edge (exit->src))))
    return false;

  /* The header's two predecessors must be the latch and a preheader-side
     block outside LOOP, via normal edges.  */
  e0 = EDGE_PRED (loop->header, 0);
  e1 = EDGE_PRED (loop->header, 1);
  if (unsupported_edge (e0) || unsupported_edge (e1)
      || (e0->src != loop->latch && e1->src != loop->latch)
      || (e0->src->loop_father == loop && e1->src->loop_father == loop))
    return false;

  /* Likewise the exiting block's two successors: one to the latch, one
     leaving LOOP.  */
  e0 = EDGE_SUCC (exit->src, 0);
  e1 = EDGE_SUCC (exit->src, 1);
  if (unsupported_edge (e0) || unsupported_edge (e1)
      || (e0->dest != loop->latch && e1->dest != loop->latch)
      || (e0->dest->loop_father == loop && e1->dest->loop_father == loop))
    return false;

  /* Don't interchange if any reference is in basic block that doesn't
     dominate exit block.  */
  basic_block *bbs = get_loop_body (loop);
  for (unsigned i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      if (bb->loop_father != loop
	  || bb == loop->header || bb == exit->src
	  || dominated_by_p (CDI_DOMINATORS, exit->src, bb))
	continue;

      /* A statement with a virtual use here accesses memory in a block
	 executed conditionally relative to the exit; reject.  */
      for (gimple_stmt_iterator gsi = gsi_start_nondebug_bb (bb);
	   !gsi_end_p (gsi); gsi_next_nondebug (&gsi))
	if (gimple_vuse (gsi_stmt (gsi)))
	  {
	    free (bbs);
	    return false;
	  }
    }
  free (bbs);

  /* The latch-execution count must be analyzable in the enclosing loop.  */
  tree niters = number_of_latch_executions (loop);
  niters = analyze_scalar_evolution (loop_outer (loop), niters);
  if (!niters || chrec_contains_undetermined (niters))
    return false;

  /* Record the innermost outer loop that doesn't form rectangle loop nest.  */
  for (loop_p loop2 = loop_outer (loop);
       loop2 && flow_loop_nested_p (*min_outer, loop2);
       loop2 = loop_outer (loop2))
    {
      niters = instantiate_scev (loop_preheader_edge (loop2),
				 loop_outer (loop), niters);
      if (!evolution_function_is_invariant_p (niters, loop2->num))
	{
	  *min_outer = loop2;
	  break;
	}
    }
  return true;
}

/* Return true if any two adjacent loops in loop nest [INNERMOST, LOOP_NEST]
   should be interchanged by looking into all DATAREFS.
*/

static bool
should_interchange_loop_nest (class loop *loop_nest, class loop *innermost,
			      vec<data_reference_p> datarefs)
{
  /* IDX is INNERMOST's depth relative to LOOP_NEST; it must be a strict
     sub-loop for interchange to make sense.  */
  unsigned idx = loop_depth (innermost) - loop_depth (loop_nest);
  gcc_assert (idx > 0);

  /* Check if any two adjacent loops should be interchanged.  */
  for (class loop *loop = innermost;
       loop != loop_nest; loop = loop_outer (loop), idx--)
    if (should_interchange_loops (idx, idx - 1, datarefs, 0, 0,
				  loop == innermost, false))
      return true;

  return false;
}

/* Given loop nest LOOP_NEST and data references DATAREFS, compute data
   dependences for loop interchange and store it in DDRS.  Note we compute
   dependences directly rather than call generic interface so that we can
   return on unknown dependence instantly.  */

static bool
tree_loop_interchange_compute_ddrs (vec<loop_p> loop_nest,
				    vec<data_reference_p> datarefs,
				    vec<ddr_p> *ddrs)
{
  struct data_reference *a, *b;
  class loop *innermost = loop_nest.last ();

  for (unsigned i = 0; datarefs.iterate (i, &a); ++i)
    {
      bool a_outer_p = gimple_bb (DR_STMT (a))->loop_father != innermost;
      for (unsigned j = i + 1; datarefs.iterate (j, &b); ++j)
	/* Only read-write or write-write pairs can carry a dependence.  */
	if (DR_IS_WRITE (a) || DR_IS_WRITE (b))
	  {
	    bool b_outer_p = gimple_bb (DR_STMT (b))->loop_father != innermost;
	    /* Don't support multiple write references in outer loop.  */
	    if (a_outer_p && b_outer_p && DR_IS_WRITE (a) && DR_IS_WRITE (b))
	      return false;

	    ddr_p ddr = initialize_data_dependence_relation (a, b, loop_nest);
	    ddrs->safe_push (ddr);
	    compute_affine_dependence (ddr, loop_nest[0]);

	    /* Give up if ddr is unknown dependence or classic direct vector
	       is not available.  */
	    if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
		|| (DDR_ARE_DEPENDENT (ddr) == NULL_TREE
		    && DDR_NUM_DIR_VECTS (ddr) == 0))
	      return false;

	    /* If either data references is in outer loop of nest, we require
	       no dependence here because the data reference need to be moved
	       into inner loop during interchange.  Identical refs in the
	       outer loop (reduction init/fini pairs) are tolerated.  */
	    if (a_outer_p && b_outer_p
		&& operand_equal_p (DR_REF (a), DR_REF (b), 0))
	      continue;
	    if (DDR_ARE_DEPENDENT (ddr) != chrec_known
		&& (a_outer_p || b_outer_p))
	      return false;
	  }
    }

  return true;
}

/* Prune DATAREFS by removing any data reference not inside of LOOP.
*/

static inline void
prune_datarefs_not_in_loop (class loop *loop, vec<data_reference_p> datarefs)
{
  unsigned i, j;
  struct data_reference *dr;

  /* Compact DATAREFS in place, freeing every reference (and its access
     stride vector stored in aux, if any) that lies outside LOOP.  */
  for (i = 0, j = 0; datarefs.iterate (i, &dr); ++i)
    {
      if (flow_bb_inside_loop_p (loop, gimple_bb (DR_STMT (dr))))
	datarefs[j++] = dr;
      else
	{
	  if (dr->aux)
	    {
	      DR_ACCESS_STRIDE (dr)->release ();
	      delete (vec<tree> *) dr->aux;
	    }
	  free_data_ref (dr);
	}
    }
  datarefs.truncate (j);
}

/* Find and store data references in DATAREFS for LOOP nest.  If there's
   difficult data reference in a basic block, we shrink the loop nest to
   inner loop of that basic block's father loop.  On success, return the
   outer loop of the result loop nest.  */

static class loop *
prepare_data_references (class loop *loop, vec<data_reference_p> *datarefs)
{
  class loop *loop_nest = loop;
  vec<data_reference_p> *bb_refs;
  basic_block bb, *bbs = get_loop_body_in_dom_order (loop);

  /* bb->aux temporarily holds each block's data-reference vector; clear
     it first so the collection pass below can rely on NULL meaning
     "nothing recorded".  */
  for (unsigned i = 0; i < loop->num_nodes; i++)
    bbs[i]->aux = NULL;

  /* Find data references for all basic blocks.  Shrink loop nest on difficult
     data reference.  */
  for (unsigned i = 0; loop_nest && i < loop->num_nodes; ++i)
    {
      bb = bbs[i];
      if (!flow_bb_inside_loop_p (loop_nest, bb))
	continue;

      bb_refs = new vec<data_reference_p> ();
      if (find_data_references_in_bb (loop, bb, bb_refs) == chrec_dont_know)
	{
	  /* Unanalyzable ref: shrink the nest below this block's loop;
	     a nest with no inner loop is useless, so drop it entirely.  */
	  loop_nest = bb->loop_father->inner;
	  if (loop_nest && !loop_nest->inner)
	    loop_nest = NULL;

	  free_data_refs (*bb_refs);
	  delete bb_refs;
	}
      else if (bb_refs->is_empty ())
	{
	  bb_refs->release ();
	  delete bb_refs;
	}
      else
	bb->aux = bb_refs;
    }

  /* Collect all data references in loop nest.  */
  for (unsigned i = 0; i < loop->num_nodes; i++)
    {
      bb = bbs[i];
      if (!bb->aux)
	continue;

      bb_refs = (vec<data_reference_p> *) bb->aux;
      if (loop_nest && flow_bb_inside_loop_p (loop_nest, bb))
	{
	  datarefs->safe_splice (*bb_refs);
	  bb_refs->release ();
	}
      else
	free_data_refs (*bb_refs);

      delete bb_refs;
      bb->aux = NULL;
    }
  free (bbs);

  return loop_nest;
}

/* Given innermost LOOP, return true if perfect loop nest can be found and
   data dependences can be computed.  If succeed, record the perfect loop
   nest in LOOP_NEST; record all data references in DATAREFS and record all
   data dependence relations in DDRS.

   We do support a restricted form of imperfect loop nest, i.e., loop nest
   with load/store in outer loop initializing/finalizing simple reduction
   of the innermost loop.  For such outer loop reference, we require that
   it has no dependence with others since it will be moved to inner loop
   in interchange.  */

static bool
prepare_perfect_loop_nest (class loop *loop, vec<loop_p> *loop_nest,
			   vec<data_reference_p> *datarefs, vec<ddr_p> *ddrs)
{
  class loop *start_loop = NULL, *innermost = loop;
  class loop *outermost = loops_for_fn (cfun)->tree_root;

  /* Find loop nest from the innermost loop.  Grow the nest outward while
     each loop has proper form and remains strictly inside OUTERMOST,
     which proper_loop_form_for_interchange narrows to the innermost
     outer loop that doesn't form a rectangle nest with LOOP.  */
  while (loop->num != 0 && loop->inner == start_loop
	 && flow_loop_nested_p (outermost, loop))
    {
      if (!proper_loop_form_for_interchange (loop, &outermost))
	break;

      start_loop = loop;
      /* If this loop has sibling loop, the father loop won't be in perfect
	 loop nest.  */
      if (loop->next != NULL)
	break;

      loop = loop_outer (loop);
    }
  /* Need at least two loops in the nest.  */
  if (!start_loop || !start_loop->inner)
    return false;

  /* Prepare the data reference vector for the loop nest, pruning outer
     loops we cannot handle.  */
  start_loop = prepare_data_references (start_loop, datarefs);
  if (!start_loop
      /* Check if there is no data reference.  */
      || datarefs->is_empty ()
      /* Check if there are too many of data references.  */
      || (int) datarefs->length () > MAX_DATAREFS)
    return false;

  /* Compute access strides for all data references, pruning outer
     loops we cannot analyze refs in.  */
  start_loop = compute_access_strides (start_loop, innermost, *datarefs);
  if (!start_loop)
    return false;

  /* Check if any interchange is profitable in the loop nest.  */
  if (!should_interchange_loop_nest (start_loop, innermost, *datarefs))
    return false;

  /* Check if data dependences can be computed for loop nest starting from
     start_loop.  */
  loop = start_loop;
  do {
    loop_nest->truncate (0);

    if (loop != start_loop)
      prune_datarefs_not_in_loop (start_loop, *datarefs);

    if (find_loop_nest (start_loop, loop_nest)
	&& tree_loop_interchange_compute_ddrs (*loop_nest, *datarefs, ddrs))
      {
	if (dump_file && (dump_flags & TDF_DETAILS))
	  fprintf (dump_file,
		   "\nConsider loop interchange for loop_nest<%d - %d>\n",
		   start_loop->num, innermost->num);

	if (loop != start_loop)
	  prune_access_strides_not_in_loop (start_loop, innermost, *datarefs);

	if (dump_file && (dump_flags & TDF_DETAILS))
	  dump_access_strides (*datarefs);

	return true;
      }

    free_dependence_relations (*ddrs);
    *ddrs = vNULL;
    /* Try to compute data dependences with the outermost loop stripped.  */
    loop = start_loop;
    start_loop = start_loop->inner;
  } while (start_loop && start_loop->inner);

  return false;
}

/* Main entry for loop interchange pass.  */

unsigned int
pass_linterchange::execute (function *fun)
{
  /* Need at least two real loops for a nest (number_of_loops counts the
     function's root "loop" as well).  */
  if (number_of_loops (fun) <= 2)
    return 0;

  bool changed_p = false;
  for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST))
    {
      vec<loop_p> loop_nest = vNULL;
      vec<data_reference_p> datarefs = vNULL;
      vec<ddr_p> ddrs = vNULL;
      if (prepare_perfect_loop_nest (loop, &loop_nest, &datarefs, &ddrs))
	{
	  tree_loop_interchange loop_interchange (loop_nest);
	  changed_p |= loop_interchange.interchange (datarefs, ddrs);
	}
      /* Release per-nest resources whether or not interchange happened.  */
      free_dependence_relations (ddrs);
      free_data_refs_with_aux (datarefs);
      loop_nest.release ();
    }

  if (changed_p)
    {
      /* Interchange moves memory references; rebuild virtual SSA, run
	 loop invariant motion to clean up, and invalidate cached scalar
	 evolutions.  */
      unsigned todo = TODO_update_ssa_only_virtuals;
      todo |= loop_invariant_motion_in_fun (cfun, false);
      scev_reset ();
      return todo;
    }
  return 0;
}

} // anon namespace

/* Factory used by passes.def to instantiate the pass.  */

gimple_opt_pass *
make_pass_linterchange (gcc::context *ctxt)
{
  return new pass_linterchange (ctxt);
}