Home | History | Annotate | Line # | Download | only in gcc
gimple-loop-interchange.cc revision 1.1.1.3
      1 /* Loop interchange.
      2    Copyright (C) 2017-2020 Free Software Foundation, Inc.
      3    Contributed by ARM Ltd.
      4 
      5 This file is part of GCC.
      6 
      7 GCC is free software; you can redistribute it and/or modify it
      8 under the terms of the GNU General Public License as published by the
      9 Free Software Foundation; either version 3, or (at your option) any
     10 later version.
     11 
     12 GCC is distributed in the hope that it will be useful, but WITHOUT
     13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
     14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     15 for more details.
     16 
     17 You should have received a copy of the GNU General Public License
     18 along with GCC; see the file COPYING3.  If not see
     19 <http://www.gnu.org/licenses/>.  */
     20 
     21 #include "config.h"
     22 #include "system.h"
     23 #include "coretypes.h"
     24 #include "backend.h"
     25 #include "is-a.h"
     26 #include "tree.h"
     27 #include "gimple.h"
     28 #include "tree-pass.h"
     29 #include "ssa.h"
     30 #include "gimple-pretty-print.h"
     31 #include "fold-const.h"
     32 #include "gimplify.h"
     33 #include "gimple-iterator.h"
     34 #include "gimplify-me.h"
     35 #include "cfgloop.h"
     36 #include "tree-ssa.h"
     37 #include "tree-scalar-evolution.h"
     38 #include "tree-ssa-loop-manip.h"
     39 #include "tree-ssa-loop-niter.h"
     40 #include "tree-ssa-loop-ivopts.h"
     41 #include "tree-ssa-dce.h"
     42 #include "tree-data-ref.h"
     43 #include "tree-vectorizer.h"
     44 
     45 /* This pass performs loop interchange: for example, the loop nest
     46 
     47    for (int j = 0; j < N; j++)
     48      for (int k = 0; k < N; k++)
     49        for (int i = 0; i < N; i++)
     50 	 c[i][j] = c[i][j] + a[i][k]*b[k][j];
     51 
     52    is transformed to
     53 
     54    for (int i = 0; i < N; i++)
     55      for (int j = 0; j < N; j++)
     56        for (int k = 0; k < N; k++)
     57 	 c[i][j] = c[i][j] + a[i][k]*b[k][j];
     58 
     59    This pass implements loop interchange in the following steps:
     60 
     61      1) Find perfect loop nest for each innermost loop and compute data
     62 	dependence relations for it.  For above example, loop nest is
     63 	<loop_j, loop_k, loop_i>.
     64      2) From innermost to outermost loop, this pass tries to interchange
     65 	each loop pair.  For above case, it firstly tries to interchange
     66 	<loop_k, loop_i> and loop nest becomes <loop_j, loop_i, loop_k>.
     67 	Then it tries to interchange <loop_j, loop_i> and loop nest becomes
     68 	<loop_i, loop_j, loop_k>.  The overall effect is to move innermost
     69 	loop to the outermost position.  For loop pair <loop_i, loop_j>
     70 	to be interchanged, we:
     71      3) Check if data dependence relations are valid for loop interchange.
     72      4) Check if both loops can be interchanged in terms of transformation.
     73      5) Check if interchanging the two loops is profitable.
     74      6) Interchange the two loops by mapping induction variables.
     75 
     76    This pass also handles reductions in loop nest.  So far we only support
     77    simple reduction of inner loop and double reduction of the loop nest.  */
     78 
     79 /* Maximum number of stmts in each loop that should be interchanged.  */
     80 #define MAX_NUM_STMT    (param_loop_interchange_max_num_stmts)
     81 /* Maximum number of data references in loop nest.  */
     82 #define MAX_DATAREFS    (param_loop_max_datarefs_for_datadeps)
     83 
     84 /* Comparison ratio of access stride between inner/outer loops to be
     85    interchanged.  This is the minimum stride ratio for loop interchange
     86    to be profitable.  */
     87 #define OUTER_STRIDE_RATIO  (param_loop_interchange_stride_ratio)
     88 /* The same as above, but we require higher ratio for interchanging the
     89    innermost two loops.  */
     90 #define INNER_STRIDE_RATIO  ((OUTER_STRIDE_RATIO) + 1)
     91 
     92 /* Comparison ratio of stmt cost between inner/outer loops.  Loops won't
     93    be interchanged if outer loop has too many stmts.  */
     94 #define STMT_COST_RATIO     (3)
     95 
     96 /* Vector of strides that DR accesses in each level loop of a loop nest.  */
     97 #define DR_ACCESS_STRIDE(dr) ((vec<tree> *) dr->aux)
     98 
     99 /* Structure recording loop induction variable.  */
    100 typedef struct induction
    101 {
    102   /* IV itself.  */
    103   tree var;
    104   /* IV's initializing value, which is the init arg of the IV PHI node.  */
    105   tree init_val;
    106   /* IV's initializing expr, which is (the expanded result of) init_val.  */
    107   tree init_expr;
    108   /* IV's step.  */
    109   tree step;
    110 } *induction_p;
    111 
    112 /* Enum type for loop reduction variable.  */
    113 enum reduction_type
    114 {
    115   UNKNOWN_RTYPE = 0,
    116   SIMPLE_RTYPE,
    117   DOUBLE_RTYPE
    118 };
    119 
    120 /* Structure recording loop reduction variable.  */
    121 typedef struct reduction
    122 {
    123   /* Reduction itself.  */
    124   tree var;
    125   /* PHI node defining reduction variable.  */
    126   gphi *phi;
    127   /* Init and next variables of the reduction.  */
    128   tree init;
    129   tree next;
    130   /* Lcssa PHI node if reduction is used outside of its definition loop.  */
    131   gphi *lcssa_phi;
    132   /* Stmts defining init and next.  */
    133   gimple *producer;
    134   gimple *consumer;
    135   /* If init is loaded from memory, this is the loading memory reference.  */
    136   tree init_ref;
    137   /* If reduction is finally stored to memory, this is the stored memory
    138      reference.  */
    139   tree fini_ref;
    140   enum reduction_type type;
    141 } *reduction_p;
    142 
    143 
    144 /* Dump reduction RE.  */
    145 
    146 static void
    147 dump_reduction (reduction_p re)
    148 {
    149   if (re->type == SIMPLE_RTYPE)
    150     fprintf (dump_file, "  Simple reduction:  ");
    151   else if (re->type == DOUBLE_RTYPE)
    152     fprintf (dump_file, "  Double reduction:  ");
    153   else
    154     fprintf (dump_file, "  Unknown reduction:  ");
    155 
    156   print_gimple_stmt (dump_file, re->phi, 0);
    157 }
    158 
    159 /* Dump LOOP's induction IV.  */
    160 static void
    161 dump_induction (class loop *loop, induction_p iv)
    162 {
    163   fprintf (dump_file, "  Induction:  ");
    164   print_generic_expr (dump_file, iv->var, TDF_SLIM);
    165   fprintf (dump_file, " = {");
    166   print_generic_expr (dump_file, iv->init_expr, TDF_SLIM);
    167   fprintf (dump_file, ", ");
    168   print_generic_expr (dump_file, iv->step, TDF_SLIM);
    169   fprintf (dump_file, "}_%d\n", loop->num);
    170 }
    171 
    172 /* Loop candidate for interchange.  */
    173 
    174 class loop_cand
    175 {
    176 public:
    177   loop_cand (class loop *, class loop *);
    178   ~loop_cand ();
    179 
    180   reduction_p find_reduction_by_stmt (gimple *);
    181   void classify_simple_reduction (reduction_p);
    182   bool analyze_iloop_reduction_var (tree);
    183   bool analyze_oloop_reduction_var (loop_cand *, tree);
    184   bool analyze_induction_var (tree, tree);
    185   bool analyze_carried_vars (loop_cand *);
    186   bool analyze_lcssa_phis (void);
    187   bool can_interchange_p (loop_cand *);
    188   void undo_simple_reduction (reduction_p, bitmap);
    189 
    190   /* The loop itself.  */
    191   class loop *m_loop;
    192   /* The outer loop for interchange.  It equals to loop if this loop cand
    193      itself represents the outer loop.  */
    194   class loop *m_outer;
    195   /* Vector of induction variables in loop.  */
    196   vec<induction_p> m_inductions;
    197   /* Vector of reduction variables in loop.  */
    198   vec<reduction_p> m_reductions;
    199   /* Lcssa PHI nodes of this loop.  */
    200   vec<gphi *> m_lcssa_nodes;
    201   /* Single exit edge of this loop.  */
    202   edge m_exit;
    203   /* Basic blocks of this loop.  */
    204   basic_block *m_bbs;
    205   /* Number of stmts of this loop.  Inner loops' stmts are not included.  */
    206   int m_num_stmts;
    207   /* Number of constant initialized simple reduction.  */
    208   int m_const_init_reduc;
    209 };
    210 
    211 /* Constructor.  */
    212 
    213 loop_cand::loop_cand (class loop *loop, class loop *outer)
    214   : m_loop (loop), m_outer (outer), m_exit (single_exit (loop)),
    215     m_bbs (get_loop_body (loop)), m_num_stmts (0), m_const_init_reduc (0)
    216 {
    217     m_inductions.create (3);
    218     m_reductions.create (3);
    219     m_lcssa_nodes.create (3);
    220 }
    221 
    222 /* Destructor.  */
    223 
    224 loop_cand::~loop_cand ()
    225 {
    226   induction_p iv;
    227   for (unsigned i = 0; m_inductions.iterate (i, &iv); ++i)
    228     free (iv);
    229 
    230   reduction_p re;
    231   for (unsigned i = 0; m_reductions.iterate (i, &re); ++i)
    232     free (re);
    233 
    234   m_inductions.release ();
    235   m_reductions.release ();
    236   m_lcssa_nodes.release ();
    237   free (m_bbs);
    238 }
    239 
    240 /* Return single use stmt of VAR in LOOP, otherwise return NULL.  */
    241 
    242 static gimple *
    243 single_use_in_loop (tree var, class loop *loop)
    244 {
    245   gimple *stmt, *res = NULL;
    246   use_operand_p use_p;
    247   imm_use_iterator iterator;
    248 
    249   FOR_EACH_IMM_USE_FAST (use_p, iterator, var)
    250     {
    251       stmt = USE_STMT (use_p);
    252       if (is_gimple_debug (stmt))
    253 	continue;
    254 
    255       if (!flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
    256 	continue;
    257 
    258       if (res)
    259 	return NULL;
    260 
    261       res = stmt;
    262     }
    263   return res;
    264 }
    265 
    266 /* Return true if E is unsupported in loop interchange, i.e, E is a complex
    267    edge or part of irreducible loop.  */
    268 
    269 static inline bool
    270 unsupported_edge (edge e)
    271 {
    272   return (e->flags & (EDGE_COMPLEX | EDGE_IRREDUCIBLE_LOOP));
    273 }
    274 
    275 /* Return the reduction if STMT is one of its lcssa PHI, producer or consumer
    276    stmt.  */
    277 
    278 reduction_p
    279 loop_cand::find_reduction_by_stmt (gimple *stmt)
    280 {
    281   gphi *phi = dyn_cast <gphi *> (stmt);
    282   reduction_p re;
    283 
    284   for (unsigned i = 0; m_reductions.iterate (i, &re); ++i)
    285     if ((phi != NULL && phi == re->lcssa_phi)
    286 	|| (stmt == re->producer || stmt == re->consumer))
    287       return re;
    288 
    289   return NULL;
    290 }
    291 
    292 /* Return true if current loop_cand be interchanged.  ILOOP is not NULL if
    293    current loop_cand is outer loop in loop nest.  */
    294 
    295 bool
    296 loop_cand::can_interchange_p (loop_cand *iloop)
    297 {
    298   /* For now we only support at most one reduction.  */
    299   unsigned allowed_reduction_num = 1;
    300 
    301   /* Only support reduction if the loop nest to be interchanged is the
    302      innermostin two loops.  */
    303   if ((iloop == NULL && m_loop->inner != NULL)
    304        || (iloop != NULL && iloop->m_loop->inner != NULL))
    305     allowed_reduction_num = 0;
    306 
    307   if (m_reductions.length () > allowed_reduction_num
    308       || (m_reductions.length () == 1
    309 	  && m_reductions[0]->type == UNKNOWN_RTYPE))
    310     return false;
    311 
    312   /* Only support lcssa PHI node which is for reduction.  */
    313   if (m_lcssa_nodes.length () > allowed_reduction_num)
    314     return false;
    315 
    316   /* Check if basic block has any unsupported operation.  Note basic blocks
    317      of inner loops are not checked here.  */
    318   for (unsigned i = 0; i < m_loop->num_nodes; i++)
    319     {
    320       basic_block bb = m_bbs[i];
    321       gphi_iterator psi;
    322       gimple_stmt_iterator gsi;
    323 
    324       /* Skip basic blocks of inner loops.  */
    325       if (bb->loop_father != m_loop)
    326        continue;
    327 
    328       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    329 	{
    330 	  gimple *stmt = gsi_stmt (gsi);
    331 	  if (is_gimple_debug (stmt))
    332 	    continue;
    333 
    334 	  if (gimple_has_side_effects (stmt))
    335 	    return false;
    336 
    337 	  m_num_stmts++;
    338 	  if (gcall *call = dyn_cast <gcall *> (stmt))
    339 	    {
    340 	      /* In basic block of outer loop, the call should be cheap since
    341 		 it will be moved to inner loop.  */
    342 	      if (iloop != NULL
    343 		  && !gimple_inexpensive_call_p (call))
    344 		return false;
    345 	      continue;
    346 	    }
    347 
    348 	  if (!iloop || !gimple_vuse (stmt))
    349 	    continue;
    350 
    351 	  /* Support stmt accessing memory in outer loop only if it is for
    352 	     inner loop's reduction.  */
    353 	  if (iloop->find_reduction_by_stmt (stmt))
    354 	    continue;
    355 
    356 	  tree lhs;
    357 	  /* Support loop invariant memory reference if it's only used once by
    358 	     inner loop.  */
    359 	  /* ???  How's this checking for invariantness?  */
    360 	  if (gimple_assign_single_p (stmt)
    361 	      && (lhs = gimple_assign_lhs (stmt)) != NULL_TREE
    362 	      && TREE_CODE (lhs) == SSA_NAME
    363 	      && single_use_in_loop (lhs, iloop->m_loop))
    364 	    continue;
    365 
    366 	  return false;
    367 	}
    368       /* Check if loop has too many stmts.  */
    369       if (m_num_stmts > MAX_NUM_STMT)
    370 	return false;
    371 
    372       /* Allow PHI nodes in any basic block of inner loop, PHI nodes in outer
    373 	 loop's header, or PHI nodes in dest bb of inner loop's exit edge.  */
    374       if (!iloop || bb == m_loop->header
    375 	  || bb == iloop->m_exit->dest)
    376 	continue;
    377 
    378       /* Don't allow any other PHI nodes.  */
    379       for (psi = gsi_start_phis (bb); !gsi_end_p (psi); gsi_next (&psi))
    380 	if (!virtual_operand_p (PHI_RESULT (psi.phi ())))
    381 	  return false;
    382     }
    383 
    384   return true;
    385 }
    386 
    387 /* Programmers and optimizers (like loop store motion) may optimize code:
    388 
    389      for (int i = 0; i < N; i++)
    390        for (int j = 0; j < N; j++)
    391 	 a[i] += b[j][i] * c[j][i];
    392 
    393    into reduction:
    394 
    395      for (int i = 0; i < N; i++)
    396        {
    397 	 // producer.  Note sum can be intitialized to a constant.
    398 	 int sum = a[i];
    399 	 for (int j = 0; j < N; j++)
    400 	   {
    401 	     sum += b[j][i] * c[j][i];
    402 	   }
    403 	 // consumer.
    404 	 a[i] = sum;
    405        }
    406 
    407    The result code can't be interchanged without undoing the optimization.
    408    This function classifies this kind reduction and records information so
    409    that we can undo the store motion during interchange.  */
    410 
    411 void
    412 loop_cand::classify_simple_reduction (reduction_p re)
    413 {
    414   gimple *producer, *consumer;
    415 
    416   /* Check init variable of reduction and how it is initialized.  */
    417   if (TREE_CODE (re->init) == SSA_NAME)
    418     {
    419       producer = SSA_NAME_DEF_STMT (re->init);
    420       re->producer = producer;
    421       basic_block bb = gimple_bb (producer);
    422       if (!bb || bb->loop_father != m_outer)
    423 	return;
    424 
    425       if (!gimple_assign_load_p (producer))
    426 	return;
    427 
    428       re->init_ref = gimple_assign_rhs1 (producer);
    429     }
    430   else if (CONSTANT_CLASS_P (re->init))
    431     m_const_init_reduc++;
    432   else
    433     return;
    434 
    435   /* Check how reduction variable is used.  */
    436   consumer = single_use_in_loop (PHI_RESULT (re->lcssa_phi), m_outer);
    437   if (!consumer
    438       || !gimple_store_p (consumer))
    439     return;
    440 
    441   re->fini_ref = gimple_get_lhs (consumer);
    442   re->consumer = consumer;
    443 
    444   /* Simple reduction with constant initializer.  */
    445   if (!re->init_ref)
    446     {
    447       gcc_assert (CONSTANT_CLASS_P (re->init));
    448       re->init_ref = unshare_expr (re->fini_ref);
    449     }
    450 
    451   /* Require memory references in producer and consumer are the same so
    452      that we can undo reduction during interchange.  */
    453   if (re->init_ref && !operand_equal_p (re->init_ref, re->fini_ref, 0))
    454     return;
    455 
    456   re->type = SIMPLE_RTYPE;
    457 }
    458 
    459 /* Analyze reduction variable VAR for inner loop of the loop nest to be
    460    interchanged.  Return true if analysis succeeds.  */
    461 
    462 bool
    463 loop_cand::analyze_iloop_reduction_var (tree var)
    464 {
    465   gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (var));
    466   gphi *lcssa_phi = NULL, *use_phi;
    467   tree init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (m_loop));
    468   tree next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (m_loop));
    469   reduction_p re;
    470   gimple *stmt, *next_def, *single_use = NULL;
    471   use_operand_p use_p;
    472   imm_use_iterator iterator;
    473 
    474   if (TREE_CODE (next) != SSA_NAME)
    475     return false;
    476 
    477   next_def = SSA_NAME_DEF_STMT (next);
    478   basic_block bb = gimple_bb (next_def);
    479   if (!bb || !flow_bb_inside_loop_p (m_loop, bb))
    480     return false;
    481 
    482   /* In restricted reduction, the var is (and must be) used in defining
    483      the updated var.  The process can be depicted as below:
    484 
    485 		var ;; = PHI<init, next>
    486 		 |
    487 		 |
    488 		 v
    489       +---------------------+
    490       | reduction operators | <-- other operands
    491       +---------------------+
    492 		 |
    493 		 |
    494 		 v
    495 		next
    496 
    497      In terms loop interchange, we don't change how NEXT is computed based
    498      on VAR and OTHER OPERANDS.  In case of double reduction in loop nest
    499      to be interchanged, we don't changed it at all.  In the case of simple
    500      reduction in inner loop, we only make change how VAR/NEXT is loaded or
    501      stored.  With these conditions, we can relax restrictions on reduction
    502      in a way that reduction operation is seen as black box.  In general,
    503      we can ignore reassociation of reduction operator; we can handle fake
    504      reductions in which VAR is not even used to compute NEXT.  */
    505   if (! single_imm_use (var, &use_p, &single_use)
    506       || ! flow_bb_inside_loop_p (m_loop, gimple_bb (single_use)))
    507     return false;
    508 
    509   /* Check the reduction operation.  We require a left-associative operation.
    510      For FP math we also need to be allowed to associate operations.  */
    511   if (gassign *ass = dyn_cast <gassign *> (single_use))
    512     {
    513       enum tree_code code = gimple_assign_rhs_code (ass);
    514       if (! (associative_tree_code (code)
    515 	     || (code == MINUS_EXPR
    516 		 && use_p->use == gimple_assign_rhs1_ptr (ass)))
    517 	  || (FLOAT_TYPE_P (TREE_TYPE (var))
    518 	      && ! flag_associative_math))
    519 	return false;
    520     }
    521   else
    522     return false;
    523 
    524   /* Handle and verify a series of stmts feeding the reduction op.  */
    525   if (single_use != next_def
    526       && !check_reduction_path (dump_user_location_t (), m_loop, phi, next,
    527 				gimple_assign_rhs_code (single_use)))
    528     return false;
    529 
    530   /* Only support cases in which INIT is used in inner loop.  */
    531   if (TREE_CODE (init) == SSA_NAME)
    532     FOR_EACH_IMM_USE_FAST (use_p, iterator, init)
    533       {
    534 	stmt = USE_STMT (use_p);
    535 	if (is_gimple_debug (stmt))
    536 	  continue;
    537 
    538 	if (!flow_bb_inside_loop_p (m_loop, gimple_bb (stmt)))
    539 	  return false;
    540       }
    541 
    542   FOR_EACH_IMM_USE_FAST (use_p, iterator, next)
    543     {
    544       stmt = USE_STMT (use_p);
    545       if (is_gimple_debug (stmt))
    546 	continue;
    547 
    548       /* Or else it's used in PHI itself.  */
    549       use_phi = dyn_cast <gphi *> (stmt);
    550       if (use_phi == phi)
    551 	continue;
    552 
    553       if (use_phi != NULL
    554 	  && lcssa_phi == NULL
    555 	  && gimple_bb (stmt) == m_exit->dest
    556 	  && PHI_ARG_DEF_FROM_EDGE (use_phi, m_exit) == next)
    557 	lcssa_phi = use_phi;
    558       else
    559 	return false;
    560     }
    561   if (!lcssa_phi)
    562     return false;
    563 
    564   re = XCNEW (struct reduction);
    565   re->var = var;
    566   re->init = init;
    567   re->next = next;
    568   re->phi = phi;
    569   re->lcssa_phi = lcssa_phi;
    570 
    571   classify_simple_reduction (re);
    572 
    573   if (dump_file && (dump_flags & TDF_DETAILS))
    574     dump_reduction (re);
    575 
    576   m_reductions.safe_push (re);
    577   return true;
    578 }
    579 
    580 /* Analyze reduction variable VAR for outer loop of the loop nest to be
    581    interchanged.  ILOOP is not NULL and points to inner loop.  For the
    582    moment, we only support double reduction for outer loop, like:
    583 
    584      for (int i = 0; i < n; i++)
    585        {
    586 	 int sum = 0;
    587 
    588 	 for (int j = 0; j < n; j++)    // outer loop
    589 	   for (int k = 0; k < n; k++)  // inner loop
    590 	     sum += a[i][k]*b[k][j];
    591 
    592 	 s[i] = sum;
    593        }
    594 
    595    Note the innermost two loops are the loop nest to be interchanged.
    596    Return true if analysis succeeds.  */
    597 
    598 bool
    599 loop_cand::analyze_oloop_reduction_var (loop_cand *iloop, tree var)
    600 {
    601   gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (var));
    602   gphi *lcssa_phi = NULL, *use_phi;
    603   tree init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (m_loop));
    604   tree next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (m_loop));
    605   reduction_p re;
    606   gimple *stmt, *next_def;
    607   use_operand_p use_p;
    608   imm_use_iterator iterator;
    609 
    610   if (TREE_CODE (next) != SSA_NAME)
    611     return false;
    612 
    613   next_def = SSA_NAME_DEF_STMT (next);
    614   basic_block bb = gimple_bb (next_def);
    615   if (!bb || !flow_bb_inside_loop_p (m_loop, bb))
    616     return false;
    617 
    618   /* Find inner loop's simple reduction that uses var as initializer.  */
    619   reduction_p inner_re = NULL;
    620   for (unsigned i = 0; iloop->m_reductions.iterate (i, &inner_re); ++i)
    621     if (inner_re->init == var || operand_equal_p (inner_re->init, var, 0))
    622       break;
    623 
    624   if (inner_re == NULL
    625       || inner_re->type != UNKNOWN_RTYPE
    626       || inner_re->producer != phi)
    627     return false;
    628 
    629   /* In case of double reduction, outer loop's reduction should be updated
    630      by inner loop's simple reduction.  */
    631   if (next_def != inner_re->lcssa_phi)
    632     return false;
    633 
    634   /* Outer loop's reduction should only be used to initialize inner loop's
    635      simple reduction.  */
    636   if (! single_imm_use (var, &use_p, &stmt)
    637       || stmt != inner_re->phi)
    638     return false;
    639 
    640   /* Check this reduction is correctly used outside of loop via lcssa phi.  */
    641   FOR_EACH_IMM_USE_FAST (use_p, iterator, next)
    642     {
    643       stmt = USE_STMT (use_p);
    644       if (is_gimple_debug (stmt))
    645 	continue;
    646 
    647       /* Or else it's used in PHI itself.  */
    648       use_phi = dyn_cast <gphi *> (stmt);
    649       if (use_phi == phi)
    650 	continue;
    651 
    652       if (lcssa_phi == NULL
    653 	  && use_phi != NULL
    654 	  && gimple_bb (stmt) == m_exit->dest
    655 	  && PHI_ARG_DEF_FROM_EDGE (use_phi, m_exit) == next)
    656 	lcssa_phi = use_phi;
    657       else
    658 	return false;
    659     }
    660   if (!lcssa_phi)
    661     return false;
    662 
    663   re = XCNEW (struct reduction);
    664   re->var = var;
    665   re->init = init;
    666   re->next = next;
    667   re->phi = phi;
    668   re->lcssa_phi = lcssa_phi;
    669   re->type = DOUBLE_RTYPE;
    670   inner_re->type = DOUBLE_RTYPE;
    671 
    672   if (dump_file && (dump_flags & TDF_DETAILS))
    673     dump_reduction (re);
    674 
    675   m_reductions.safe_push (re);
    676   return true;
    677 }
    678 
    679 /* Return true if VAR is induction variable of current loop whose scev is
    680    specified by CHREC.  */
    681 
    682 bool
    683 loop_cand::analyze_induction_var (tree var, tree chrec)
    684 {
    685   gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (var));
    686   tree init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (m_loop));
    687 
    688   /* Var is loop invariant, though it's unlikely to happen.  */
    689   if (tree_does_not_contain_chrecs (chrec))
    690     {
    691       /* Punt on floating point invariants if honoring signed zeros,
    692 	 representing that as + 0.0 would change the result if init
    693 	 is -0.0.  Similarly for SNaNs it can raise exception.  */
    694       if (HONOR_SIGNED_ZEROS (chrec) || HONOR_SNANS (chrec))
    695 	return false;
    696       struct induction *iv = XCNEW (struct induction);
    697       iv->var = var;
    698       iv->init_val = init;
    699       iv->init_expr = chrec;
    700       iv->step = build_zero_cst (TREE_TYPE (chrec));
    701       m_inductions.safe_push (iv);
    702       return true;
    703     }
    704 
    705   if (TREE_CODE (chrec) != POLYNOMIAL_CHREC
    706       || CHREC_VARIABLE (chrec) != (unsigned) m_loop->num
    707       || tree_contains_chrecs (CHREC_LEFT (chrec), NULL)
    708       || tree_contains_chrecs (CHREC_RIGHT (chrec), NULL))
    709     return false;
    710 
    711   struct induction *iv = XCNEW (struct induction);
    712   iv->var = var;
    713   iv->init_val = init;
    714   iv->init_expr = CHREC_LEFT (chrec);
    715   iv->step = CHREC_RIGHT (chrec);
    716 
    717   if (dump_file && (dump_flags & TDF_DETAILS))
    718     dump_induction (m_loop, iv);
    719 
    720   m_inductions.safe_push (iv);
    721   return true;
    722 }
    723 
    724 /* Return true if all loop carried variables defined in loop header can
    725    be successfully analyzed.  */
    726 
    727 bool
    728 loop_cand::analyze_carried_vars (loop_cand *iloop)
    729 {
    730   edge e = loop_preheader_edge (m_outer);
    731   gphi_iterator gsi;
    732 
    733   if (dump_file && (dump_flags & TDF_DETAILS))
    734     fprintf (dump_file, "\nLoop(%d) carried vars:\n", m_loop->num);
    735 
    736   for (gsi = gsi_start_phis (m_loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
    737     {
    738       gphi *phi = gsi.phi ();
    739 
    740       tree var = PHI_RESULT (phi);
    741       if (virtual_operand_p (var))
    742 	continue;
    743 
    744       tree chrec = analyze_scalar_evolution (m_loop, var);
    745       chrec = instantiate_scev (e, m_loop, chrec);
    746 
    747       /* Analyze var as reduction variable.  */
    748       if (chrec_contains_undetermined (chrec)
    749 	  || chrec_contains_symbols_defined_in_loop (chrec, m_outer->num))
    750 	{
    751 	  if (iloop && !analyze_oloop_reduction_var (iloop, var))
    752 	    return false;
    753 	  if (!iloop && !analyze_iloop_reduction_var (var))
    754 	    return false;
    755 	}
    756       /* Analyze var as induction variable.  */
    757       else if (!analyze_induction_var (var, chrec))
    758 	return false;
    759     }
    760 
    761   return true;
    762 }
    763 
    764 /* Return TRUE if loop closed PHI nodes can be analyzed successfully.  */
    765 
    766 bool
    767 loop_cand::analyze_lcssa_phis (void)
    768 {
    769   gphi_iterator gsi;
    770   for (gsi = gsi_start_phis (m_exit->dest); !gsi_end_p (gsi); gsi_next (&gsi))
    771     {
    772       gphi *phi = gsi.phi ();
    773 
    774       if (virtual_operand_p (PHI_RESULT (phi)))
    775 	continue;
    776 
    777       /* TODO: We only support lcssa phi for reduction for now.  */
    778       if (!find_reduction_by_stmt (phi))
    779 	return false;
    780     }
    781 
    782   return true;
    783 }
    784 
    785 /* CONSUMER is a stmt in BB storing reduction result into memory object.
    786    When the reduction is intialized from constant value, we need to add
    787    a stmt loading from the memory object to target basic block in inner
    788    loop during undoing the reduction.  Problem is that memory reference
    789    may use ssa variables not dominating the target basic block.  This
    790    function finds all stmts on which CONSUMER depends in basic block BB,
    791    records and returns them via STMTS.  */
    792 
    793 static void
    794 find_deps_in_bb_for_stmt (gimple_seq *stmts, basic_block bb, gimple *consumer)
    795 {
    796   auto_vec<gimple *, 4> worklist;
    797   use_operand_p use_p;
    798   ssa_op_iter iter;
    799   gimple *stmt, *def_stmt;
    800   gimple_stmt_iterator gsi;
    801 
    802   /* First clear flag for stmts in bb.  */
    803   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    804     gimple_set_plf (gsi_stmt (gsi), GF_PLF_1, false);
    805 
    806   /* DFS search all depended stmts in bb and mark flag for these stmts.  */
    807   worklist.safe_push (consumer);
    808   while (!worklist.is_empty ())
    809     {
    810       stmt = worklist.pop ();
    811       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
    812 	{
    813 	  def_stmt = SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p));
    814 
    815 	  if (is_a <gphi *> (def_stmt)
    816 	      || gimple_bb (def_stmt) != bb
    817 	      || gimple_plf (def_stmt, GF_PLF_1))
    818 	    continue;
    819 
    820 	  worklist.safe_push (def_stmt);
    821 	}
    822       gimple_set_plf (stmt, GF_PLF_1, true);
    823     }
    824   for (gsi = gsi_start_nondebug_bb (bb);
    825        !gsi_end_p (gsi) && (stmt = gsi_stmt (gsi)) != consumer;)
    826     {
    827       /* Move dep stmts to sequence STMTS.  */
    828       if (gimple_plf (stmt, GF_PLF_1))
    829 	{
    830 	  gsi_remove (&gsi, false);
    831 	  gimple_seq_add_stmt_without_update (stmts, stmt);
    832 	}
    833       else
    834 	gsi_next_nondebug (&gsi);
    835     }
    836 }
    837 
    838 /* User can write, optimizers can generate simple reduction RE for inner
    839    loop.  In order to make interchange valid, we have to undo reduction by
    840    moving producer and consumer stmts into the inner loop.  For example,
    841    below code:
    842 
    843      init = MEM_REF[idx];		//producer
    844      loop:
    845        var = phi<init, next>
    846        next = var op ...
    847      reduc_sum = phi<next>
    848      MEM_REF[idx] = reduc_sum		//consumer
    849 
    850    is transformed into:
    851 
    852      loop:
    853        new_var = MEM_REF[idx];		//producer after moving
    854        next = new_var op ...
    855        MEM_REF[idx] = next;		//consumer after moving
    856 
    857    Note if the reduction variable is initialized to constant, like:
    858 
    859      var = phi<0.0, next>
    860 
    861    we compute new_var as below:
    862 
    863      loop:
    864        tmp = MEM_REF[idx];
    865        new_var = !first_iteration ? tmp : 0.0;
    866 
    867    so that the initial const is used in the first iteration of loop.  Also
    868    record ssa variables for dead code elimination in DCE_SEEDS.  */
    869 
    870 void
    871 loop_cand::undo_simple_reduction (reduction_p re, bitmap dce_seeds)
    872 {
    873   gimple *stmt;
    874   gimple_stmt_iterator from, to = gsi_after_labels (m_loop->header);
    875   gimple_seq stmts = NULL;
    876   tree new_var;
    877 
    878   /* Prepare the initialization stmts and insert it to inner loop.  */
    879   if (re->producer != NULL)
    880     {
    881       gimple_set_vuse (re->producer, NULL_TREE);
    882       update_stmt (re->producer);
    883       from = gsi_for_stmt (re->producer);
    884       gsi_remove (&from, false);
    885       gimple_seq_add_stmt_without_update (&stmts, re->producer);
    886       new_var = re->init;
    887     }
    888   else
    889     {
    890       /* Find all stmts on which expression "MEM_REF[idx]" depends.  */
    891       find_deps_in_bb_for_stmt (&stmts, gimple_bb (re->consumer), re->consumer);
    892       /* Because we generate new stmt loading from the MEM_REF to TMP.  */
    893       tree cond, tmp = copy_ssa_name (re->var);
    894       stmt = gimple_build_assign (tmp, re->init_ref);
    895       gimple_seq_add_stmt_without_update (&stmts, stmt);
    896 
    897       /* Init new_var to MEM_REF or CONST depending on if it is the first
    898 	 iteration.  */
    899       induction_p iv = m_inductions[0];
    900       cond = fold_build2 (NE_EXPR, boolean_type_node, iv->var, iv->init_val);
    901       new_var = copy_ssa_name (re->var);
    902       stmt = gimple_build_assign (new_var, COND_EXPR, cond, tmp, re->init);
    903       gimple_seq_add_stmt_without_update (&stmts, stmt);
    904     }
    905   gsi_insert_seq_before (&to, stmts, GSI_SAME_STMT);
    906 
    907   /* Replace all uses of reduction var with new variable.  */
    908   use_operand_p use_p;
    909   imm_use_iterator iterator;
    910   FOR_EACH_IMM_USE_STMT (stmt, iterator, re->var)
    911     {
    912       FOR_EACH_IMM_USE_ON_STMT (use_p, iterator)
    913 	SET_USE (use_p, new_var);
    914 
    915       update_stmt (stmt);
    916     }
    917 
    918   /* Move consumer stmt into inner loop, just after reduction next's def.  */
    919   unlink_stmt_vdef (re->consumer);
    920   release_ssa_name (gimple_vdef (re->consumer));
    921   gimple_set_vdef (re->consumer, NULL_TREE);
    922   gimple_set_vuse (re->consumer, NULL_TREE);
    923   gimple_assign_set_rhs1 (re->consumer, re->next);
    924   update_stmt (re->consumer);
    925   from = gsi_for_stmt (re->consumer);
    926   to = gsi_for_stmt (SSA_NAME_DEF_STMT (re->next));
    927   gsi_move_after (&from, &to);
    928 
    929   /* Mark the reduction variables for DCE.  */
    930   bitmap_set_bit (dce_seeds, SSA_NAME_VERSION (re->var));
    931   bitmap_set_bit (dce_seeds, SSA_NAME_VERSION (PHI_RESULT (re->lcssa_phi)));
    932 }
    933 
    934 /* Free DATAREFS and its auxiliary memory.  */
    935 
    936 static void
    937 free_data_refs_with_aux (vec<data_reference_p> datarefs)
    938 {
    939   data_reference_p dr;
    940   for (unsigned i = 0; datarefs.iterate (i, &dr); ++i)
    941     if (dr->aux != NULL)
    942       {
    943 	DR_ACCESS_STRIDE (dr)->release ();
    944 	delete (vec<tree> *) dr->aux;
    945       }
    946 
    947   free_data_refs (datarefs);
    948 }
    949 
    950 /* Class for loop interchange transformation.  */
    951 
    952 class tree_loop_interchange
    953 {
    954 public:
    955   tree_loop_interchange (vec<class loop *> loop_nest)
    956     : m_loop_nest (loop_nest), m_niters_iv_var (NULL_TREE),
    957       m_dce_seeds (BITMAP_ALLOC (NULL)) { }
    958   ~tree_loop_interchange () { BITMAP_FREE (m_dce_seeds); }
    959   bool interchange (vec<data_reference_p>, vec<ddr_p>);
    960 
    961 private:
    962   void update_data_info (unsigned, unsigned, vec<data_reference_p>, vec<ddr_p>);
    963   bool valid_data_dependences (unsigned, unsigned, vec<ddr_p>);
    964   void interchange_loops (loop_cand &, loop_cand &);
    965   void map_inductions_to_loop (loop_cand &, loop_cand &);
    966   void move_code_to_inner_loop (class loop *, class loop *, basic_block *);
    967 
    968   /* The whole loop nest in which interchange is ongoing.  */
    969   vec<class loop *> m_loop_nest;
    970   /* We create new IV which is only used in loop's exit condition check.
    971      In case of 3-level loop nest interchange, when we interchange the
    972      innermost two loops, new IV created in the middle level loop does
    973      not need to be preserved in interchanging the outermost two loops
    974      later.  We record the IV so that it can be skipped.  */
    975   tree m_niters_iv_var;
    976   /* Bitmap of seed variables for dead code elimination after interchange.  */
    977   bitmap m_dce_seeds;
    978 };
    979 
    980 /* Update data refs' access stride and dependence information after loop
    981    interchange.  I_IDX/O_IDX gives indices of interchanged loops in loop
    982    nest.  DATAREFS are data references.  DDRS are data dependences.  */
    983 
    984 void
    985 tree_loop_interchange::update_data_info (unsigned i_idx, unsigned o_idx,
    986 					 vec<data_reference_p> datarefs,
    987 					 vec<ddr_p> ddrs)
    988 {
    989   struct data_reference *dr;
    990   struct data_dependence_relation *ddr;
    991 
    992   /* Update strides of data references.  */
    993   for (unsigned i = 0; datarefs.iterate (i, &dr); ++i)
    994     {
    995       vec<tree> *stride = DR_ACCESS_STRIDE (dr);
    996       gcc_assert (stride->length () > i_idx);
    997       std::swap ((*stride)[i_idx], (*stride)[o_idx]);
    998     }
    999   /* Update data dependences.  */
   1000   for (unsigned i = 0; ddrs.iterate (i, &ddr); ++i)
   1001     if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
   1002       {
   1003         for (unsigned j = 0; j < DDR_NUM_DIST_VECTS (ddr); ++j)
   1004 	  {
   1005 	    lambda_vector dist_vect = DDR_DIST_VECT (ddr, j);
   1006 	    std::swap (dist_vect[i_idx], dist_vect[o_idx]);
   1007 	  }
   1008       }
   1009 }
   1010 
   1011 /* Check data dependence relations, return TRUE if it's valid to interchange
   1012    two loops specified by I_IDX/O_IDX.  Theoretically, interchanging the two
   1013    loops is valid only if dist vector, after interchanging, doesn't have '>'
   1014    as the leftmost non-'=' direction.  Practically, this function have been
   1015    conservative here by not checking some valid cases.  */
   1016 
   1017 bool
   1018 tree_loop_interchange::valid_data_dependences (unsigned i_idx, unsigned o_idx,
   1019 					       vec<ddr_p> ddrs)
   1020 {
   1021   struct data_dependence_relation *ddr;
   1022 
   1023   for (unsigned i = 0; ddrs.iterate (i, &ddr); ++i)
   1024     {
   1025       /* Skip no-dependence case.  */
   1026       if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
   1027 	continue;
   1028 
   1029       for (unsigned j = 0; j < DDR_NUM_DIST_VECTS (ddr); ++j)
   1030 	{
   1031 	  lambda_vector dist_vect = DDR_DIST_VECT (ddr, j);
   1032 	  unsigned level = dependence_level (dist_vect, m_loop_nest.length ());
   1033 
   1034 	  /* If there is no carried dependence.  */
   1035 	  if (level == 0)
   1036 	    continue;
   1037 
   1038 	  level --;
   1039 
   1040 	  /* If dependence is not carried by any loop in between the two
   1041 	     loops [oloop, iloop] to interchange.  */
   1042 	  if (level < o_idx || level > i_idx)
   1043 	    continue;
   1044 
   1045 	  /* Be conservative, skip case if either direction at i_idx/o_idx
   1046 	     levels is not '=' or '<'.  */
   1047 	  if ((!DDR_REVERSED_P (ddr) && dist_vect[i_idx] < 0)
   1048 	      || (DDR_REVERSED_P (ddr) && dist_vect[i_idx] > 0)
   1049 	      || (!DDR_REVERSED_P (ddr) && dist_vect[o_idx] < 0)
   1050 	      || (DDR_REVERSED_P (ddr) && dist_vect[o_idx] > 0))
   1051 	    return false;
   1052 	}
   1053     }
   1054 
   1055   return true;
   1056 }
   1057 
   1058 /* Interchange two loops specified by ILOOP and OLOOP.  */
   1059 
   1060 void
   1061 tree_loop_interchange::interchange_loops (loop_cand &iloop, loop_cand &oloop)
   1062 {
   1063   reduction_p re;
   1064   gimple_stmt_iterator gsi;
   1065   tree i_niters, o_niters, var_after;
   1066 
   1067   /* Undo inner loop's simple reduction.  */
   1068   for (unsigned i = 0; iloop.m_reductions.iterate (i, &re); ++i)
   1069     if (re->type != DOUBLE_RTYPE)
   1070       {
   1071 	if (re->producer)
   1072 	  reset_debug_uses (re->producer);
   1073 
   1074 	iloop.undo_simple_reduction (re, m_dce_seeds);
   1075       }
   1076 
   1077   /* Only need to reset debug uses for double reduction.  */
   1078   for (unsigned i = 0; oloop.m_reductions.iterate (i, &re); ++i)
   1079     {
   1080       gcc_assert (re->type == DOUBLE_RTYPE);
   1081       reset_debug_uses (SSA_NAME_DEF_STMT (re->var));
   1082       reset_debug_uses (SSA_NAME_DEF_STMT (re->next));
   1083     }
   1084 
   1085   /* Prepare niters for both loops.  */
   1086   class loop *loop_nest = m_loop_nest[0];
   1087   edge instantiate_below = loop_preheader_edge (loop_nest);
   1088   gsi = gsi_last_bb (loop_preheader_edge (loop_nest)->src);
   1089   i_niters = number_of_latch_executions (iloop.m_loop);
   1090   i_niters = analyze_scalar_evolution (loop_outer (iloop.m_loop), i_niters);
   1091   i_niters = instantiate_scev (instantiate_below, loop_outer (iloop.m_loop),
   1092 			       i_niters);
   1093   i_niters = force_gimple_operand_gsi (&gsi, unshare_expr (i_niters), true,
   1094 				       NULL_TREE, false, GSI_CONTINUE_LINKING);
   1095   o_niters = number_of_latch_executions (oloop.m_loop);
   1096   if (oloop.m_loop != loop_nest)
   1097     {
   1098       o_niters = analyze_scalar_evolution (loop_outer (oloop.m_loop), o_niters);
   1099       o_niters = instantiate_scev (instantiate_below, loop_outer (oloop.m_loop),
   1100 				   o_niters);
   1101     }
   1102   o_niters = force_gimple_operand_gsi (&gsi, unshare_expr (o_niters), true,
   1103 				       NULL_TREE, false, GSI_CONTINUE_LINKING);
   1104 
   1105   /* Move src's code to tgt loop.  This is necessary when src is the outer
   1106      loop and tgt is the inner loop.  */
   1107   move_code_to_inner_loop (oloop.m_loop, iloop.m_loop, oloop.m_bbs);
   1108 
   1109   /* Map outer loop's IV to inner loop, and vice versa.  */
   1110   map_inductions_to_loop (oloop, iloop);
   1111   map_inductions_to_loop (iloop, oloop);
   1112 
   1113   /* Create canonical IV for both loops.  Note canonical IV for outer/inner
   1114      loop is actually from inner/outer loop.  Also we record the new IV
   1115      created for the outer loop so that it can be skipped in later loop
   1116      interchange.  */
   1117   create_canonical_iv (oloop.m_loop, oloop.m_exit,
   1118 		       i_niters, &m_niters_iv_var, &var_after);
   1119   bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_after));
   1120   create_canonical_iv (iloop.m_loop, iloop.m_exit,
   1121 		       o_niters, NULL, &var_after);
   1122   bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_after));
   1123 
   1124   /* Scrap niters estimation of interchanged loops.  */
   1125   iloop.m_loop->any_upper_bound = false;
   1126   iloop.m_loop->any_likely_upper_bound = false;
   1127   free_numbers_of_iterations_estimates (iloop.m_loop);
   1128   oloop.m_loop->any_upper_bound = false;
   1129   oloop.m_loop->any_likely_upper_bound = false;
   1130   free_numbers_of_iterations_estimates (oloop.m_loop);
   1131 
   1132   /* Clear all cached scev information.  This is expensive but shouldn't be
   1133      a problem given we interchange in very limited times.  */
   1134   scev_reset_htab ();
   1135 
   1136   /* ???  The association between the loop data structure and the
   1137      CFG changed, so what was loop N at the source level is now
   1138      loop M.  We should think of retaining the association or breaking
   1139      it fully by creating a new loop instead of re-using the "wrong" one.  */
   1140 }
   1141 
   1142 /* Map induction variables of SRC loop to TGT loop.  The function firstly
   1143    creates the same IV of SRC loop in TGT loop, then deletes the original
   1144    IV and re-initialize it using the newly created IV.  For example, loop
   1145    nest:
   1146 
   1147      for (i = 0; i < N; i++)
   1148        for (j = 0; j < M; j++)
   1149 	 {
   1150 	   //use of i;
   1151 	   //use of j;
   1152 	 }
   1153 
   1154    will be transformed into:
   1155 
   1156      for (jj = 0; jj < M; jj++)
   1157        for (ii = 0; ii < N; ii++)
   1158 	 {
   1159 	   //use of ii;
   1160 	   //use of jj;
   1161 	 }
   1162 
   1163    after loop interchange.  */
   1164 
   1165 void
   1166 tree_loop_interchange::map_inductions_to_loop (loop_cand &src, loop_cand &tgt)
   1167 {
   1168   induction_p iv;
   1169   edge e = tgt.m_exit;
   1170   gimple_stmt_iterator incr_pos = gsi_last_bb (e->src), gsi;
   1171 
   1172   /* Map source loop's IV to target loop.  */
   1173   for (unsigned i = 0; src.m_inductions.iterate (i, &iv); ++i)
   1174     {
   1175       gimple *use_stmt, *stmt = SSA_NAME_DEF_STMT (iv->var);
   1176       gcc_assert (is_a <gphi *> (stmt));
   1177 
   1178       use_operand_p use_p;
   1179       /* Only map original IV to target loop.  */
   1180       if (m_niters_iv_var != iv->var)
   1181 	{
   1182 	  /* Map the IV by creating the same one in target loop.  */
   1183 	  tree var_before, var_after;
   1184 	  tree base = unshare_expr (iv->init_expr);
   1185 	  tree step = unshare_expr (iv->step);
   1186 	  create_iv (base, step, SSA_NAME_VAR (iv->var),
   1187 		     tgt.m_loop, &incr_pos, false, &var_before, &var_after);
   1188 	  bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_before));
   1189 	  bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_after));
   1190 
   1191 	  /* Replace uses of the original IV var with newly created IV var.  */
   1192 	  imm_use_iterator imm_iter;
   1193 	  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, iv->var)
   1194 	    {
   1195 	      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
   1196 		SET_USE (use_p, var_before);
   1197 
   1198 	      update_stmt (use_stmt);
   1199 	    }
   1200 	}
   1201 
   1202       /* Mark all uses for DCE.  */
   1203       ssa_op_iter op_iter;
   1204       FOR_EACH_PHI_OR_STMT_USE (use_p, stmt, op_iter, SSA_OP_USE)
   1205 	{
   1206 	  tree use = USE_FROM_PTR (use_p);
   1207 	  if (TREE_CODE (use) == SSA_NAME
   1208 	      && ! SSA_NAME_IS_DEFAULT_DEF (use))
   1209 	    bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (use));
   1210 	}
   1211 
   1212       /* Delete definition of the original IV in the source loop.  */
   1213       gsi = gsi_for_stmt (stmt);
   1214       remove_phi_node (&gsi, true);
   1215     }
   1216 }
   1217 
   1218 /* Move stmts of outer loop to inner loop.  */
   1219 
   1220 void
   1221 tree_loop_interchange::move_code_to_inner_loop (class loop *outer,
   1222 						class loop *inner,
   1223 						basic_block *outer_bbs)
   1224 {
   1225   basic_block oloop_exit_bb = single_exit (outer)->src;
   1226   gimple_stmt_iterator gsi, to;
   1227 
   1228   for (unsigned i = 0; i < outer->num_nodes; i++)
   1229     {
   1230       basic_block bb = outer_bbs[i];
   1231 
   1232       /* Skip basic blocks of inner loop.  */
   1233       if (flow_bb_inside_loop_p (inner, bb))
   1234 	continue;
   1235 
   1236       /* Move code from header/latch to header/latch.  */
   1237       if (bb == outer->header)
   1238 	to = gsi_after_labels (inner->header);
   1239       else if (bb == outer->latch)
   1240 	to = gsi_after_labels (inner->latch);
   1241       else
   1242 	/* Otherwise, simply move to exit->src.  */
   1243 	to = gsi_last_bb (single_exit (inner)->src);
   1244 
   1245       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
   1246 	{
   1247 	  gimple *stmt = gsi_stmt (gsi);
   1248 
   1249 	  if (oloop_exit_bb == bb
   1250 	      && stmt == gsi_stmt (gsi_last_bb (oloop_exit_bb)))
   1251 	    {
   1252 	      gsi_next (&gsi);
   1253 	      continue;
   1254 	    }
   1255 
   1256 	  if (gimple_vdef (stmt))
   1257 	    {
   1258 	      unlink_stmt_vdef (stmt);
   1259 	      release_ssa_name (gimple_vdef (stmt));
   1260 	      gimple_set_vdef (stmt, NULL_TREE);
   1261 	    }
   1262 	  if (gimple_vuse (stmt))
   1263 	    {
   1264 	      gimple_set_vuse (stmt, NULL_TREE);
   1265 	      update_stmt (stmt);
   1266 	    }
   1267 
   1268 	  reset_debug_uses (stmt);
   1269 	  gsi_move_before (&gsi, &to);
   1270 	}
   1271     }
   1272 }
   1273 
   1274 /* Given data reference DR in LOOP_NEST, the function computes DR's access
   1275    stride at each level of loop from innermost LOOP to outer.  On success,
   1276    it saves access stride at each level loop in a vector which is pointed
   1277    by DR->aux.  For example:
   1278 
   1279      int arr[100][100][100];
   1280      for (i = 0; i < 100; i++)       ;(DR->aux)strides[0] = 40000
   1281        for (j = 100; j > 0; j--)     ;(DR->aux)strides[1] = 400
   1282 	 for (k = 0; k < 100; k++)   ;(DR->aux)strides[2] = 4
   1283 	   arr[i][j - 1][k] = 0;  */
   1284 
   1285 static void
   1286 compute_access_stride (class loop *loop_nest, class loop *loop,
   1287 		       data_reference_p dr)
   1288 {
   1289   vec<tree> *strides = new vec<tree> ();
   1290   basic_block bb = gimple_bb (DR_STMT (dr));
   1291 
   1292   while (!flow_bb_inside_loop_p (loop, bb))
   1293     {
   1294       strides->safe_push (build_int_cst (sizetype, 0));
   1295       loop = loop_outer (loop);
   1296     }
   1297   gcc_assert (loop == bb->loop_father);
   1298 
   1299   tree ref = DR_REF (dr);
   1300   if (TREE_CODE (ref) == COMPONENT_REF
   1301       && DECL_BIT_FIELD (TREE_OPERAND (ref, 1)))
   1302     {
   1303       /* We can't take address of bitfields.  If the bitfield is at constant
   1304 	 offset from the start of the struct, just use address of the
   1305 	 struct, for analysis of the strides that shouldn't matter.  */
   1306       if (!TREE_OPERAND (ref, 2)
   1307 	  || TREE_CODE (TREE_OPERAND (ref, 2)) == INTEGER_CST)
   1308 	ref = TREE_OPERAND (ref, 0);
   1309       /* Otherwise, if we have a bit field representative, use that.  */
   1310       else if (DECL_BIT_FIELD_REPRESENTATIVE (TREE_OPERAND (ref, 1))
   1311 	       != NULL_TREE)
   1312 	{
   1313 	  tree repr = DECL_BIT_FIELD_REPRESENTATIVE (TREE_OPERAND (ref, 1));
   1314 	  ref = build3 (COMPONENT_REF, TREE_TYPE (repr), TREE_OPERAND (ref, 0),
   1315 			repr, TREE_OPERAND (ref, 2));
   1316 	}
   1317       /* Otherwise punt.  */
   1318       else
   1319 	{
   1320 	  dr->aux = strides;
   1321 	  return;
   1322 	}
   1323     }
   1324   tree scev_base = build_fold_addr_expr (ref);
   1325   tree scev = analyze_scalar_evolution (loop, scev_base);
   1326   scev = instantiate_scev (loop_preheader_edge (loop_nest), loop, scev);
   1327   if (! chrec_contains_undetermined (scev))
   1328     {
   1329       tree sl = scev;
   1330       class loop *expected = loop;
   1331       while (TREE_CODE (sl) == POLYNOMIAL_CHREC)
   1332 	{
   1333 	  class loop *sl_loop = get_chrec_loop (sl);
   1334 	  while (sl_loop != expected)
   1335 	    {
   1336 	      strides->safe_push (size_int (0));
   1337 	      expected = loop_outer (expected);
   1338 	    }
   1339 	  strides->safe_push (CHREC_RIGHT (sl));
   1340 	  sl = CHREC_LEFT (sl);
   1341 	  expected = loop_outer (expected);
   1342 	}
   1343       if (! tree_contains_chrecs (sl, NULL))
   1344 	while (expected != loop_outer (loop_nest))
   1345 	  {
   1346 	    strides->safe_push (size_int (0));
   1347 	    expected = loop_outer (expected);
   1348 	  }
   1349     }
   1350 
   1351   dr->aux = strides;
   1352 }
   1353 
   1354 /* Given loop nest LOOP_NEST with innermost LOOP, the function computes
   1355    access strides with respect to each level loop for all data refs in
   1356    DATAREFS from inner loop to outer loop.  On success, it returns the
   1357    outermost loop that access strides can be computed successfully for
   1358    all data references.  If access strides cannot be computed at least
   1359    for two levels of loop for any data reference, it returns NULL.  */
   1360 
   1361 static class loop *
   1362 compute_access_strides (class loop *loop_nest, class loop *loop,
   1363 			vec<data_reference_p> datarefs)
   1364 {
   1365   unsigned i, j, num_loops = (unsigned) -1;
   1366   data_reference_p dr;
   1367   vec<tree> *stride;
   1368 
   1369   for (i = 0; datarefs.iterate (i, &dr); ++i)
   1370     {
   1371       compute_access_stride (loop_nest, loop, dr);
   1372       stride = DR_ACCESS_STRIDE (dr);
   1373       if (stride->length () < num_loops)
   1374 	{
   1375 	  num_loops = stride->length ();
   1376 	  if (num_loops < 2)
   1377 	    return NULL;
   1378 	}
   1379     }
   1380 
   1381   for (i = 0; datarefs.iterate (i, &dr); ++i)
   1382     {
   1383       stride = DR_ACCESS_STRIDE (dr);
   1384       if (stride->length () > num_loops)
   1385 	stride->truncate (num_loops);
   1386 
   1387       for (j = 0; j < (num_loops >> 1); ++j)
   1388 	std::swap ((*stride)[j], (*stride)[num_loops - j - 1]);
   1389     }
   1390 
   1391   loop = superloop_at_depth (loop, loop_depth (loop) + 1 - num_loops);
   1392   gcc_assert (loop_nest == loop || flow_loop_nested_p (loop_nest, loop));
   1393   return loop;
   1394 }
   1395 
   1396 /* Prune access strides for data references in DATAREFS by removing strides
   1397    of loops that isn't in current LOOP_NEST.  */
   1398 
   1399 static void
   1400 prune_access_strides_not_in_loop (class loop *loop_nest,
   1401 				  class loop *innermost,
   1402 				  vec<data_reference_p> datarefs)
   1403 {
   1404   data_reference_p dr;
   1405   unsigned num_loops = loop_depth (innermost) - loop_depth (loop_nest) + 1;
   1406   gcc_assert (num_loops > 1);
   1407 
   1408   /* Block remove strides of loops that is not in current loop nest.  */
   1409   for (unsigned i = 0; datarefs.iterate (i, &dr); ++i)
   1410     {
   1411       vec<tree> *stride = DR_ACCESS_STRIDE (dr);
   1412       if (stride->length () > num_loops)
   1413 	stride->block_remove (0, stride->length () - num_loops);
   1414     }
   1415 }
   1416 
   1417 /* Dump access strides for all DATAREFS.  */
   1418 
   1419 static void
   1420 dump_access_strides (vec<data_reference_p> datarefs)
   1421 {
   1422   data_reference_p dr;
   1423   fprintf (dump_file, "Access Strides for DRs:\n");
   1424   for (unsigned i = 0; datarefs.iterate (i, &dr); ++i)
   1425     {
   1426       fprintf (dump_file, "  ");
   1427       print_generic_expr (dump_file, DR_REF (dr), TDF_SLIM);
   1428       fprintf (dump_file, ":\t\t<");
   1429 
   1430       vec<tree> *stride = DR_ACCESS_STRIDE (dr);
   1431       unsigned num_loops = stride->length ();
   1432       for (unsigned j = 0; j < num_loops; ++j)
   1433 	{
   1434 	  print_generic_expr (dump_file, (*stride)[j], TDF_SLIM);
   1435 	  fprintf (dump_file, "%s", (j < num_loops - 1) ? ",\t" : ">\n");
   1436 	}
   1437     }
   1438 }
   1439 
   1440 /* Return true if it's profitable to interchange two loops whose index
   1441    in whole loop nest vector are I_IDX/O_IDX respectively.  The function
   1442    computes and compares three types information from all DATAREFS:
   1443      1) Access stride for loop I_IDX and O_IDX.
   1444      2) Number of invariant memory references with respect to I_IDX before
   1445 	and after loop interchange.
   1446      3) Flags indicating if all memory references access sequential memory
   1447 	in ILOOP, before and after loop interchange.
   1448    If INNMOST_LOOP_P is true, the two loops for interchanging are the two
   1449    innermost loops in loop nest.  This function also dumps information if
   1450    DUMP_INFO_P is true.  */
   1451 
   1452 static bool
   1453 should_interchange_loops (unsigned i_idx, unsigned o_idx,
   1454 			  vec<data_reference_p> datarefs,
   1455 			  unsigned i_stmt_cost, unsigned o_stmt_cost,
   1456 			  bool innermost_loops_p, bool dump_info_p = true)
   1457 {
   1458   unsigned HOST_WIDE_INT ratio;
   1459   unsigned i, j, num_old_inv_drs = 0, num_new_inv_drs = 0;
   1460   struct data_reference *dr;
   1461   bool all_seq_dr_before_p = true, all_seq_dr_after_p = true;
   1462   widest_int iloop_strides = 0, oloop_strides = 0;
   1463   unsigned num_unresolved_drs = 0;
   1464   unsigned num_resolved_ok_drs = 0;
   1465   unsigned num_resolved_not_ok_drs = 0;
   1466 
   1467   if (dump_info_p && dump_file && (dump_flags & TDF_DETAILS))
   1468     fprintf (dump_file, "\nData ref strides:\n\tmem_ref:\t\tiloop\toloop\n");
   1469 
   1470   for (i = 0; datarefs.iterate (i, &dr); ++i)
   1471     {
   1472       vec<tree> *stride = DR_ACCESS_STRIDE (dr);
   1473       tree iloop_stride = (*stride)[i_idx], oloop_stride = (*stride)[o_idx];
   1474 
   1475       bool subloop_stride_p = false;
   1476       /* Data ref can't be invariant or sequential access at current loop if
   1477 	 its address changes with respect to any subloops.  */
   1478       for (j = i_idx + 1; j < stride->length (); ++j)
   1479 	if (!integer_zerop ((*stride)[j]))
   1480 	  {
   1481 	    subloop_stride_p = true;
   1482 	    break;
   1483 	  }
   1484 
   1485       if (integer_zerop (iloop_stride))
   1486 	{
   1487 	  if (!subloop_stride_p)
   1488 	    num_old_inv_drs++;
   1489 	}
   1490       if (integer_zerop (oloop_stride))
   1491 	{
   1492 	  if (!subloop_stride_p)
   1493 	    num_new_inv_drs++;
   1494 	}
   1495 
   1496       if (TREE_CODE (iloop_stride) == INTEGER_CST
   1497 	  && TREE_CODE (oloop_stride) == INTEGER_CST)
   1498 	{
   1499 	  iloop_strides = wi::add (iloop_strides, wi::to_widest (iloop_stride));
   1500 	  oloop_strides = wi::add (oloop_strides, wi::to_widest (oloop_stride));
   1501 	}
   1502       else if (multiple_of_p (TREE_TYPE (iloop_stride),
   1503 			      iloop_stride, oloop_stride))
   1504 	num_resolved_ok_drs++;
   1505       else if (multiple_of_p (TREE_TYPE (iloop_stride),
   1506 			      oloop_stride, iloop_stride))
   1507 	num_resolved_not_ok_drs++;
   1508       else
   1509 	num_unresolved_drs++;
   1510 
   1511       /* Data ref can't be sequential access if its address changes in sub
   1512 	 loop.  */
   1513       if (subloop_stride_p)
   1514 	{
   1515 	  all_seq_dr_before_p = false;
   1516 	  all_seq_dr_after_p = false;
   1517 	  continue;
   1518 	}
   1519       /* Track if all data references are sequential accesses before/after loop
   1520 	 interchange.  Note invariant is considered sequential here.  */
   1521       tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
   1522       if (all_seq_dr_before_p
   1523 	  && ! (integer_zerop (iloop_stride)
   1524 		|| operand_equal_p (access_size, iloop_stride, 0)))
   1525 	all_seq_dr_before_p = false;
   1526       if (all_seq_dr_after_p
   1527 	  && ! (integer_zerop (oloop_stride)
   1528 		|| operand_equal_p (access_size, oloop_stride, 0)))
   1529 	all_seq_dr_after_p = false;
   1530     }
   1531 
   1532   if (dump_info_p && dump_file && (dump_flags & TDF_DETAILS))
   1533     {
   1534       fprintf (dump_file, "\toverall:\t\t");
   1535       print_decu (iloop_strides, dump_file);
   1536       fprintf (dump_file, "\t");
   1537       print_decu (oloop_strides, dump_file);
   1538       fprintf (dump_file, "\n");
   1539 
   1540       fprintf (dump_file, "Invariant data ref: before(%d), after(%d)\n",
   1541 	       num_old_inv_drs, num_new_inv_drs);
   1542       fprintf (dump_file, "All consecutive stride: before(%s), after(%s)\n",
   1543 	       all_seq_dr_before_p ? "true" : "false",
   1544 	       all_seq_dr_after_p ? "true" : "false");
   1545       fprintf (dump_file, "OK to interchage with variable strides: %d\n",
   1546 	       num_resolved_ok_drs);
   1547       fprintf (dump_file, "Not OK to interchage with variable strides: %d\n",
   1548 	       num_resolved_not_ok_drs);
   1549       fprintf (dump_file, "Variable strides we cannot decide: %d\n",
   1550 	       num_unresolved_drs);
   1551       fprintf (dump_file, "Stmt cost of inner loop: %d\n", i_stmt_cost);
   1552       fprintf (dump_file, "Stmt cost of outer loop: %d\n", o_stmt_cost);
   1553     }
   1554 
   1555   if (num_unresolved_drs != 0 || num_resolved_not_ok_drs != 0)
   1556     return false;
   1557 
   1558   /* Stmts of outer loop will be moved to inner loop.  If there are two many
   1559      such stmts, it could make inner loop costly.  Here we compare stmt cost
   1560      between outer and inner loops.  */
   1561   if (i_stmt_cost && o_stmt_cost
   1562       && num_old_inv_drs + o_stmt_cost > num_new_inv_drs
   1563       && o_stmt_cost * STMT_COST_RATIO > i_stmt_cost)
   1564     return false;
   1565 
   1566   /* We use different stride comparison ratio for interchanging innermost
   1567      two loops or not.  The idea is to be conservative in interchange for
   1568      the innermost loops.  */
   1569   ratio = innermost_loops_p ? INNER_STRIDE_RATIO : OUTER_STRIDE_RATIO;
   1570   /* Do interchange if it gives better data locality behavior.  */
   1571   if (wi::gtu_p (iloop_strides, wi::mul (oloop_strides, ratio)))
   1572     return true;
   1573   if (wi::gtu_p (iloop_strides, oloop_strides))
   1574     {
   1575       /* Or it creates more invariant memory references.  */
   1576       if ((!all_seq_dr_before_p || all_seq_dr_after_p)
   1577 	  && num_new_inv_drs > num_old_inv_drs)
   1578 	return true;
   1579       /* Or it makes all memory references sequential.  */
   1580       if (num_new_inv_drs >= num_old_inv_drs
   1581 	  && !all_seq_dr_before_p && all_seq_dr_after_p)
   1582 	return true;
   1583     }
   1584 
   1585   return false;
   1586 }
   1587 
   1588 /* Try to interchange inner loop of a loop nest to outer level.  */
   1589 
   1590 bool
   1591 tree_loop_interchange::interchange (vec<data_reference_p> datarefs,
   1592 				    vec<ddr_p> ddrs)
   1593 {
   1594   dump_user_location_t loc = find_loop_location (m_loop_nest[0]);
   1595   bool changed_p = false;
   1596   /* In each iteration we try to interchange I-th loop with (I+1)-th loop.
   1597      The overall effect is to push inner loop to outermost level in whole
   1598      loop nest.  */
   1599   for (unsigned i = m_loop_nest.length (); i > 1; --i)
   1600     {
   1601       unsigned i_idx = i - 1, o_idx = i - 2;
   1602 
   1603       /* Check validity for loop interchange.  */
   1604       if (!valid_data_dependences (i_idx, o_idx, ddrs))
   1605 	break;
   1606 
   1607       loop_cand iloop (m_loop_nest[i_idx], m_loop_nest[o_idx]);
   1608       loop_cand oloop (m_loop_nest[o_idx], m_loop_nest[o_idx]);
   1609 
   1610       /* Check if we can do transformation for loop interchange.  */
   1611       if (!iloop.analyze_carried_vars (NULL)
   1612 	  || !iloop.analyze_lcssa_phis ()
   1613 	  || !oloop.analyze_carried_vars (&iloop)
   1614 	  || !oloop.analyze_lcssa_phis ()
   1615 	  || !iloop.can_interchange_p (NULL)
   1616 	  || !oloop.can_interchange_p (&iloop))
   1617 	break;
   1618 
   1619       /* Outer loop's stmts will be moved to inner loop during interchange.
   1620 	 If there are many of them, it may make inner loop very costly.  We
   1621 	 need to check number of outer loop's stmts in profit cost model of
   1622 	 interchange.  */
   1623       int stmt_cost = oloop.m_num_stmts;
   1624       /* Count out the exit checking stmt of outer loop.  */
   1625       stmt_cost --;
   1626       /* Count out IV's increasing stmt, IVOPTs takes care if it.  */
   1627       stmt_cost -= oloop.m_inductions.length ();
   1628       /* Count in the additional load and cond_expr stmts caused by inner
   1629 	 loop's constant initialized reduction.  */
   1630       stmt_cost += iloop.m_const_init_reduc * 2;
   1631       if (stmt_cost < 0)
   1632 	stmt_cost = 0;
   1633 
   1634       /* Check profitability for loop interchange.  */
   1635       if (should_interchange_loops (i_idx, o_idx, datarefs,
   1636 				    (unsigned) iloop.m_num_stmts,
   1637 				    (unsigned) stmt_cost,
   1638 				    iloop.m_loop->inner == NULL))
   1639 	{
   1640 	  if (dump_file && (dump_flags & TDF_DETAILS))
   1641 	    fprintf (dump_file,
   1642 		     "Loop_pair<outer:%d, inner:%d> is interchanged\n\n",
   1643 		     oloop.m_loop->num, iloop.m_loop->num);
   1644 
   1645 	  changed_p = true;
   1646 	  interchange_loops (iloop, oloop);
   1647 	  /* No need to update if there is no further loop interchange.  */
   1648 	  if (o_idx > 0)
   1649 	    update_data_info (i_idx, o_idx, datarefs, ddrs);
   1650 	}
   1651       else
   1652 	{
   1653 	  if (dump_file && (dump_flags & TDF_DETAILS))
   1654 	    fprintf (dump_file,
   1655 		     "Loop_pair<outer:%d, inner:%d> is not interchanged\n\n",
   1656 		     oloop.m_loop->num, iloop.m_loop->num);
   1657 	}
   1658     }
   1659   simple_dce_from_worklist (m_dce_seeds);
   1660 
   1661   if (changed_p && dump_enabled_p ())
   1662     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
   1663 		     "loops interchanged in loop nest\n");
   1664 
   1665   return changed_p;
   1666 }
   1667 
   1668 
   1669 /* Loop interchange pass.  */
   1670 
   1671 namespace {
   1672 
   1673 const pass_data pass_data_linterchange =
   1674 {
   1675   GIMPLE_PASS, /* type */
   1676   "linterchange", /* name */
   1677   OPTGROUP_LOOP, /* optinfo_flags */
   1678   TV_LINTERCHANGE, /* tv_id */
   1679   PROP_cfg, /* properties_required */
   1680   0, /* properties_provided */
   1681   0, /* properties_destroyed */
   1682   0, /* todo_flags_start */
   1683   0, /* todo_flags_finish */
   1684 };
   1685 
   1686 class pass_linterchange : public gimple_opt_pass
   1687 {
   1688 public:
   1689   pass_linterchange (gcc::context *ctxt)
   1690     : gimple_opt_pass (pass_data_linterchange, ctxt)
   1691   {}
   1692 
   1693   /* opt_pass methods: */
   1694   opt_pass * clone () { return new pass_linterchange (m_ctxt); }
   1695   virtual bool gate (function *) { return flag_loop_interchange; }
   1696   virtual unsigned int execute (function *);
   1697 
   1698 }; // class pass_linterchange
   1699 
   1700 
   1701 /* Return true if LOOP has proper form for interchange.  We check three
   1702    conditions in the function:
   1703      1) In general, a loop can be interchanged only if it doesn't have
   1704 	basic blocks other than header, exit and latch besides possible
   1705 	inner loop nest.  This basically restricts loop interchange to
   1706 	below form loop nests:
   1707 
   1708           header<---+
   1709             |       |
   1710             v       |
   1711         INNER_LOOP  |
   1712             |       |
   1713             v       |
   1714           exit--->latch
   1715 
   1716      2) Data reference in basic block that executes in different times
   1717 	than loop head/exit is not allowed.
   1718      3) Record the innermost outer loop that doesn't form rectangle loop
   1719 	nest with LOOP.  */
   1720 
   1721 static bool
   1722 proper_loop_form_for_interchange (class loop *loop, class loop **min_outer)
   1723 {
   1724   edge e0, e1, exit;
   1725 
   1726   /* Don't interchange if loop has unsupported information for the moment.  */
   1727   if (loop->safelen > 0
   1728       || loop->constraints != 0
   1729       || loop->can_be_parallel
   1730       || loop->dont_vectorize
   1731       || loop->force_vectorize
   1732       || loop->in_oacc_kernels_region
   1733       || loop->orig_loop_num != 0
   1734       || loop->simduid != NULL_TREE)
   1735     return false;
   1736 
   1737   /* Don't interchange if outer loop has basic block other than header, exit
   1738      and latch.  */
   1739   if (loop->inner != NULL
   1740       && loop->num_nodes != loop->inner->num_nodes + 3)
   1741     return false;
   1742 
   1743   if ((exit = single_dom_exit (loop)) == NULL)
   1744     return false;
   1745 
   1746   /* Check control flow on loop header/exit blocks.  */
   1747   if (loop->header == exit->src
   1748       && (EDGE_COUNT (loop->header->preds) != 2
   1749 	  || EDGE_COUNT (loop->header->succs) != 2))
   1750     return false;
   1751   else if (loop->header != exit->src
   1752 	   && (EDGE_COUNT (loop->header->preds) != 2
   1753 	       || !single_succ_p (loop->header)
   1754 	       || unsupported_edge (single_succ_edge (loop->header))
   1755 	       || EDGE_COUNT (exit->src->succs) != 2
   1756 	       || !single_pred_p (exit->src)
   1757 	       || unsupported_edge (single_pred_edge (exit->src))))
   1758     return false;
   1759 
   1760   e0 = EDGE_PRED (loop->header, 0);
   1761   e1 = EDGE_PRED (loop->header, 1);
   1762   if (unsupported_edge (e0) || unsupported_edge (e1)
   1763       || (e0->src != loop->latch && e1->src != loop->latch)
   1764       || (e0->src->loop_father == loop && e1->src->loop_father == loop))
   1765     return false;
   1766 
   1767   e0 = EDGE_SUCC (exit->src, 0);
   1768   e1 = EDGE_SUCC (exit->src, 1);
   1769   if (unsupported_edge (e0) || unsupported_edge (e1)
   1770       || (e0->dest != loop->latch && e1->dest != loop->latch)
   1771       || (e0->dest->loop_father == loop && e1->dest->loop_father == loop))
   1772     return false;
   1773 
   1774   /* Don't interchange if any reference is in basic block that doesn't
   1775      dominate exit block.  */
   1776   basic_block *bbs = get_loop_body (loop);
   1777   for (unsigned i = 0; i < loop->num_nodes; i++)
   1778     {
   1779       basic_block bb = bbs[i];
   1780 
   1781       if (bb->loop_father != loop
   1782 	  || bb == loop->header || bb == exit->src
   1783 	  || dominated_by_p (CDI_DOMINATORS, exit->src, bb))
   1784 	continue;
   1785 
   1786       for (gimple_stmt_iterator gsi = gsi_start_nondebug_bb (bb);
   1787 	   !gsi_end_p (gsi); gsi_next_nondebug (&gsi))
   1788 	if (gimple_vuse (gsi_stmt (gsi)))
   1789 	  {
   1790 	    free (bbs);
   1791 	    return false;
   1792 	  }
   1793     }
   1794   free (bbs);
   1795 
   1796   tree niters = number_of_latch_executions (loop);
   1797   niters = analyze_scalar_evolution (loop_outer (loop), niters);
   1798   if (!niters || chrec_contains_undetermined (niters))
   1799     return false;
   1800 
   1801   /* Record the innermost outer loop that doesn't form rectangle loop nest.  */
   1802   for (loop_p loop2 = loop_outer (loop);
   1803        loop2 && flow_loop_nested_p (*min_outer, loop2);
   1804        loop2 = loop_outer (loop2))
   1805     {
   1806       niters = instantiate_scev (loop_preheader_edge (loop2),
   1807 				 loop_outer (loop), niters);
   1808       if (!evolution_function_is_invariant_p (niters, loop2->num))
   1809 	{
   1810 	  *min_outer = loop2;
   1811 	  break;
   1812 	}
   1813     }
   1814   return true;
   1815 }
   1816 
   1817 /* Return true if any two adjacent loops in loop nest [INNERMOST, LOOP_NEST]
   1818    should be interchanged by looking into all DATAREFS.  */
   1819 
   1820 static bool
   1821 should_interchange_loop_nest (class loop *loop_nest, class loop *innermost,
   1822 			      vec<data_reference_p> datarefs)
   1823 {
   1824   unsigned idx = loop_depth (innermost) - loop_depth (loop_nest);
   1825   gcc_assert (idx > 0);
   1826 
   1827   /* Check if any two adjacent loops should be interchanged.  */
   1828   for (class loop *loop = innermost;
   1829        loop != loop_nest; loop = loop_outer (loop), idx--)
   1830     if (should_interchange_loops (idx, idx - 1, datarefs, 0, 0,
   1831 				  loop == innermost, false))
   1832       return true;
   1833 
   1834   return false;
   1835 }
   1836 
   1837 /* Given loop nest LOOP_NEST and data references DATAREFS, compute data
   1838    dependences for loop interchange and store it in DDRS.  Note we compute
   1839    dependences directly rather than call generic interface so that we can
   1840    return on unknown dependence instantly.  */
   1841 
   1842 static bool
   1843 tree_loop_interchange_compute_ddrs (vec<loop_p> loop_nest,
   1844 				    vec<data_reference_p> datarefs,
   1845 				    vec<ddr_p> *ddrs)
   1846 {
   1847   struct data_reference *a, *b;
   1848   class loop *innermost = loop_nest.last ();
   1849 
   1850   for (unsigned i = 0; datarefs.iterate (i, &a); ++i)
   1851     {
   1852       bool a_outer_p = gimple_bb (DR_STMT (a))->loop_father != innermost;
   1853       for (unsigned j = i + 1; datarefs.iterate (j, &b); ++j)
   1854 	if (DR_IS_WRITE (a) || DR_IS_WRITE (b))
   1855 	  {
   1856 	    bool b_outer_p = gimple_bb (DR_STMT (b))->loop_father != innermost;
   1857 	    /* Don't support multiple write references in outer loop.  */
   1858 	    if (a_outer_p && b_outer_p && DR_IS_WRITE (a) && DR_IS_WRITE (b))
   1859 	      return false;
   1860 
   1861 	    ddr_p ddr = initialize_data_dependence_relation (a, b, loop_nest);
   1862 	    ddrs->safe_push (ddr);
   1863 	    compute_affine_dependence (ddr, loop_nest[0]);
   1864 
   1865 	    /* Give up if ddr is unknown dependence or classic direct vector
   1866 	       is not available.  */
   1867 	    if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
   1868 		|| (DDR_ARE_DEPENDENT (ddr) == NULL_TREE
   1869 		    && DDR_NUM_DIR_VECTS (ddr) == 0))
   1870 	      return false;
   1871 
   1872 	    /* If either data references is in outer loop of nest, we require
   1873 	       no dependence here because the data reference need to be moved
   1874 	       into inner loop during interchange.  */
   1875 	    if (a_outer_p && b_outer_p
   1876 		&& operand_equal_p (DR_REF (a), DR_REF (b), 0))
   1877 	      continue;
   1878 	    if (DDR_ARE_DEPENDENT (ddr) != chrec_known
   1879 		&& (a_outer_p || b_outer_p))
   1880 	      return false;
   1881 	}
   1882     }
   1883 
   1884   return true;
   1885 }
   1886 
   1887 /* Prune DATAREFS by removing any data reference not inside of LOOP.  */
   1888 
   1889 static inline void
   1890 prune_datarefs_not_in_loop (class loop *loop, vec<data_reference_p> datarefs)
   1891 {
   1892   unsigned i, j;
   1893   struct data_reference *dr;
   1894 
   1895   for (i = 0, j = 0; datarefs.iterate (i, &dr); ++i)
   1896     {
   1897       if (flow_bb_inside_loop_p (loop, gimple_bb (DR_STMT (dr))))
   1898 	datarefs[j++] = dr;
   1899       else
   1900 	{
   1901 	  if (dr->aux)
   1902 	    {
   1903 	      DR_ACCESS_STRIDE (dr)->release ();
   1904 	      delete (vec<tree> *) dr->aux;
   1905 	    }
   1906 	  free_data_ref (dr);
   1907 	}
   1908     }
   1909   datarefs.truncate (j);
   1910 }
   1911 
   1912 /* Find and store data references in DATAREFS for LOOP nest.  If there's
   1913    difficult data reference in a basic block, we shrink the loop nest to
   1914    inner loop of that basic block's father loop.  On success, return the
   1915    outer loop of the result loop nest.  */
   1916 
   1917 static class loop *
   1918 prepare_data_references (class loop *loop, vec<data_reference_p> *datarefs)
   1919 {
   1920   class loop *loop_nest = loop;
   1921   vec<data_reference_p> *bb_refs;
   1922   basic_block bb, *bbs = get_loop_body_in_dom_order (loop);
   1923 
   1924   for (unsigned i = 0; i < loop->num_nodes; i++)
   1925     bbs[i]->aux = NULL;
   1926 
   1927   /* Find data references for all basic blocks.  Shrink loop nest on difficult
   1928      data reference.  */
   1929   for (unsigned i = 0; loop_nest && i < loop->num_nodes; ++i)
   1930     {
   1931       bb = bbs[i];
   1932       if (!flow_bb_inside_loop_p (loop_nest, bb))
   1933 	continue;
   1934 
   1935       bb_refs = new vec<data_reference_p> ();
   1936       if (find_data_references_in_bb (loop, bb, bb_refs) == chrec_dont_know)
   1937         {
   1938 	  loop_nest = bb->loop_father->inner;
   1939 	  if (loop_nest && !loop_nest->inner)
   1940 	    loop_nest = NULL;
   1941 
   1942 	  free_data_refs (*bb_refs);
   1943           delete bb_refs;
   1944         }
   1945       else if (bb_refs->is_empty ())
   1946 	delete bb_refs;
   1947       else
   1948 	bb->aux = bb_refs;
   1949     }
   1950 
   1951   /* Collect all data references in loop nest.  */
   1952   for (unsigned i = 0; i < loop->num_nodes; i++)
   1953     {
   1954       bb = bbs[i];
   1955       if (!bb->aux)
   1956 	continue;
   1957 
   1958       bb_refs = (vec<data_reference_p> *) bb->aux;
   1959       if (loop_nest && flow_bb_inside_loop_p (loop_nest, bb))
   1960 	datarefs->safe_splice (*bb_refs);
   1961       else
   1962 	free_data_refs (*bb_refs);
   1963 
   1964       delete bb_refs;
   1965       bb->aux = NULL;
   1966     }
   1967   free (bbs);
   1968 
   1969   return loop_nest;
   1970 }
   1971 
   1972 /* Given innermost LOOP, return true if perfect loop nest can be found and
   1973    data dependences can be computed.  If succeed, record the perfect loop
   1974    nest in LOOP_NEST; record all data references in DATAREFS and record all
   1975    data dependence relations in DDRS.
   1976 
   1977    We do support a restricted form of imperfect loop nest, i.e, loop nest
   1978    with load/store in outer loop initializing/finalizing simple reduction
   1979    of the innermost loop.  For such outer loop reference, we require that
   1980    it has no dependence with others sinve it will be moved to inner loop
   1981    in interchange.  */
   1982 
   1983 static bool
   1984 prepare_perfect_loop_nest (class loop *loop, vec<loop_p> *loop_nest,
   1985 			   vec<data_reference_p> *datarefs, vec<ddr_p> *ddrs)
   1986 {
   1987   class loop *start_loop = NULL, *innermost = loop;
   1988   class loop *outermost = loops_for_fn (cfun)->tree_root;
   1989 
   1990   /* Find loop nest from the innermost loop.  The outermost is the innermost
   1991      outer*/
   1992   while (loop->num != 0 && loop->inner == start_loop
   1993 	 && flow_loop_nested_p (outermost, loop))
   1994     {
   1995       if (!proper_loop_form_for_interchange (loop, &outermost))
   1996 	break;
   1997 
   1998       start_loop = loop;
   1999       /* If this loop has sibling loop, the father loop won't be in perfect
   2000 	 loop nest.  */
   2001       if (loop->next != NULL)
   2002 	break;
   2003 
   2004       loop = loop_outer (loop);
   2005     }
   2006   if (!start_loop || !start_loop->inner)
   2007     return false;
   2008 
   2009   /* Prepare the data reference vector for the loop nest, pruning outer
   2010      loops we cannot handle.  */
   2011   start_loop = prepare_data_references (start_loop, datarefs);
   2012   if (!start_loop
   2013       /* Check if there is no data reference.  */
   2014       || datarefs->is_empty ()
   2015       /* Check if there are too many of data references.  */
   2016       || (int) datarefs->length () > MAX_DATAREFS)
   2017     return false;
   2018 
   2019   /* Compute access strides for all data references, pruning outer
   2020      loops we cannot analyze refs in.  */
   2021   start_loop = compute_access_strides (start_loop, innermost, *datarefs);
   2022   if (!start_loop)
   2023     return false;
   2024 
   2025   /* Check if any interchange is profitable in the loop nest.  */
   2026   if (!should_interchange_loop_nest (start_loop, innermost, *datarefs))
   2027     return false;
   2028 
   2029   /* Check if data dependences can be computed for loop nest starting from
   2030      start_loop.  */
   2031   loop = start_loop;
   2032   do {
   2033     loop_nest->truncate (0);
   2034 
   2035     if (loop != start_loop)
   2036       prune_datarefs_not_in_loop (start_loop, *datarefs);
   2037 
   2038     if (find_loop_nest (start_loop, loop_nest)
   2039 	&& tree_loop_interchange_compute_ddrs (*loop_nest, *datarefs, ddrs))
   2040       {
   2041 	if (dump_file && (dump_flags & TDF_DETAILS))
   2042 	  fprintf (dump_file,
   2043 		   "\nConsider loop interchange for loop_nest<%d - %d>\n",
   2044 		   start_loop->num, innermost->num);
   2045 
   2046 	if (loop != start_loop)
   2047 	  prune_access_strides_not_in_loop (start_loop, innermost, *datarefs);
   2048 
   2049 	if (dump_file && (dump_flags & TDF_DETAILS))
   2050 	  dump_access_strides (*datarefs);
   2051 
   2052 	return true;
   2053       }
   2054 
   2055     free_dependence_relations (*ddrs);
   2056     *ddrs = vNULL;
   2057     /* Try to compute data dependences with the outermost loop stripped.  */
   2058     loop = start_loop;
   2059     start_loop = start_loop->inner;
   2060   } while (start_loop && start_loop->inner);
   2061 
   2062   return false;
   2063 }
   2064 
   2065 /* Main entry for loop interchange pass.  */
   2066 
   2067 unsigned int
   2068 pass_linterchange::execute (function *fun)
   2069 {
   2070   if (number_of_loops (fun) <= 2)
   2071     return 0;
   2072 
   2073   bool changed_p = false;
   2074   class loop *loop;
   2075   FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
   2076     {
   2077       vec<loop_p> loop_nest = vNULL;
   2078       vec<data_reference_p> datarefs = vNULL;
   2079       vec<ddr_p> ddrs = vNULL;
   2080       if (prepare_perfect_loop_nest (loop, &loop_nest, &datarefs, &ddrs))
   2081 	{
   2082 	  tree_loop_interchange loop_interchange (loop_nest);
   2083 	  changed_p |= loop_interchange.interchange (datarefs, ddrs);
   2084 	}
   2085       free_dependence_relations (ddrs);
   2086       free_data_refs_with_aux (datarefs);
   2087       loop_nest.release ();
   2088     }
   2089 
   2090   return changed_p ? (TODO_update_ssa_only_virtuals) : 0;
   2091 }
   2092 
   2093 } // anon namespace
   2094 
   2095 gimple_opt_pass *
   2096 make_pass_linterchange (gcc::context *ctxt)
   2097 {
   2098   return new pass_linterchange (ctxt);
   2099 }
   2100