Home | History | Annotate | Line # | Download | only in gcc
gimple-loop-interchange.cc revision 1.1.1.1
      1 /* Loop interchange.
      2    Copyright (C) 2017-2018 Free Software Foundation, Inc.
      3    Contributed by ARM Ltd.
      4 
      5 This file is part of GCC.
      6 
      7 GCC is free software; you can redistribute it and/or modify it
      8 under the terms of the GNU General Public License as published by the
      9 Free Software Foundation; either version 3, or (at your option) any
     10 later version.
     11 
     12 GCC is distributed in the hope that it will be useful, but WITHOUT
     13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
     14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     15 for more details.
     16 
     17 You should have received a copy of the GNU General Public License
     18 along with GCC; see the file COPYING3.  If not see
     19 <http://www.gnu.org/licenses/>.  */
     20 
     21 #include "config.h"
     22 #include "system.h"
     23 #include "coretypes.h"
     24 #include "backend.h"
     25 #include "is-a.h"
     26 #include "tree.h"
     27 #include "gimple.h"
     28 #include "tree-pass.h"
     29 #include "ssa.h"
     30 #include "gimple-pretty-print.h"
     31 #include "fold-const.h"
     32 #include "gimplify.h"
     33 #include "gimple-iterator.h"
     34 #include "gimplify-me.h"
     35 #include "cfgloop.h"
     36 #include "params.h"
     37 #include "tree-ssa.h"
     38 #include "tree-scalar-evolution.h"
     39 #include "tree-ssa-loop-manip.h"
     40 #include "tree-ssa-loop-niter.h"
     41 #include "tree-ssa-loop-ivopts.h"
     42 #include "tree-ssa-dce.h"
     43 #include "tree-data-ref.h"
     44 #include "tree-vectorizer.h"
     45 
     46 /* This pass performs loop interchange: for example, the loop nest
     47 
     48    for (int j = 0; j < N; j++)
     49      for (int k = 0; k < N; k++)
     50        for (int i = 0; i < N; i++)
     51 	 c[i][j] = c[i][j] + a[i][k]*b[k][j];
     52 
     53    is transformed to
     54 
     55    for (int i = 0; i < N; i++)
     56      for (int j = 0; j < N; j++)
     57        for (int k = 0; k < N; k++)
     58 	 c[i][j] = c[i][j] + a[i][k]*b[k][j];
     59 
     60    This pass implements loop interchange in the following steps:
     61 
     62      1) Find perfect loop nest for each innermost loop and compute data
     63 	dependence relations for it.  For above example, loop nest is
     64 	<loop_j, loop_k, loop_i>.
     65      2) From innermost to outermost loop, this pass tries to interchange
     66 	each loop pair.  For above case, it firstly tries to interchange
     67 	<loop_k, loop_i> and loop nest becomes <loop_j, loop_i, loop_k>.
     68 	Then it tries to interchange <loop_j, loop_i> and loop nest becomes
     69 	<loop_i, loop_j, loop_k>.  The overall effect is to move innermost
     70 	loop to the outermost position.  For loop pair <loop_i, loop_j>
     71 	to be interchanged, we:
     72      3) Check if data dependence relations are valid for loop interchange.
     73      4) Check if both loops can be interchanged in terms of transformation.
     74      5) Check if interchanging the two loops is profitable.
     75      6) Interchange the two loops by mapping induction variables.
     76 
     77    This pass also handles reductions in loop nest.  So far we only support
     78    simple reduction of inner loop and double reduction of the loop nest.  */
     79 
     80 /* Maximum number of stmts in each loop that should be interchanged.  */
     81 #define MAX_NUM_STMT    (PARAM_VALUE (PARAM_LOOP_INTERCHANGE_MAX_NUM_STMTS))
     82 /* Maximum number of data references in loop nest.  */
     83 #define MAX_DATAREFS    (PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
     84 
     85 /* Comparison ratio of access stride between inner/outer loops to be
     86    interchanged.  This is the minimum stride ratio for loop interchange
     87    to be profitable.  */
     88 #define OUTER_STRIDE_RATIO  (PARAM_VALUE (PARAM_LOOP_INTERCHANGE_STRIDE_RATIO))
     89 /* The same as above, but we require higher ratio for interchanging the
     90    innermost two loops.  */
     91 #define INNER_STRIDE_RATIO  ((OUTER_STRIDE_RATIO) + 1)
     92 
     93 /* Comparison ratio of stmt cost between inner/outer loops.  Loops won't
     94    be interchanged if outer loop has too many stmts.  */
     95 #define STMT_COST_RATIO     (3)
     96 
     97 /* Vector of strides that DR accesses in each level loop of a loop nest.  */
     98 #define DR_ACCESS_STRIDE(dr) ((vec<tree> *) dr->aux)
     99 
    100 /* Structure recording loop induction variable.  */
    101 typedef struct induction
    102 {
    103   /* IV itself.  */
    104   tree var;
    105   /* IV's initializing value, which is the init arg of the IV PHI node.  */
    106   tree init_val;
    107   /* IV's initializing expr, which is (the expanded result of) init_val.  */
    108   tree init_expr;
    109   /* IV's step.  */
    110   tree step;
    111 } *induction_p;
    112 
    113 /* Enum type for loop reduction variable.  */
    114 enum reduction_type
    115 {
    116   UNKNOWN_RTYPE = 0,
    117   SIMPLE_RTYPE,
    118   DOUBLE_RTYPE
    119 };
    120 
    121 /* Structure recording loop reduction variable.  */
    122 typedef struct reduction
    123 {
    124   /* Reduction itself.  */
    125   tree var;
    126   /* PHI node defining reduction variable.  */
    127   gphi *phi;
    128   /* Init and next variables of the reduction.  */
    129   tree init;
    130   tree next;
    131   /* Lcssa PHI node if reduction is used outside of its definition loop.  */
    132   gphi *lcssa_phi;
    133   /* Stmts defining init and next.  */
    134   gimple *producer;
    135   gimple *consumer;
    136   /* If init is loaded from memory, this is the loading memory reference.  */
    137   tree init_ref;
    138   /* If reduction is finally stored to memory, this is the stored memory
    139      reference.  */
    140   tree fini_ref;
    141   enum reduction_type type;
    142 } *reduction_p;
    143 
    144 
    145 /* Dump reduction RE.  */
    146 
    147 static void
    148 dump_reduction (reduction_p re)
    149 {
    150   if (re->type == SIMPLE_RTYPE)
    151     fprintf (dump_file, "  Simple reduction:  ");
    152   else if (re->type == DOUBLE_RTYPE)
    153     fprintf (dump_file, "  Double reduction:  ");
    154   else
    155     fprintf (dump_file, "  Unknown reduction:  ");
    156 
    157   print_gimple_stmt (dump_file, re->phi, 0);
    158 }
    159 
    160 /* Dump LOOP's induction IV.  */
    161 static void
    162 dump_induction (struct loop *loop, induction_p iv)
    163 {
    164   fprintf (dump_file, "  Induction:  ");
    165   print_generic_expr (dump_file, iv->var, TDF_SLIM);
    166   fprintf (dump_file, " = {");
    167   print_generic_expr (dump_file, iv->init_expr, TDF_SLIM);
    168   fprintf (dump_file, ", ");
    169   print_generic_expr (dump_file, iv->step, TDF_SLIM);
    170   fprintf (dump_file, "}_%d\n", loop->num);
    171 }
    172 
    173 /* Loop candidate for interchange.  */
    174 
    175 struct loop_cand
    176 {
    177   loop_cand (struct loop *, struct loop *);
    178   ~loop_cand ();
    179 
    180   reduction_p find_reduction_by_stmt (gimple *);
    181   void classify_simple_reduction (reduction_p);
    182   bool analyze_iloop_reduction_var (tree);
    183   bool analyze_oloop_reduction_var (loop_cand *, tree);
    184   bool analyze_induction_var (tree, tree);
    185   bool analyze_carried_vars (loop_cand *);
    186   bool analyze_lcssa_phis (void);
    187   bool can_interchange_p (loop_cand *);
    188   void undo_simple_reduction (reduction_p, bitmap);
    189 
    190   /* The loop itself.  */
    191   struct loop *m_loop;
    192   /* The outer loop for interchange.  It equals to loop if this loop cand
    193      itself represents the outer loop.  */
    194   struct loop *m_outer;
    195   /* Vector of induction variables in loop.  */
    196   vec<induction_p> m_inductions;
    197   /* Vector of reduction variables in loop.  */
    198   vec<reduction_p> m_reductions;
    199   /* Lcssa PHI nodes of this loop.  */
    200   vec<gphi *> m_lcssa_nodes;
    201   /* Single exit edge of this loop.  */
    202   edge m_exit;
    203   /* Basic blocks of this loop.  */
    204   basic_block *m_bbs;
    205   /* Number of stmts of this loop.  Inner loops' stmts are not included.  */
    206   int m_num_stmts;
    207   /* Number of constant initialized simple reduction.  */
    208   int m_const_init_reduc;
    209 };
    210 
    211 /* Constructor.  */
    212 
    213 loop_cand::loop_cand (struct loop *loop, struct loop *outer)
    214   : m_loop (loop), m_outer (outer), m_exit (single_exit (loop)),
    215     m_bbs (get_loop_body (loop)), m_num_stmts (0), m_const_init_reduc (0)
    216 {
    217     m_inductions.create (3);
    218     m_reductions.create (3);
    219     m_lcssa_nodes.create (3);
    220 }
    221 
    222 /* Destructor.  */
    223 
    224 loop_cand::~loop_cand ()
    225 {
    226   induction_p iv;
    227   for (unsigned i = 0; m_inductions.iterate (i, &iv); ++i)
    228     free (iv);
    229 
    230   reduction_p re;
    231   for (unsigned i = 0; m_reductions.iterate (i, &re); ++i)
    232     free (re);
    233 
    234   m_inductions.release ();
    235   m_reductions.release ();
    236   m_lcssa_nodes.release ();
    237   free (m_bbs);
    238 }
    239 
    240 /* Return single use stmt of VAR in LOOP, otherwise return NULL.  */
    241 
    242 static gimple *
    243 single_use_in_loop (tree var, struct loop *loop)
    244 {
    245   gimple *stmt, *res = NULL;
    246   use_operand_p use_p;
    247   imm_use_iterator iterator;
    248 
    249   FOR_EACH_IMM_USE_FAST (use_p, iterator, var)
    250     {
    251       stmt = USE_STMT (use_p);
    252       if (is_gimple_debug (stmt))
    253 	continue;
    254 
    255       if (!flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
    256 	continue;
    257 
    258       if (res)
    259 	return NULL;
    260 
    261       res = stmt;
    262     }
    263   return res;
    264 }
    265 
    266 /* Return true if E is unsupported in loop interchange, i.e, E is a complex
    267    edge or part of irreducible loop.  */
    268 
    269 static inline bool
    270 unsupported_edge (edge e)
    271 {
    272   return (e->flags & (EDGE_COMPLEX | EDGE_IRREDUCIBLE_LOOP));
    273 }
    274 
    275 /* Return the reduction if STMT is one of its lcssa PHI, producer or consumer
    276    stmt.  */
    277 
    278 reduction_p
    279 loop_cand::find_reduction_by_stmt (gimple *stmt)
    280 {
    281   gphi *phi = dyn_cast <gphi *> (stmt);
    282   reduction_p re;
    283 
    284   for (unsigned i = 0; m_reductions.iterate (i, &re); ++i)
    285     if ((phi != NULL && phi == re->lcssa_phi)
    286 	|| (stmt == re->producer || stmt == re->consumer))
    287       return re;
    288 
    289   return NULL;
    290 }
    291 
    292 /* Return true if current loop_cand be interchanged.  ILOOP is not NULL if
    293    current loop_cand is outer loop in loop nest.  */
    294 
    295 bool
    296 loop_cand::can_interchange_p (loop_cand *iloop)
    297 {
    298   /* For now we only support at most one reduction.  */
    299   unsigned allowed_reduction_num = 1;
    300 
    301   /* Only support reduction if the loop nest to be interchanged is the
    302      innermostin two loops.  */
    303   if ((iloop == NULL && m_loop->inner != NULL)
    304        || (iloop != NULL && iloop->m_loop->inner != NULL))
    305     allowed_reduction_num = 0;
    306 
    307   if (m_reductions.length () > allowed_reduction_num
    308       || (m_reductions.length () == 1
    309 	  && m_reductions[0]->type == UNKNOWN_RTYPE))
    310     return false;
    311 
    312   /* Only support lcssa PHI node which is for reduction.  */
    313   if (m_lcssa_nodes.length () > allowed_reduction_num)
    314     return false;
    315 
    316   /* Check if basic block has any unsupported operation.  Note basic blocks
    317      of inner loops are not checked here.  */
    318   for (unsigned i = 0; i < m_loop->num_nodes; i++)
    319     {
    320       basic_block bb = m_bbs[i];
    321       gphi_iterator psi;
    322       gimple_stmt_iterator gsi;
    323 
    324       /* Skip basic blocks of inner loops.  */
    325       if (bb->loop_father != m_loop)
    326        continue;
    327 
    328       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    329 	{
    330 	  gimple *stmt = gsi_stmt (gsi);
    331 	  if (is_gimple_debug (stmt))
    332 	    continue;
    333 
    334 	  if (gimple_has_side_effects (stmt))
    335 	    return false;
    336 
    337 	  m_num_stmts++;
    338 	  if (gcall *call = dyn_cast <gcall *> (stmt))
    339 	    {
    340 	      /* In basic block of outer loop, the call should be cheap since
    341 		 it will be moved to inner loop.  */
    342 	      if (iloop != NULL
    343 		  && !gimple_inexpensive_call_p (call))
    344 		return false;
    345 	      continue;
    346 	    }
    347 
    348 	  if (!iloop || !gimple_vuse (stmt))
    349 	    continue;
    350 
    351 	  /* Support stmt accessing memory in outer loop only if it is for
    352 	     inner loop's reduction.  */
    353 	  if (iloop->find_reduction_by_stmt (stmt))
    354 	    continue;
    355 
    356 	  tree lhs;
    357 	  /* Support loop invariant memory reference if it's only used once by
    358 	     inner loop.  */
    359 	  /* ???  How's this checking for invariantness?  */
    360 	  if (gimple_assign_single_p (stmt)
    361 	      && (lhs = gimple_assign_lhs (stmt)) != NULL_TREE
    362 	      && TREE_CODE (lhs) == SSA_NAME
    363 	      && single_use_in_loop (lhs, iloop->m_loop))
    364 	    continue;
    365 
    366 	  return false;
    367 	}
    368       /* Check if loop has too many stmts.  */
    369       if (m_num_stmts > MAX_NUM_STMT)
    370 	return false;
    371 
    372       /* Allow PHI nodes in any basic block of inner loop, PHI nodes in outer
    373 	 loop's header, or PHI nodes in dest bb of inner loop's exit edge.  */
    374       if (!iloop || bb == m_loop->header
    375 	  || bb == iloop->m_exit->dest)
    376 	continue;
    377 
    378       /* Don't allow any other PHI nodes.  */
    379       for (psi = gsi_start_phis (bb); !gsi_end_p (psi); gsi_next (&psi))
    380 	if (!virtual_operand_p (PHI_RESULT (psi.phi ())))
    381 	  return false;
    382     }
    383 
    384   return true;
    385 }
    386 
    387 /* Programmers and optimizers (like loop store motion) may optimize code:
    388 
    389      for (int i = 0; i < N; i++)
    390        for (int j = 0; j < N; j++)
    391 	 a[i] += b[j][i] * c[j][i];
    392 
    393    into reduction:
    394 
    395      for (int i = 0; i < N; i++)
    396        {
    397 	 // producer.  Note sum can be intitialized to a constant.
    398 	 int sum = a[i];
    399 	 for (int j = 0; j < N; j++)
    400 	   {
    401 	     sum += b[j][i] * c[j][i];
    402 	   }
    403 	 // consumer.
    404 	 a[i] = sum;
    405        }
    406 
    407    The result code can't be interchanged without undoing the optimization.
    408    This function classifies this kind reduction and records information so
    409    that we can undo the store motion during interchange.  */
    410 
    411 void
    412 loop_cand::classify_simple_reduction (reduction_p re)
    413 {
    414   gimple *producer, *consumer;
    415 
    416   /* Check init variable of reduction and how it is initialized.  */
    417   if (TREE_CODE (re->init) == SSA_NAME)
    418     {
    419       producer = SSA_NAME_DEF_STMT (re->init);
    420       re->producer = producer;
    421       basic_block bb = gimple_bb (producer);
    422       if (!bb || bb->loop_father != m_outer)
    423 	return;
    424 
    425       if (!gimple_assign_load_p (producer))
    426 	return;
    427 
    428       re->init_ref = gimple_assign_rhs1 (producer);
    429     }
    430   else if (CONSTANT_CLASS_P (re->init))
    431     m_const_init_reduc++;
    432   else
    433     return;
    434 
    435   /* Check how reduction variable is used.  */
    436   consumer = single_use_in_loop (PHI_RESULT (re->lcssa_phi), m_outer);
    437   if (!consumer
    438       || !gimple_store_p (consumer))
    439     return;
    440 
    441   re->fini_ref = gimple_get_lhs (consumer);
    442   re->consumer = consumer;
    443 
    444   /* Simple reduction with constant initializer.  */
    445   if (!re->init_ref)
    446     {
    447       gcc_assert (CONSTANT_CLASS_P (re->init));
    448       re->init_ref = unshare_expr (re->fini_ref);
    449     }
    450 
    451   /* Require memory references in producer and consumer are the same so
    452      that we can undo reduction during interchange.  */
    453   if (re->init_ref && !operand_equal_p (re->init_ref, re->fini_ref, 0))
    454     return;
    455 
    456   re->type = SIMPLE_RTYPE;
    457 }
    458 
    459 /* Analyze reduction variable VAR for inner loop of the loop nest to be
    460    interchanged.  Return true if analysis succeeds.  */
    461 
    462 bool
    463 loop_cand::analyze_iloop_reduction_var (tree var)
    464 {
    465   gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (var));
    466   gphi *lcssa_phi = NULL, *use_phi;
    467   tree init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (m_loop));
    468   tree next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (m_loop));
    469   reduction_p re;
    470   gimple *stmt, *next_def, *single_use = NULL;
    471   use_operand_p use_p;
    472   imm_use_iterator iterator;
    473 
    474   if (TREE_CODE (next) != SSA_NAME)
    475     return false;
    476 
    477   next_def = SSA_NAME_DEF_STMT (next);
    478   basic_block bb = gimple_bb (next_def);
    479   if (!bb || !flow_bb_inside_loop_p (m_loop, bb))
    480     return false;
    481 
    482   /* In restricted reduction, the var is (and must be) used in defining
    483      the updated var.  The process can be depicted as below:
    484 
    485 		var ;; = PHI<init, next>
    486 		 |
    487 		 |
    488 		 v
    489       +---------------------+
    490       | reduction operators | <-- other operands
    491       +---------------------+
    492 		 |
    493 		 |
    494 		 v
    495 		next
    496 
    497      In terms loop interchange, we don't change how NEXT is computed based
    498      on VAR and OTHER OPERANDS.  In case of double reduction in loop nest
    499      to be interchanged, we don't changed it at all.  In the case of simple
    500      reduction in inner loop, we only make change how VAR/NEXT is loaded or
    501      stored.  With these conditions, we can relax restrictions on reduction
    502      in a way that reduction operation is seen as black box.  In general,
    503      we can ignore reassociation of reduction operator; we can handle fake
    504      reductions in which VAR is not even used to compute NEXT.  */
    505   if (! single_imm_use (var, &use_p, &single_use)
    506       || ! flow_bb_inside_loop_p (m_loop, gimple_bb (single_use)))
    507     return false;
    508 
    509   /* Check the reduction operation.  We require a left-associative operation.
    510      For FP math we also need to be allowed to associate operations.  */
    511   if (gassign *ass = dyn_cast <gassign *> (single_use))
    512     {
    513       enum tree_code code = gimple_assign_rhs_code (ass);
    514       if (! (associative_tree_code (code)
    515 	     || (code == MINUS_EXPR
    516 		 && use_p->use == gimple_assign_rhs1_ptr (ass)))
    517 	  || (FLOAT_TYPE_P (TREE_TYPE (var))
    518 	      && ! flag_associative_math))
    519 	return false;
    520     }
    521   else
    522     return false;
    523 
    524   /* Handle and verify a series of stmts feeding the reduction op.  */
    525   if (single_use != next_def
    526       && !check_reduction_path (UNKNOWN_LOCATION, m_loop, phi, next,
    527 				gimple_assign_rhs_code (single_use)))
    528     return false;
    529 
    530   /* Only support cases in which INIT is used in inner loop.  */
    531   if (TREE_CODE (init) == SSA_NAME)
    532     FOR_EACH_IMM_USE_FAST (use_p, iterator, init)
    533       {
    534 	stmt = USE_STMT (use_p);
    535 	if (is_gimple_debug (stmt))
    536 	  continue;
    537 
    538 	if (!flow_bb_inside_loop_p (m_loop, gimple_bb (stmt)))
    539 	  return false;
    540       }
    541 
    542   FOR_EACH_IMM_USE_FAST (use_p, iterator, next)
    543     {
    544       stmt = USE_STMT (use_p);
    545       if (is_gimple_debug (stmt))
    546 	continue;
    547 
    548       /* Or else it's used in PHI itself.  */
    549       use_phi = dyn_cast <gphi *> (stmt);
    550       if (use_phi == phi)
    551 	continue;
    552 
    553       if (use_phi != NULL
    554 	  && lcssa_phi == NULL
    555 	  && gimple_bb (stmt) == m_exit->dest
    556 	  && PHI_ARG_DEF_FROM_EDGE (use_phi, m_exit) == next)
    557 	lcssa_phi = use_phi;
    558       else
    559 	return false;
    560     }
    561   if (!lcssa_phi)
    562     return false;
    563 
    564   re = XCNEW (struct reduction);
    565   re->var = var;
    566   re->init = init;
    567   re->next = next;
    568   re->phi = phi;
    569   re->lcssa_phi = lcssa_phi;
    570 
    571   classify_simple_reduction (re);
    572 
    573   if (dump_file && (dump_flags & TDF_DETAILS))
    574     dump_reduction (re);
    575 
    576   m_reductions.safe_push (re);
    577   return true;
    578 }
    579 
    580 /* Analyze reduction variable VAR for outer loop of the loop nest to be
    581    interchanged.  ILOOP is not NULL and points to inner loop.  For the
    582    moment, we only support double reduction for outer loop, like:
    583 
    584      for (int i = 0; i < n; i++)
    585        {
    586 	 int sum = 0;
    587 
    588 	 for (int j = 0; j < n; j++)    // outer loop
    589 	   for (int k = 0; k < n; k++)  // inner loop
    590 	     sum += a[i][k]*b[k][j];
    591 
    592 	 s[i] = sum;
    593        }
    594 
    595    Note the innermost two loops are the loop nest to be interchanged.
    596    Return true if analysis succeeds.  */
    597 
    598 bool
    599 loop_cand::analyze_oloop_reduction_var (loop_cand *iloop, tree var)
    600 {
    601   gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (var));
    602   gphi *lcssa_phi = NULL, *use_phi;
    603   tree init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (m_loop));
    604   tree next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (m_loop));
    605   reduction_p re;
    606   gimple *stmt, *next_def;
    607   use_operand_p use_p;
    608   imm_use_iterator iterator;
    609 
    610   if (TREE_CODE (next) != SSA_NAME)
    611     return false;
    612 
    613   next_def = SSA_NAME_DEF_STMT (next);
    614   basic_block bb = gimple_bb (next_def);
    615   if (!bb || !flow_bb_inside_loop_p (m_loop, bb))
    616     return false;
    617 
    618   /* Find inner loop's simple reduction that uses var as initializer.  */
    619   reduction_p inner_re = NULL;
    620   for (unsigned i = 0; iloop->m_reductions.iterate (i, &inner_re); ++i)
    621     if (inner_re->init == var || operand_equal_p (inner_re->init, var, 0))
    622       break;
    623 
    624   if (inner_re == NULL
    625       || inner_re->type != UNKNOWN_RTYPE
    626       || inner_re->producer != phi)
    627     return false;
    628 
    629   /* In case of double reduction, outer loop's reduction should be updated
    630      by inner loop's simple reduction.  */
    631   if (next_def != inner_re->lcssa_phi)
    632     return false;
    633 
    634   /* Outer loop's reduction should only be used to initialize inner loop's
    635      simple reduction.  */
    636   if (! single_imm_use (var, &use_p, &stmt)
    637       || stmt != inner_re->phi)
    638     return false;
    639 
    640   /* Check this reduction is correctly used outside of loop via lcssa phi.  */
    641   FOR_EACH_IMM_USE_FAST (use_p, iterator, next)
    642     {
    643       stmt = USE_STMT (use_p);
    644       if (is_gimple_debug (stmt))
    645 	continue;
    646 
    647       /* Or else it's used in PHI itself.  */
    648       use_phi = dyn_cast <gphi *> (stmt);
    649       if (use_phi == phi)
    650 	continue;
    651 
    652       if (lcssa_phi == NULL
    653 	  && use_phi != NULL
    654 	  && gimple_bb (stmt) == m_exit->dest
    655 	  && PHI_ARG_DEF_FROM_EDGE (use_phi, m_exit) == next)
    656 	lcssa_phi = use_phi;
    657       else
    658 	return false;
    659     }
    660   if (!lcssa_phi)
    661     return false;
    662 
    663   re = XCNEW (struct reduction);
    664   re->var = var;
    665   re->init = init;
    666   re->next = next;
    667   re->phi = phi;
    668   re->lcssa_phi = lcssa_phi;
    669   re->type = DOUBLE_RTYPE;
    670   inner_re->type = DOUBLE_RTYPE;
    671 
    672   if (dump_file && (dump_flags & TDF_DETAILS))
    673     dump_reduction (re);
    674 
    675   m_reductions.safe_push (re);
    676   return true;
    677 }
    678 
    679 /* Return true if VAR is induction variable of current loop whose scev is
    680    specified by CHREC.  */
    681 
    682 bool
    683 loop_cand::analyze_induction_var (tree var, tree chrec)
    684 {
    685   gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (var));
    686   tree init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (m_loop));
    687 
    688   /* Var is loop invariant, though it's unlikely to happen.  */
    689   if (tree_does_not_contain_chrecs (chrec))
    690     {
    691       /* Punt on floating point invariants if honoring signed zeros,
    692 	 representing that as + 0.0 would change the result if init
    693 	 is -0.0.  Similarly for SNaNs it can raise exception.  */
    694       if (HONOR_SIGNED_ZEROS (chrec) || HONOR_SNANS (chrec))
    695 	return false;
    696       struct induction *iv = XCNEW (struct induction);
    697       iv->var = var;
    698       iv->init_val = init;
    699       iv->init_expr = chrec;
    700       iv->step = build_zero_cst (TREE_TYPE (chrec));
    701       m_inductions.safe_push (iv);
    702       return true;
    703     }
    704 
    705   if (TREE_CODE (chrec) != POLYNOMIAL_CHREC
    706       || CHREC_VARIABLE (chrec) != (unsigned) m_loop->num
    707       || tree_contains_chrecs (CHREC_LEFT (chrec), NULL)
    708       || tree_contains_chrecs (CHREC_RIGHT (chrec), NULL))
    709     return false;
    710 
    711   struct induction *iv = XCNEW (struct induction);
    712   iv->var = var;
    713   iv->init_val = init;
    714   iv->init_expr = CHREC_LEFT (chrec);
    715   iv->step = CHREC_RIGHT (chrec);
    716 
    717   if (dump_file && (dump_flags & TDF_DETAILS))
    718     dump_induction (m_loop, iv);
    719 
    720   m_inductions.safe_push (iv);
    721   return true;
    722 }
    723 
    724 /* Return true if all loop carried variables defined in loop header can
    725    be successfully analyzed.  */
    726 
    727 bool
    728 loop_cand::analyze_carried_vars (loop_cand *iloop)
    729 {
    730   edge e = loop_preheader_edge (m_outer);
    731   gphi_iterator gsi;
    732 
    733   if (dump_file && (dump_flags & TDF_DETAILS))
    734     fprintf (dump_file, "\nLoop(%d) carried vars:\n", m_loop->num);
    735 
    736   for (gsi = gsi_start_phis (m_loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
    737     {
    738       gphi *phi = gsi.phi ();
    739 
    740       tree var = PHI_RESULT (phi);
    741       if (virtual_operand_p (var))
    742 	continue;
    743 
    744       tree chrec = analyze_scalar_evolution (m_loop, var);
    745       chrec = instantiate_scev (e, m_loop, chrec);
    746 
    747       /* Analyze var as reduction variable.  */
    748       if (chrec_contains_undetermined (chrec)
    749 	  || chrec_contains_symbols_defined_in_loop (chrec, m_outer->num))
    750 	{
    751 	  if (iloop && !analyze_oloop_reduction_var (iloop, var))
    752 	    return false;
    753 	  if (!iloop && !analyze_iloop_reduction_var (var))
    754 	    return false;
    755 	}
    756       /* Analyze var as induction variable.  */
    757       else if (!analyze_induction_var (var, chrec))
    758 	return false;
    759     }
    760 
    761   return true;
    762 }
    763 
    764 /* Return TRUE if loop closed PHI nodes can be analyzed successfully.  */
    765 
    766 bool
    767 loop_cand::analyze_lcssa_phis (void)
    768 {
    769   gphi_iterator gsi;
    770   for (gsi = gsi_start_phis (m_exit->dest); !gsi_end_p (gsi); gsi_next (&gsi))
    771     {
    772       gphi *phi = gsi.phi ();
    773 
    774       if (virtual_operand_p (PHI_RESULT (phi)))
    775 	continue;
    776 
    777       /* TODO: We only support lcssa phi for reduction for now.  */
    778       if (!find_reduction_by_stmt (phi))
    779 	return false;
    780     }
    781 
    782   return true;
    783 }
    784 
    785 /* CONSUMER is a stmt in BB storing reduction result into memory object.
    786    When the reduction is intialized from constant value, we need to add
    787    a stmt loading from the memory object to target basic block in inner
    788    loop during undoing the reduction.  Problem is that memory reference
    789    may use ssa variables not dominating the target basic block.  This
    790    function finds all stmts on which CONSUMER depends in basic block BB,
    791    records and returns them via STMTS.  */
    792 
    793 static void
    794 find_deps_in_bb_for_stmt (gimple_seq *stmts, basic_block bb, gimple *consumer)
    795 {
    796   auto_vec<gimple *, 4> worklist;
    797   use_operand_p use_p;
    798   ssa_op_iter iter;
    799   gimple *stmt, *def_stmt;
    800   gimple_stmt_iterator gsi;
    801 
    802   /* First clear flag for stmts in bb.  */
    803   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    804     gimple_set_plf (gsi_stmt (gsi), GF_PLF_1, false);
    805 
    806   /* DFS search all depended stmts in bb and mark flag for these stmts.  */
    807   worklist.safe_push (consumer);
    808   while (!worklist.is_empty ())
    809     {
    810       stmt = worklist.pop ();
    811       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
    812 	{
    813 	  def_stmt = SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p));
    814 
    815 	  if (is_a <gphi *> (def_stmt)
    816 	      || gimple_bb (def_stmt) != bb
    817 	      || gimple_plf (def_stmt, GF_PLF_1))
    818 	    continue;
    819 
    820 	  worklist.safe_push (def_stmt);
    821 	}
    822       gimple_set_plf (stmt, GF_PLF_1, true);
    823     }
    824   for (gsi = gsi_start_nondebug_bb (bb);
    825        !gsi_end_p (gsi) && (stmt = gsi_stmt (gsi)) != consumer;)
    826     {
    827       /* Move dep stmts to sequence STMTS.  */
    828       if (gimple_plf (stmt, GF_PLF_1))
    829 	{
    830 	  gsi_remove (&gsi, false);
    831 	  gimple_seq_add_stmt_without_update (stmts, stmt);
    832 	}
    833       else
    834 	gsi_next_nondebug (&gsi);
    835     }
    836 }
    837 
    838 /* User can write, optimizers can generate simple reduction RE for inner
    839    loop.  In order to make interchange valid, we have to undo reduction by
    840    moving producer and consumer stmts into the inner loop.  For example,
    841    below code:
    842 
    843      init = MEM_REF[idx];		//producer
    844      loop:
    845        var = phi<init, next>
    846        next = var op ...
    847      reduc_sum = phi<next>
    848      MEM_REF[idx] = reduc_sum		//consumer
    849 
    850    is transformed into:
    851 
    852      loop:
    853        new_var = MEM_REF[idx];		//producer after moving
    854        next = new_var op ...
    855        MEM_REF[idx] = next;		//consumer after moving
    856 
    857    Note if the reduction variable is initialized to constant, like:
    858 
    859      var = phi<0.0, next>
    860 
    861    we compute new_var as below:
    862 
    863      loop:
    864        tmp = MEM_REF[idx];
    865        new_var = !first_iteration ? tmp : 0.0;
    866 
    867    so that the initial const is used in the first iteration of loop.  Also
    868    record ssa variables for dead code elimination in DCE_SEEDS.  */
    869 
    870 void
    871 loop_cand::undo_simple_reduction (reduction_p re, bitmap dce_seeds)
    872 {
    873   gimple *stmt;
    874   gimple_stmt_iterator from, to = gsi_after_labels (m_loop->header);
    875   gimple_seq stmts = NULL;
    876   tree new_var;
    877 
    878   /* Prepare the initialization stmts and insert it to inner loop.  */
    879   if (re->producer != NULL)
    880     {
    881       gimple_set_vuse (re->producer, NULL_TREE);
    882       from = gsi_for_stmt (re->producer);
    883       gsi_remove (&from, false);
    884       gimple_seq_add_stmt_without_update (&stmts, re->producer);
    885       new_var = re->init;
    886     }
    887   else
    888     {
    889       /* Find all stmts on which expression "MEM_REF[idx]" depends.  */
    890       find_deps_in_bb_for_stmt (&stmts, gimple_bb (re->consumer), re->consumer);
    891       /* Because we generate new stmt loading from the MEM_REF to TMP.  */
    892       tree cond, tmp = copy_ssa_name (re->var);
    893       stmt = gimple_build_assign (tmp, re->init_ref);
    894       gimple_seq_add_stmt_without_update (&stmts, stmt);
    895 
    896       /* Init new_var to MEM_REF or CONST depending on if it is the first
    897 	 iteration.  */
    898       induction_p iv = m_inductions[0];
    899       cond = fold_build2 (NE_EXPR, boolean_type_node, iv->var, iv->init_val);
    900       new_var = copy_ssa_name (re->var);
    901       stmt = gimple_build_assign (new_var, COND_EXPR, cond, tmp, re->init);
    902       gimple_seq_add_stmt_without_update (&stmts, stmt);
    903     }
    904   gsi_insert_seq_before (&to, stmts, GSI_SAME_STMT);
    905 
    906   /* Replace all uses of reduction var with new variable.  */
    907   use_operand_p use_p;
    908   imm_use_iterator iterator;
    909   FOR_EACH_IMM_USE_STMT (stmt, iterator, re->var)
    910     {
    911       FOR_EACH_IMM_USE_ON_STMT (use_p, iterator)
    912 	SET_USE (use_p, new_var);
    913 
    914       update_stmt (stmt);
    915     }
    916 
    917   /* Move consumer stmt into inner loop, just after reduction next's def.  */
    918   unlink_stmt_vdef (re->consumer);
    919   release_ssa_name (gimple_vdef (re->consumer));
    920   gimple_set_vdef (re->consumer, NULL_TREE);
    921   gimple_set_vuse (re->consumer, NULL_TREE);
    922   gimple_assign_set_rhs1 (re->consumer, re->next);
    923   from = gsi_for_stmt (re->consumer);
    924   to = gsi_for_stmt (SSA_NAME_DEF_STMT (re->next));
    925   gsi_move_after (&from, &to);
    926 
    927   /* Mark the reduction variables for DCE.  */
    928   bitmap_set_bit (dce_seeds, SSA_NAME_VERSION (re->var));
    929   bitmap_set_bit (dce_seeds, SSA_NAME_VERSION (PHI_RESULT (re->lcssa_phi)));
    930 }
    931 
    932 /* Free DATAREFS and its auxiliary memory.  */
    933 
    934 static void
    935 free_data_refs_with_aux (vec<data_reference_p> datarefs)
    936 {
    937   data_reference_p dr;
    938   for (unsigned i = 0; datarefs.iterate (i, &dr); ++i)
    939     if (dr->aux != NULL)
    940       {
    941 	DR_ACCESS_STRIDE (dr)->release ();
    942 	delete (vec<tree> *) dr->aux;
    943       }
    944 
    945   free_data_refs (datarefs);
    946 }
    947 
    948 /* Class for loop interchange transformation.  */
    949 
    950 class tree_loop_interchange
    951 {
    952 public:
    953   tree_loop_interchange (vec<struct loop *> loop_nest)
    954     : m_loop_nest (loop_nest), m_niters_iv_var (NULL_TREE),
    955       m_dce_seeds (BITMAP_ALLOC (NULL)) { }
    956   ~tree_loop_interchange () { BITMAP_FREE (m_dce_seeds); }
    957   bool interchange (vec<data_reference_p>, vec<ddr_p>);
    958 
    959 private:
    960   void update_data_info (unsigned, unsigned, vec<data_reference_p>, vec<ddr_p>);
    961   bool valid_data_dependences (unsigned, unsigned, vec<ddr_p>);
    962   void interchange_loops (loop_cand &, loop_cand &);
    963   void map_inductions_to_loop (loop_cand &, loop_cand &);
    964   void move_code_to_inner_loop (struct loop *, struct loop *, basic_block *);
    965 
    966   /* The whole loop nest in which interchange is ongoing.  */
    967   vec<struct loop *> m_loop_nest;
    968   /* We create new IV which is only used in loop's exit condition check.
    969      In case of 3-level loop nest interchange, when we interchange the
    970      innermost two loops, new IV created in the middle level loop does
    971      not need to be preserved in interchanging the outermost two loops
    972      later.  We record the IV so that it can be skipped.  */
    973   tree m_niters_iv_var;
    974   /* Bitmap of seed variables for dead code elimination after interchange.  */
    975   bitmap m_dce_seeds;
    976 };
    977 
    978 /* Update data refs' access stride and dependence information after loop
    979    interchange.  I_IDX/O_IDX gives indices of interchanged loops in loop
    980    nest.  DATAREFS are data references.  DDRS are data dependences.  */
    981 
    982 void
    983 tree_loop_interchange::update_data_info (unsigned i_idx, unsigned o_idx,
    984 					 vec<data_reference_p> datarefs,
    985 					 vec<ddr_p> ddrs)
    986 {
    987   struct data_reference *dr;
    988   struct data_dependence_relation *ddr;
    989 
    990   /* Update strides of data references.  */
    991   for (unsigned i = 0; datarefs.iterate (i, &dr); ++i)
    992     {
    993       vec<tree> *stride = DR_ACCESS_STRIDE (dr);
    994       gcc_assert (stride->length () > i_idx);
    995       std::swap ((*stride)[i_idx], (*stride)[o_idx]);
    996     }
    997   /* Update data dependences.  */
    998   for (unsigned i = 0; ddrs.iterate (i, &ddr); ++i)
    999     if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
   1000       {
   1001         for (unsigned j = 0; j < DDR_NUM_DIST_VECTS (ddr); ++j)
   1002 	  {
   1003 	    lambda_vector dist_vect = DDR_DIST_VECT (ddr, j);
   1004 	    std::swap (dist_vect[i_idx], dist_vect[o_idx]);
   1005 	  }
   1006       }
   1007 }
   1008 
   1009 /* Check data dependence relations, return TRUE if it's valid to interchange
   1010    two loops specified by I_IDX/O_IDX.  Theoretically, interchanging the two
   1011    loops is valid only if dist vector, after interchanging, doesn't have '>'
   1012    as the leftmost non-'=' direction.  Practically, this function have been
   1013    conservative here by not checking some valid cases.  */
   1014 
   1015 bool
   1016 tree_loop_interchange::valid_data_dependences (unsigned i_idx, unsigned o_idx,
   1017 					       vec<ddr_p> ddrs)
   1018 {
   1019   struct data_dependence_relation *ddr;
   1020 
   1021   for (unsigned i = 0; ddrs.iterate (i, &ddr); ++i)
   1022     {
   1023       /* Skip no-dependence case.  */
   1024       if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
   1025 	continue;
   1026 
   1027       for (unsigned j = 0; j < DDR_NUM_DIST_VECTS (ddr); ++j)
   1028 	{
   1029 	  lambda_vector dist_vect = DDR_DIST_VECT (ddr, j);
   1030 	  unsigned level = dependence_level (dist_vect, m_loop_nest.length ());
   1031 
   1032 	  /* If there is no carried dependence.  */
   1033 	  if (level == 0)
   1034 	    continue;
   1035 
   1036 	  level --;
   1037 
   1038 	  /* If dependence is not carried by any loop in between the two
   1039 	     loops [oloop, iloop] to interchange.  */
   1040 	  if (level < o_idx || level > i_idx)
   1041 	    continue;
   1042 
   1043 	  /* Be conservative, skip case if either direction at i_idx/o_idx
   1044 	     levels is not '=' or '<'.  */
   1045 	  if (dist_vect[i_idx] < 0 || dist_vect[o_idx] < 0)
   1046 	    return false;
   1047 	}
   1048     }
   1049 
   1050   return true;
   1051 }
   1052 
   1053 /* Interchange two loops specified by ILOOP and OLOOP.  */
   1054 
   1055 void
   1056 tree_loop_interchange::interchange_loops (loop_cand &iloop, loop_cand &oloop)
   1057 {
   1058   reduction_p re;
   1059   gimple_stmt_iterator gsi;
   1060   tree i_niters, o_niters, var_after;
   1061 
   1062   /* Undo inner loop's simple reduction.  */
   1063   for (unsigned i = 0; iloop.m_reductions.iterate (i, &re); ++i)
   1064     if (re->type != DOUBLE_RTYPE)
   1065       {
   1066 	if (re->producer)
   1067 	  reset_debug_uses (re->producer);
   1068 
   1069 	iloop.undo_simple_reduction (re, m_dce_seeds);
   1070       }
   1071 
   1072   /* Only need to reset debug uses for double reduction.  */
   1073   for (unsigned i = 0; oloop.m_reductions.iterate (i, &re); ++i)
   1074     {
   1075       gcc_assert (re->type == DOUBLE_RTYPE);
   1076       reset_debug_uses (SSA_NAME_DEF_STMT (re->var));
   1077       reset_debug_uses (SSA_NAME_DEF_STMT (re->next));
   1078     }
   1079 
   1080   /* Prepare niters for both loops.  */
   1081   struct loop *loop_nest = m_loop_nest[0];
   1082   edge instantiate_below = loop_preheader_edge (loop_nest);
   1083   gsi = gsi_last_bb (loop_preheader_edge (loop_nest)->src);
   1084   i_niters = number_of_latch_executions (iloop.m_loop);
   1085   i_niters = analyze_scalar_evolution (loop_outer (iloop.m_loop), i_niters);
   1086   i_niters = instantiate_scev (instantiate_below, loop_outer (iloop.m_loop),
   1087 			       i_niters);
   1088   i_niters = force_gimple_operand_gsi (&gsi, unshare_expr (i_niters), true,
   1089 				       NULL_TREE, false, GSI_CONTINUE_LINKING);
   1090   o_niters = number_of_latch_executions (oloop.m_loop);
   1091   if (oloop.m_loop != loop_nest)
   1092     {
   1093       o_niters = analyze_scalar_evolution (loop_outer (oloop.m_loop), o_niters);
   1094       o_niters = instantiate_scev (instantiate_below, loop_outer (oloop.m_loop),
   1095 				   o_niters);
   1096     }
   1097   o_niters = force_gimple_operand_gsi (&gsi, unshare_expr (o_niters), true,
   1098 				       NULL_TREE, false, GSI_CONTINUE_LINKING);
   1099 
   1100   /* Move src's code to tgt loop.  This is necessary when src is the outer
   1101      loop and tgt is the inner loop.  */
   1102   move_code_to_inner_loop (oloop.m_loop, iloop.m_loop, oloop.m_bbs);
   1103 
   1104   /* Map outer loop's IV to inner loop, and vice versa.  */
   1105   map_inductions_to_loop (oloop, iloop);
   1106   map_inductions_to_loop (iloop, oloop);
   1107 
   1108   /* Create canonical IV for both loops.  Note canonical IV for outer/inner
   1109      loop is actually from inner/outer loop.  Also we record the new IV
   1110      created for the outer loop so that it can be skipped in later loop
   1111      interchange.  */
   1112   create_canonical_iv (oloop.m_loop, oloop.m_exit,
   1113 		       i_niters, &m_niters_iv_var, &var_after);
   1114   bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_after));
   1115   create_canonical_iv (iloop.m_loop, iloop.m_exit,
   1116 		       o_niters, NULL, &var_after);
   1117   bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_after));
   1118 
   1119   /* Scrap niters estimation of interchanged loops.  */
   1120   iloop.m_loop->any_upper_bound = false;
   1121   iloop.m_loop->any_likely_upper_bound = false;
   1122   free_numbers_of_iterations_estimates (iloop.m_loop);
   1123   oloop.m_loop->any_upper_bound = false;
   1124   oloop.m_loop->any_likely_upper_bound = false;
   1125   free_numbers_of_iterations_estimates (oloop.m_loop);
   1126 
   1127   /* Clear all cached scev information.  This is expensive but shouldn't be
   1128      a problem given we interchange in very limited times.  */
   1129   scev_reset_htab ();
   1130 
   1131   /* ???  The association between the loop data structure and the
   1132      CFG changed, so what was loop N at the source level is now
   1133      loop M.  We should think of retaining the association or breaking
   1134      it fully by creating a new loop instead of re-using the "wrong" one.  */
   1135 }
   1136 
   1137 /* Map induction variables of SRC loop to TGT loop.  The function firstly
   1138    creates the same IV of SRC loop in TGT loop, then deletes the original
   1139    IV and re-initialize it using the newly created IV.  For example, loop
   1140    nest:
   1141 
   1142      for (i = 0; i < N; i++)
   1143        for (j = 0; j < M; j++)
   1144 	 {
   1145 	   //use of i;
   1146 	   //use of j;
   1147 	 }
   1148 
   1149    will be transformed into:
   1150 
   1151      for (jj = 0; jj < M; jj++)
   1152        for (ii = 0; ii < N; ii++)
   1153 	 {
   1154 	   //use of ii;
   1155 	   //use of jj;
   1156 	 }
   1157 
   1158    after loop interchange.  */
   1159 
   1160 void
   1161 tree_loop_interchange::map_inductions_to_loop (loop_cand &src, loop_cand &tgt)
   1162 {
   1163   induction_p iv;
   1164   edge e = tgt.m_exit;
   1165   gimple_stmt_iterator incr_pos = gsi_last_bb (e->src), gsi;
   1166 
   1167   /* Map source loop's IV to target loop.  */
   1168   for (unsigned i = 0; src.m_inductions.iterate (i, &iv); ++i)
   1169     {
   1170       gimple *use_stmt, *stmt = SSA_NAME_DEF_STMT (iv->var);
   1171       gcc_assert (is_a <gphi *> (stmt));
   1172 
   1173       use_operand_p use_p;
   1174       /* Only map original IV to target loop.  */
   1175       if (m_niters_iv_var != iv->var)
   1176 	{
   1177 	  /* Map the IV by creating the same one in target loop.  */
   1178 	  tree var_before, var_after;
   1179 	  tree base = unshare_expr (iv->init_expr);
   1180 	  tree step = unshare_expr (iv->step);
   1181 	  create_iv (base, step, SSA_NAME_VAR (iv->var),
   1182 		     tgt.m_loop, &incr_pos, false, &var_before, &var_after);
   1183 	  bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_before));
   1184 	  bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_after));
   1185 
   1186 	  /* Replace uses of the original IV var with newly created IV var.  */
   1187 	  imm_use_iterator imm_iter;
   1188 	  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, iv->var)
   1189 	    {
   1190 	      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
   1191 		SET_USE (use_p, var_before);
   1192 
   1193 	      update_stmt (use_stmt);
   1194 	    }
   1195 	}
   1196 
   1197       /* Mark all uses for DCE.  */
   1198       ssa_op_iter op_iter;
   1199       FOR_EACH_PHI_OR_STMT_USE (use_p, stmt, op_iter, SSA_OP_USE)
   1200 	{
   1201 	  tree use = USE_FROM_PTR (use_p);
   1202 	  if (TREE_CODE (use) == SSA_NAME
   1203 	      && ! SSA_NAME_IS_DEFAULT_DEF (use))
   1204 	    bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (use));
   1205 	}
   1206 
   1207       /* Delete definition of the original IV in the source loop.  */
   1208       gsi = gsi_for_stmt (stmt);
   1209       remove_phi_node (&gsi, true);
   1210     }
   1211 }
   1212 
   1213 /* Move stmts of outer loop to inner loop.  */
   1214 
   1215 void
   1216 tree_loop_interchange::move_code_to_inner_loop (struct loop *outer,
   1217 						struct loop *inner,
   1218 						basic_block *outer_bbs)
   1219 {
   1220   basic_block oloop_exit_bb = single_exit (outer)->src;
   1221   gimple_stmt_iterator gsi, to;
   1222 
   1223   for (unsigned i = 0; i < outer->num_nodes; i++)
   1224     {
   1225       basic_block bb = outer_bbs[i];
   1226 
   1227       /* Skip basic blocks of inner loop.  */
   1228       if (flow_bb_inside_loop_p (inner, bb))
   1229 	continue;
   1230 
   1231       /* Move code from header/latch to header/latch.  */
   1232       if (bb == outer->header)
   1233 	to = gsi_after_labels (inner->header);
   1234       else if (bb == outer->latch)
   1235 	to = gsi_after_labels (inner->latch);
   1236       else
   1237 	/* Otherwise, simply move to exit->src.  */
   1238 	to = gsi_last_bb (single_exit (inner)->src);
   1239 
   1240       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
   1241 	{
   1242 	  gimple *stmt = gsi_stmt (gsi);
   1243 
   1244 	  if (oloop_exit_bb == bb
   1245 	      && stmt == gsi_stmt (gsi_last_bb (oloop_exit_bb)))
   1246 	    {
   1247 	      gsi_next (&gsi);
   1248 	      continue;
   1249 	    }
   1250 
   1251 	  if (gimple_vuse (stmt))
   1252 	    gimple_set_vuse (stmt, NULL_TREE);
   1253 	  if (gimple_vdef (stmt))
   1254 	    {
   1255 	      unlink_stmt_vdef (stmt);
   1256 	      release_ssa_name (gimple_vdef (stmt));
   1257 	      gimple_set_vdef (stmt, NULL_TREE);
   1258 	    }
   1259 
   1260 	  reset_debug_uses (stmt);
   1261 	  gsi_move_before (&gsi, &to);
   1262 	}
   1263     }
   1264 }
   1265 
   1266 /* Given data reference DR in LOOP_NEST, the function computes DR's access
   1267    stride at each level of loop from innermost LOOP to outer.  On success,
   1268    it saves access stride at each level loop in a vector which is pointed
   1269    by DR->aux.  For example:
   1270 
   1271      int arr[100][100][100];
   1272      for (i = 0; i < 100; i++)       ;(DR->aux)strides[0] = 40000
   1273        for (j = 100; j > 0; j--)     ;(DR->aux)strides[1] = 400
   1274 	 for (k = 0; k < 100; k++)   ;(DR->aux)strides[2] = 4
   1275 	   arr[i][j - 1][k] = 0;  */
   1276 
   1277 static void
   1278 compute_access_stride (struct loop *loop_nest, struct loop *loop,
   1279 		       data_reference_p dr)
   1280 {
   1281   vec<tree> *strides = new vec<tree> ();
   1282   basic_block bb = gimple_bb (DR_STMT (dr));
   1283 
   1284   while (!flow_bb_inside_loop_p (loop, bb))
   1285     {
   1286       strides->safe_push (build_int_cst (sizetype, 0));
   1287       loop = loop_outer (loop);
   1288     }
   1289   gcc_assert (loop == bb->loop_father);
   1290 
   1291   tree ref = DR_REF (dr);
   1292   if (TREE_CODE (ref) == COMPONENT_REF
   1293       && DECL_BIT_FIELD (TREE_OPERAND (ref, 1)))
   1294     {
   1295       /* We can't take address of bitfields.  If the bitfield is at constant
   1296 	 offset from the start of the struct, just use address of the
   1297 	 struct, for analysis of the strides that shouldn't matter.  */
   1298       if (!TREE_OPERAND (ref, 2)
   1299 	  || TREE_CODE (TREE_OPERAND (ref, 2)) == INTEGER_CST)
   1300 	ref = TREE_OPERAND (ref, 0);
   1301       /* Otherwise, if we have a bit field representative, use that.  */
   1302       else if (DECL_BIT_FIELD_REPRESENTATIVE (TREE_OPERAND (ref, 1))
   1303 	       != NULL_TREE)
   1304 	{
   1305 	  tree repr = DECL_BIT_FIELD_REPRESENTATIVE (TREE_OPERAND (ref, 1));
   1306 	  ref = build3 (COMPONENT_REF, TREE_TYPE (repr), TREE_OPERAND (ref, 0),
   1307 			repr, TREE_OPERAND (ref, 2));
   1308 	}
   1309       /* Otherwise punt.  */
   1310       else
   1311 	{
   1312 	  dr->aux = strides;
   1313 	  return;
   1314 	}
   1315     }
   1316   tree scev_base = build_fold_addr_expr (ref);
   1317   tree scev = analyze_scalar_evolution (loop, scev_base);
   1318   scev = instantiate_scev (loop_preheader_edge (loop_nest), loop, scev);
   1319   if (! chrec_contains_undetermined (scev))
   1320     {
   1321       tree sl = scev;
   1322       struct loop *expected = loop;
   1323       while (TREE_CODE (sl) == POLYNOMIAL_CHREC)
   1324 	{
   1325 	  struct loop *sl_loop = get_chrec_loop (sl);
   1326 	  while (sl_loop != expected)
   1327 	    {
   1328 	      strides->safe_push (size_int (0));
   1329 	      expected = loop_outer (expected);
   1330 	    }
   1331 	  strides->safe_push (CHREC_RIGHT (sl));
   1332 	  sl = CHREC_LEFT (sl);
   1333 	  expected = loop_outer (expected);
   1334 	}
   1335       if (! tree_contains_chrecs (sl, NULL))
   1336 	while (expected != loop_outer (loop_nest))
   1337 	  {
   1338 	    strides->safe_push (size_int (0));
   1339 	    expected = loop_outer (expected);
   1340 	  }
   1341     }
   1342 
   1343   dr->aux = strides;
   1344 }
   1345 
   1346 /* Given loop nest LOOP_NEST with innermost LOOP, the function computes
   1347    access strides with respect to each level loop for all data refs in
   1348    DATAREFS from inner loop to outer loop.  On success, it returns the
   1349    outermost loop that access strides can be computed successfully for
   1350    all data references.  If access strides cannot be computed at least
   1351    for two levels of loop for any data reference, it returns NULL.  */
   1352 
   1353 static struct loop *
   1354 compute_access_strides (struct loop *loop_nest, struct loop *loop,
   1355 			vec<data_reference_p> datarefs)
   1356 {
   1357   unsigned i, j, num_loops = (unsigned) -1;
   1358   data_reference_p dr;
   1359   vec<tree> *stride;
   1360 
   1361   for (i = 0; datarefs.iterate (i, &dr); ++i)
   1362     {
   1363       compute_access_stride (loop_nest, loop, dr);
   1364       stride = DR_ACCESS_STRIDE (dr);
   1365       if (stride->length () < num_loops)
   1366 	{
   1367 	  num_loops = stride->length ();
   1368 	  if (num_loops < 2)
   1369 	    return NULL;
   1370 	}
   1371     }
   1372 
   1373   for (i = 0; datarefs.iterate (i, &dr); ++i)
   1374     {
   1375       stride = DR_ACCESS_STRIDE (dr);
   1376       if (stride->length () > num_loops)
   1377 	stride->truncate (num_loops);
   1378 
   1379       for (j = 0; j < (num_loops >> 1); ++j)
   1380 	std::swap ((*stride)[j], (*stride)[num_loops - j - 1]);
   1381     }
   1382 
   1383   loop = superloop_at_depth (loop, loop_depth (loop) + 1 - num_loops);
   1384   gcc_assert (loop_nest == loop || flow_loop_nested_p (loop_nest, loop));
   1385   return loop;
   1386 }
   1387 
   1388 /* Prune access strides for data references in DATAREFS by removing strides
   1389    of loops that isn't in current LOOP_NEST.  */
   1390 
   1391 static void
   1392 prune_access_strides_not_in_loop (struct loop *loop_nest,
   1393 				  struct loop *innermost,
   1394 				  vec<data_reference_p> datarefs)
   1395 {
   1396   data_reference_p dr;
   1397   unsigned num_loops = loop_depth (innermost) - loop_depth (loop_nest) + 1;
   1398   gcc_assert (num_loops > 1);
   1399 
   1400   /* Block remove strides of loops that is not in current loop nest.  */
   1401   for (unsigned i = 0; datarefs.iterate (i, &dr); ++i)
   1402     {
   1403       vec<tree> *stride = DR_ACCESS_STRIDE (dr);
   1404       if (stride->length () > num_loops)
   1405 	stride->block_remove (0, stride->length () - num_loops);
   1406     }
   1407 }
   1408 
   1409 /* Dump access strides for all DATAREFS.  */
   1410 
   1411 static void
   1412 dump_access_strides (vec<data_reference_p> datarefs)
   1413 {
   1414   data_reference_p dr;
   1415   fprintf (dump_file, "Access Strides for DRs:\n");
   1416   for (unsigned i = 0; datarefs.iterate (i, &dr); ++i)
   1417     {
   1418       fprintf (dump_file, "  ");
   1419       print_generic_expr (dump_file, DR_REF (dr), TDF_SLIM);
   1420       fprintf (dump_file, ":\t\t<");
   1421 
   1422       vec<tree> *stride = DR_ACCESS_STRIDE (dr);
   1423       unsigned num_loops = stride->length ();
   1424       for (unsigned j = 0; j < num_loops; ++j)
   1425 	{
   1426 	  print_generic_expr (dump_file, (*stride)[j], TDF_SLIM);
   1427 	  fprintf (dump_file, "%s", (j < num_loops - 1) ? ",\t" : ">\n");
   1428 	}
   1429     }
   1430 }
   1431 
   1432 /* Return true if it's profitable to interchange two loops whose index
   1433    in whole loop nest vector are I_IDX/O_IDX respectively.  The function
   1434    computes and compares three types information from all DATAREFS:
   1435      1) Access stride for loop I_IDX and O_IDX.
   1436      2) Number of invariant memory references with respect to I_IDX before
   1437 	and after loop interchange.
   1438      3) Flags indicating if all memory references access sequential memory
   1439 	in ILOOP, before and after loop interchange.
   1440    If INNMOST_LOOP_P is true, the two loops for interchanging are the two
   1441    innermost loops in loop nest.  This function also dumps information if
   1442    DUMP_INFO_P is true.  */
   1443 
   1444 static bool
   1445 should_interchange_loops (unsigned i_idx, unsigned o_idx,
   1446 			  vec<data_reference_p> datarefs,
   1447 			  unsigned i_stmt_cost, unsigned o_stmt_cost,
   1448 			  bool innermost_loops_p, bool dump_info_p = true)
   1449 {
   1450   unsigned HOST_WIDE_INT ratio;
   1451   unsigned i, j, num_old_inv_drs = 0, num_new_inv_drs = 0;
   1452   struct data_reference *dr;
   1453   bool all_seq_dr_before_p = true, all_seq_dr_after_p = true;
   1454   widest_int iloop_strides = 0, oloop_strides = 0;
   1455   unsigned num_unresolved_drs = 0;
   1456   unsigned num_resolved_ok_drs = 0;
   1457   unsigned num_resolved_not_ok_drs = 0;
   1458 
   1459   if (dump_info_p && dump_file && (dump_flags & TDF_DETAILS))
   1460     fprintf (dump_file, "\nData ref strides:\n\tmem_ref:\t\tiloop\toloop\n");
   1461 
   1462   for (i = 0; datarefs.iterate (i, &dr); ++i)
   1463     {
   1464       vec<tree> *stride = DR_ACCESS_STRIDE (dr);
   1465       tree iloop_stride = (*stride)[i_idx], oloop_stride = (*stride)[o_idx];
   1466 
   1467       bool subloop_stride_p = false;
   1468       /* Data ref can't be invariant or sequential access at current loop if
   1469 	 its address changes with respect to any subloops.  */
   1470       for (j = i_idx + 1; j < stride->length (); ++j)
   1471 	if (!integer_zerop ((*stride)[j]))
   1472 	  {
   1473 	    subloop_stride_p = true;
   1474 	    break;
   1475 	  }
   1476 
   1477       if (integer_zerop (iloop_stride))
   1478 	{
   1479 	  if (!subloop_stride_p)
   1480 	    num_old_inv_drs++;
   1481 	}
   1482       if (integer_zerop (oloop_stride))
   1483 	{
   1484 	  if (!subloop_stride_p)
   1485 	    num_new_inv_drs++;
   1486 	}
   1487 
   1488       if (TREE_CODE (iloop_stride) == INTEGER_CST
   1489 	  && TREE_CODE (oloop_stride) == INTEGER_CST)
   1490 	{
   1491 	  iloop_strides = wi::add (iloop_strides, wi::to_widest (iloop_stride));
   1492 	  oloop_strides = wi::add (oloop_strides, wi::to_widest (oloop_stride));
   1493 	}
   1494       else if (multiple_of_p (TREE_TYPE (iloop_stride),
   1495 			      iloop_stride, oloop_stride))
   1496 	num_resolved_ok_drs++;
   1497       else if (multiple_of_p (TREE_TYPE (iloop_stride),
   1498 			      oloop_stride, iloop_stride))
   1499 	num_resolved_not_ok_drs++;
   1500       else
   1501 	num_unresolved_drs++;
   1502 
   1503       /* Data ref can't be sequential access if its address changes in sub
   1504 	 loop.  */
   1505       if (subloop_stride_p)
   1506 	{
   1507 	  all_seq_dr_before_p = false;
   1508 	  all_seq_dr_after_p = false;
   1509 	  continue;
   1510 	}
   1511       /* Track if all data references are sequential accesses before/after loop
   1512 	 interchange.  Note invariant is considered sequential here.  */
   1513       tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
   1514       if (all_seq_dr_before_p
   1515 	  && ! (integer_zerop (iloop_stride)
   1516 		|| operand_equal_p (access_size, iloop_stride, 0)))
   1517 	all_seq_dr_before_p = false;
   1518       if (all_seq_dr_after_p
   1519 	  && ! (integer_zerop (oloop_stride)
   1520 		|| operand_equal_p (access_size, oloop_stride, 0)))
   1521 	all_seq_dr_after_p = false;
   1522     }
   1523 
   1524   if (dump_info_p && dump_file && (dump_flags & TDF_DETAILS))
   1525     {
   1526       fprintf (dump_file, "\toverall:\t\t");
   1527       print_decu (iloop_strides, dump_file);
   1528       fprintf (dump_file, "\t");
   1529       print_decu (oloop_strides, dump_file);
   1530       fprintf (dump_file, "\n");
   1531 
   1532       fprintf (dump_file, "Invariant data ref: before(%d), after(%d)\n",
   1533 	       num_old_inv_drs, num_new_inv_drs);
   1534       fprintf (dump_file, "All consecutive stride: before(%s), after(%s)\n",
   1535 	       all_seq_dr_before_p ? "true" : "false",
   1536 	       all_seq_dr_after_p ? "true" : "false");
   1537       fprintf (dump_file, "OK to interchage with variable strides: %d\n",
   1538 	       num_resolved_ok_drs);
   1539       fprintf (dump_file, "Not OK to interchage with variable strides: %d\n",
   1540 	       num_resolved_not_ok_drs);
   1541       fprintf (dump_file, "Variable strides we cannot decide: %d\n",
   1542 	       num_unresolved_drs);
   1543       fprintf (dump_file, "Stmt cost of inner loop: %d\n", i_stmt_cost);
   1544       fprintf (dump_file, "Stmt cost of outer loop: %d\n", o_stmt_cost);
   1545     }
   1546 
   1547   if (num_unresolved_drs != 0 || num_resolved_not_ok_drs != 0)
   1548     return false;
   1549 
   1550   /* Stmts of outer loop will be moved to inner loop.  If there are two many
   1551      such stmts, it could make inner loop costly.  Here we compare stmt cost
   1552      between outer and inner loops.  */
   1553   if (i_stmt_cost && o_stmt_cost
   1554       && num_old_inv_drs + o_stmt_cost > num_new_inv_drs
   1555       && o_stmt_cost * STMT_COST_RATIO > i_stmt_cost)
   1556     return false;
   1557 
   1558   /* We use different stride comparison ratio for interchanging innermost
   1559      two loops or not.  The idea is to be conservative in interchange for
   1560      the innermost loops.  */
   1561   ratio = innermost_loops_p ? INNER_STRIDE_RATIO : OUTER_STRIDE_RATIO;
   1562   /* Do interchange if it gives better data locality behavior.  */
   1563   if (wi::gtu_p (iloop_strides, wi::mul (oloop_strides, ratio)))
   1564     return true;
   1565   if (wi::gtu_p (iloop_strides, oloop_strides))
   1566     {
   1567       /* Or it creates more invariant memory references.  */
   1568       if ((!all_seq_dr_before_p || all_seq_dr_after_p)
   1569 	  && num_new_inv_drs > num_old_inv_drs)
   1570 	return true;
   1571       /* Or it makes all memory references sequential.  */
   1572       if (num_new_inv_drs >= num_old_inv_drs
   1573 	  && !all_seq_dr_before_p && all_seq_dr_after_p)
   1574 	return true;
   1575     }
   1576 
   1577   return false;
   1578 }
   1579 
   1580 /* Try to interchange inner loop of a loop nest to outer level.  */
   1581 
   1582 bool
   1583 tree_loop_interchange::interchange (vec<data_reference_p> datarefs,
   1584 				    vec<ddr_p> ddrs)
   1585 {
   1586   location_t loc = find_loop_location (m_loop_nest[0]);
   1587   bool changed_p = false;
   1588   /* In each iteration we try to interchange I-th loop with (I+1)-th loop.
   1589      The overall effect is to push inner loop to outermost level in whole
   1590      loop nest.  */
   1591   for (unsigned i = m_loop_nest.length (); i > 1; --i)
   1592     {
   1593       unsigned i_idx = i - 1, o_idx = i - 2;
   1594 
   1595       /* Check validity for loop interchange.  */
   1596       if (!valid_data_dependences (i_idx, o_idx, ddrs))
   1597 	break;
   1598 
   1599       loop_cand iloop (m_loop_nest[i_idx], m_loop_nest[o_idx]);
   1600       loop_cand oloop (m_loop_nest[o_idx], m_loop_nest[o_idx]);
   1601 
   1602       /* Check if we can do transformation for loop interchange.  */
   1603       if (!iloop.analyze_carried_vars (NULL)
   1604 	  || !iloop.analyze_lcssa_phis ()
   1605 	  || !oloop.analyze_carried_vars (&iloop)
   1606 	  || !oloop.analyze_lcssa_phis ()
   1607 	  || !iloop.can_interchange_p (NULL)
   1608 	  || !oloop.can_interchange_p (&iloop))
   1609 	break;
   1610 
   1611       /* Outer loop's stmts will be moved to inner loop during interchange.
   1612 	 If there are many of them, it may make inner loop very costly.  We
   1613 	 need to check number of outer loop's stmts in profit cost model of
   1614 	 interchange.  */
   1615       int stmt_cost = oloop.m_num_stmts;
   1616       /* Count out the exit checking stmt of outer loop.  */
   1617       stmt_cost --;
   1618       /* Count out IV's increasing stmt, IVOPTs takes care if it.  */
   1619       stmt_cost -= oloop.m_inductions.length ();
   1620       /* Count in the additional load and cond_expr stmts caused by inner
   1621 	 loop's constant initialized reduction.  */
   1622       stmt_cost += iloop.m_const_init_reduc * 2;
   1623       if (stmt_cost < 0)
   1624 	stmt_cost = 0;
   1625 
   1626       /* Check profitability for loop interchange.  */
   1627       if (should_interchange_loops (i_idx, o_idx, datarefs,
   1628 				    (unsigned) iloop.m_num_stmts,
   1629 				    (unsigned) stmt_cost,
   1630 				    iloop.m_loop->inner == NULL))
   1631 	{
   1632 	  if (dump_file && (dump_flags & TDF_DETAILS))
   1633 	    fprintf (dump_file,
   1634 		     "Loop_pair<outer:%d, inner:%d> is interchanged\n\n",
   1635 		     oloop.m_loop->num, iloop.m_loop->num);
   1636 
   1637 	  changed_p = true;
   1638 	  interchange_loops (iloop, oloop);
   1639 	  /* No need to update if there is no further loop interchange.  */
   1640 	  if (o_idx > 0)
   1641 	    update_data_info (i_idx, o_idx, datarefs, ddrs);
   1642 	}
   1643       else
   1644 	{
   1645 	  if (dump_file && (dump_flags & TDF_DETAILS))
   1646 	    fprintf (dump_file,
   1647 		     "Loop_pair<outer:%d, inner:%d> is not interchanged\n\n",
   1648 		     oloop.m_loop->num, iloop.m_loop->num);
   1649 	}
   1650     }
   1651   simple_dce_from_worklist (m_dce_seeds);
   1652 
   1653   if (changed_p)
   1654     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
   1655 		     "loops interchanged in loop nest\n");
   1656 
   1657   return changed_p;
   1658 }
   1659 
   1660 
   1661 /* Loop interchange pass.  */
   1662 
   1663 namespace {
   1664 
   1665 const pass_data pass_data_linterchange =
   1666 {
   1667   GIMPLE_PASS, /* type */
   1668   "linterchange", /* name */
   1669   OPTGROUP_LOOP, /* optinfo_flags */
   1670   TV_LINTERCHANGE, /* tv_id */
   1671   PROP_cfg, /* properties_required */
   1672   0, /* properties_provided */
   1673   0, /* properties_destroyed */
   1674   0, /* todo_flags_start */
   1675   0, /* todo_flags_finish */
   1676 };
   1677 
   1678 class pass_linterchange : public gimple_opt_pass
   1679 {
   1680 public:
   1681   pass_linterchange (gcc::context *ctxt)
   1682     : gimple_opt_pass (pass_data_linterchange, ctxt)
   1683   {}
   1684 
   1685   /* opt_pass methods: */
   1686   opt_pass * clone () { return new pass_linterchange (m_ctxt); }
   1687   virtual bool gate (function *) { return flag_loop_interchange; }
   1688   virtual unsigned int execute (function *);
   1689 
   1690 }; // class pass_linterchange
   1691 
   1692 
   1693 /* Return true if LOOP has proper form for interchange.  We check three
   1694    conditions in the function:
   1695      1) In general, a loop can be interchanged only if it doesn't have
   1696 	basic blocks other than header, exit and latch besides possible
   1697 	inner loop nest.  This basically restricts loop interchange to
   1698 	below form loop nests:
   1699 
   1700           header<---+
   1701             |       |
   1702             v       |
   1703         INNER_LOOP  |
   1704             |       |
   1705             v       |
   1706           exit--->latch
   1707 
   1708      2) Data reference in basic block that executes in different times
   1709 	than loop head/exit is not allowed.
   1710      3) Record the innermost outer loop that doesn't form rectangle loop
   1711 	nest with LOOP.  */
   1712 
   1713 static bool
   1714 proper_loop_form_for_interchange (struct loop *loop, struct loop **min_outer)
   1715 {
   1716   edge e0, e1, exit;
   1717 
   1718   /* Don't interchange if loop has unsupported information for the moment.  */
   1719   if (loop->safelen > 0
   1720       || loop->constraints != 0
   1721       || loop->can_be_parallel
   1722       || loop->dont_vectorize
   1723       || loop->force_vectorize
   1724       || loop->in_oacc_kernels_region
   1725       || loop->orig_loop_num != 0
   1726       || loop->simduid != NULL_TREE)
   1727     return false;
   1728 
   1729   /* Don't interchange if outer loop has basic block other than header, exit
   1730      and latch.  */
   1731   if (loop->inner != NULL
   1732       && loop->num_nodes != loop->inner->num_nodes + 3)
   1733     return false;
   1734 
   1735   if ((exit = single_dom_exit (loop)) == NULL)
   1736     return false;
   1737 
   1738   /* Check control flow on loop header/exit blocks.  */
   1739   if (loop->header == exit->src
   1740       && (EDGE_COUNT (loop->header->preds) != 2
   1741 	  || EDGE_COUNT (loop->header->succs) != 2))
   1742     return false;
   1743   else if (loop->header != exit->src
   1744 	   && (EDGE_COUNT (loop->header->preds) != 2
   1745 	       || !single_succ_p (loop->header)
   1746 	       || unsupported_edge (single_succ_edge (loop->header))
   1747 	       || EDGE_COUNT (exit->src->succs) != 2
   1748 	       || !single_pred_p (exit->src)
   1749 	       || unsupported_edge (single_pred_edge (exit->src))))
   1750     return false;
   1751 
   1752   e0 = EDGE_PRED (loop->header, 0);
   1753   e1 = EDGE_PRED (loop->header, 1);
   1754   if (unsupported_edge (e0) || unsupported_edge (e1)
   1755       || (e0->src != loop->latch && e1->src != loop->latch)
   1756       || (e0->src->loop_father == loop && e1->src->loop_father == loop))
   1757     return false;
   1758 
   1759   e0 = EDGE_SUCC (exit->src, 0);
   1760   e1 = EDGE_SUCC (exit->src, 1);
   1761   if (unsupported_edge (e0) || unsupported_edge (e1)
   1762       || (e0->dest != loop->latch && e1->dest != loop->latch)
   1763       || (e0->dest->loop_father == loop && e1->dest->loop_father == loop))
   1764     return false;
   1765 
   1766   /* Don't interchange if any reference is in basic block that doesn't
   1767      dominate exit block.  */
   1768   basic_block *bbs = get_loop_body (loop);
   1769   for (unsigned i = 0; i < loop->num_nodes; i++)
   1770     {
   1771       basic_block bb = bbs[i];
   1772 
   1773       if (bb->loop_father != loop
   1774 	  || bb == loop->header || bb == exit->src
   1775 	  || dominated_by_p (CDI_DOMINATORS, exit->src, bb))
   1776 	continue;
   1777 
   1778       for (gimple_stmt_iterator gsi = gsi_start_nondebug_bb (bb);
   1779 	   !gsi_end_p (gsi); gsi_next_nondebug (&gsi))
   1780 	if (gimple_vuse (gsi_stmt (gsi)))
   1781 	  {
   1782 	    free (bbs);
   1783 	    return false;
   1784 	  }
   1785     }
   1786   free (bbs);
   1787 
   1788   tree niters = number_of_latch_executions (loop);
   1789   niters = analyze_scalar_evolution (loop_outer (loop), niters);
   1790   if (!niters || chrec_contains_undetermined (niters))
   1791     return false;
   1792 
   1793   /* Record the innermost outer loop that doesn't form rectangle loop nest.  */
   1794   for (loop_p loop2 = loop_outer (loop);
   1795        loop2 && flow_loop_nested_p (*min_outer, loop2);
   1796        loop2 = loop_outer (loop2))
   1797     {
   1798       niters = instantiate_scev (loop_preheader_edge (loop2),
   1799 				 loop_outer (loop), niters);
   1800       if (!evolution_function_is_invariant_p (niters, loop2->num))
   1801 	{
   1802 	  *min_outer = loop2;
   1803 	  break;
   1804 	}
   1805     }
   1806   return true;
   1807 }
   1808 
   1809 /* Return true if any two adjacent loops in loop nest [INNERMOST, LOOP_NEST]
   1810    should be interchanged by looking into all DATAREFS.  */
   1811 
   1812 static bool
   1813 should_interchange_loop_nest (struct loop *loop_nest, struct loop *innermost,
   1814 			      vec<data_reference_p> datarefs)
   1815 {
   1816   unsigned idx = loop_depth (innermost) - loop_depth (loop_nest);
   1817   gcc_assert (idx > 0);
   1818 
   1819   /* Check if any two adjacent loops should be interchanged.  */
   1820   for (struct loop *loop = innermost;
   1821        loop != loop_nest; loop = loop_outer (loop), idx--)
   1822     if (should_interchange_loops (idx, idx - 1, datarefs, 0, 0,
   1823 				  loop == innermost, false))
   1824       return true;
   1825 
   1826   return false;
   1827 }
   1828 
   1829 /* Given loop nest LOOP_NEST and data references DATAREFS, compute data
   1830    dependences for loop interchange and store it in DDRS.  Note we compute
   1831    dependences directly rather than call generic interface so that we can
   1832    return on unknown dependence instantly.  */
   1833 
   1834 static bool
   1835 tree_loop_interchange_compute_ddrs (vec<loop_p> loop_nest,
   1836 				    vec<data_reference_p> datarefs,
   1837 				    vec<ddr_p> *ddrs)
   1838 {
   1839   struct data_reference *a, *b;
   1840   struct loop *innermost = loop_nest.last ();
   1841 
   1842   for (unsigned i = 0; datarefs.iterate (i, &a); ++i)
   1843     {
   1844       bool a_outer_p = gimple_bb (DR_STMT (a))->loop_father != innermost;
   1845       for (unsigned j = i + 1; datarefs.iterate (j, &b); ++j)
   1846 	if (DR_IS_WRITE (a) || DR_IS_WRITE (b))
   1847 	  {
   1848 	    bool b_outer_p = gimple_bb (DR_STMT (b))->loop_father != innermost;
   1849 	    /* Don't support multiple write references in outer loop.  */
   1850 	    if (a_outer_p && b_outer_p && DR_IS_WRITE (a) && DR_IS_WRITE (b))
   1851 	      return false;
   1852 
   1853 	    ddr_p ddr = initialize_data_dependence_relation (a, b, loop_nest);
   1854 	    ddrs->safe_push (ddr);
   1855 	    compute_affine_dependence (ddr, loop_nest[0]);
   1856 
   1857 	    /* Give up if ddr is unknown dependence or classic direct vector
   1858 	       is not available.  */
   1859 	    if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
   1860 		|| (DDR_ARE_DEPENDENT (ddr) == NULL_TREE
   1861 		    && DDR_NUM_DIR_VECTS (ddr) == 0))
   1862 	      return false;
   1863 
   1864 	    /* If either data references is in outer loop of nest, we require
   1865 	       no dependence here because the data reference need to be moved
   1866 	       into inner loop during interchange.  */
   1867 	    if (a_outer_p && b_outer_p
   1868 		&& operand_equal_p (DR_REF (a), DR_REF (b), 0))
   1869 	      continue;
   1870 	    if (DDR_ARE_DEPENDENT (ddr) != chrec_known
   1871 		&& (a_outer_p || b_outer_p))
   1872 	      return false;
   1873 	}
   1874     }
   1875 
   1876   return true;
   1877 }
   1878 
   1879 /* Prune DATAREFS by removing any data reference not inside of LOOP.  */
   1880 
   1881 static inline void
   1882 prune_datarefs_not_in_loop (struct loop *loop, vec<data_reference_p> datarefs)
   1883 {
   1884   unsigned i, j;
   1885   struct data_reference *dr;
   1886 
   1887   for (i = 0, j = 0; datarefs.iterate (i, &dr); ++i)
   1888     {
   1889       if (flow_bb_inside_loop_p (loop, gimple_bb (DR_STMT (dr))))
   1890 	datarefs[j++] = dr;
   1891       else
   1892 	{
   1893 	  if (dr->aux)
   1894 	    {
   1895 	      DR_ACCESS_STRIDE (dr)->release ();
   1896 	      delete (vec<tree> *) dr->aux;
   1897 	    }
   1898 	  free_data_ref (dr);
   1899 	}
   1900     }
   1901   datarefs.truncate (j);
   1902 }
   1903 
   1904 /* Find and store data references in DATAREFS for LOOP nest.  If there's
   1905    difficult data reference in a basic block, we shrink the loop nest to
   1906    inner loop of that basic block's father loop.  On success, return the
   1907    outer loop of the result loop nest.  */
   1908 
   1909 static struct loop *
   1910 prepare_data_references (struct loop *loop, vec<data_reference_p> *datarefs)
   1911 {
   1912   struct loop *loop_nest = loop;
   1913   vec<data_reference_p> *bb_refs;
   1914   basic_block bb, *bbs = get_loop_body_in_dom_order (loop);
   1915 
   1916   for (unsigned i = 0; i < loop->num_nodes; i++)
   1917     bbs[i]->aux = NULL;
   1918 
   1919   /* Find data references for all basic blocks.  Shrink loop nest on difficult
   1920      data reference.  */
   1921   for (unsigned i = 0; loop_nest && i < loop->num_nodes; ++i)
   1922     {
   1923       bb = bbs[i];
   1924       if (!flow_bb_inside_loop_p (loop_nest, bb))
   1925 	continue;
   1926 
   1927       bb_refs = new vec<data_reference_p> ();
   1928       if (find_data_references_in_bb (loop, bb, bb_refs) == chrec_dont_know)
   1929         {
   1930 	  loop_nest = bb->loop_father->inner;
   1931 	  if (loop_nest && !loop_nest->inner)
   1932 	    loop_nest = NULL;
   1933 
   1934 	  free_data_refs (*bb_refs);
   1935           delete bb_refs;
   1936         }
   1937       else if (bb_refs->is_empty ())
   1938 	delete bb_refs;
   1939       else
   1940 	bb->aux = bb_refs;
   1941     }
   1942 
   1943   /* Collect all data references in loop nest.  */
   1944   for (unsigned i = 0; i < loop->num_nodes; i++)
   1945     {
   1946       bb = bbs[i];
   1947       if (!bb->aux)
   1948 	continue;
   1949 
   1950       bb_refs = (vec<data_reference_p> *) bb->aux;
   1951       if (loop_nest && flow_bb_inside_loop_p (loop_nest, bb))
   1952 	datarefs->safe_splice (*bb_refs);
   1953       else
   1954 	free_data_refs (*bb_refs);
   1955 
   1956       delete bb_refs;
   1957       bb->aux = NULL;
   1958     }
   1959   free (bbs);
   1960 
   1961   return loop_nest;
   1962 }
   1963 
   1964 /* Given innermost LOOP, return true if perfect loop nest can be found and
   1965    data dependences can be computed.  If succeed, record the perfect loop
   1966    nest in LOOP_NEST; record all data references in DATAREFS and record all
   1967    data dependence relations in DDRS.
   1968 
   1969    We do support a restricted form of imperfect loop nest, i.e, loop nest
   1970    with load/store in outer loop initializing/finalizing simple reduction
   1971    of the innermost loop.  For such outer loop reference, we require that
   1972    it has no dependence with others sinve it will be moved to inner loop
   1973    in interchange.  */
   1974 
   1975 static bool
   1976 prepare_perfect_loop_nest (struct loop *loop, vec<loop_p> *loop_nest,
   1977 			   vec<data_reference_p> *datarefs, vec<ddr_p> *ddrs)
   1978 {
   1979   struct loop *start_loop = NULL, *innermost = loop;
   1980   struct loop *outermost = loops_for_fn (cfun)->tree_root;
   1981 
   1982   /* Find loop nest from the innermost loop.  The outermost is the innermost
   1983      outer*/
   1984   while (loop->num != 0 && loop->inner == start_loop
   1985 	 && flow_loop_nested_p (outermost, loop))
   1986     {
   1987       if (!proper_loop_form_for_interchange (loop, &outermost))
   1988 	break;
   1989 
   1990       start_loop = loop;
   1991       /* If this loop has sibling loop, the father loop won't be in perfect
   1992 	 loop nest.  */
   1993       if (loop->next != NULL)
   1994 	break;
   1995 
   1996       loop = loop_outer (loop);
   1997     }
   1998   if (!start_loop || !start_loop->inner)
   1999     return false;
   2000 
   2001   /* Prepare the data reference vector for the loop nest, pruning outer
   2002      loops we cannot handle.  */
   2003   start_loop = prepare_data_references (start_loop, datarefs);
   2004   if (!start_loop
   2005       /* Check if there is no data reference.  */
   2006       || datarefs->is_empty ()
   2007       /* Check if there are too many of data references.  */
   2008       || (int) datarefs->length () > MAX_DATAREFS)
   2009     return false;
   2010 
   2011   /* Compute access strides for all data references, pruning outer
   2012      loops we cannot analyze refs in.  */
   2013   start_loop = compute_access_strides (start_loop, innermost, *datarefs);
   2014   if (!start_loop)
   2015     return false;
   2016 
   2017   /* Check if any interchange is profitable in the loop nest.  */
   2018   if (!should_interchange_loop_nest (start_loop, innermost, *datarefs))
   2019     return false;
   2020 
   2021   /* Check if data dependences can be computed for loop nest starting from
   2022      start_loop.  */
   2023   loop = start_loop;
   2024   do {
   2025     loop_nest->truncate (0);
   2026 
   2027     if (loop != start_loop)
   2028       prune_datarefs_not_in_loop (start_loop, *datarefs);
   2029 
   2030     if (find_loop_nest (start_loop, loop_nest)
   2031 	&& tree_loop_interchange_compute_ddrs (*loop_nest, *datarefs, ddrs))
   2032       {
   2033 	if (dump_file && (dump_flags & TDF_DETAILS))
   2034 	  fprintf (dump_file,
   2035 		   "\nConsider loop interchange for loop_nest<%d - %d>\n",
   2036 		   start_loop->num, innermost->num);
   2037 
   2038 	if (loop != start_loop)
   2039 	  prune_access_strides_not_in_loop (start_loop, innermost, *datarefs);
   2040 
   2041 	if (dump_file && (dump_flags & TDF_DETAILS))
   2042 	  dump_access_strides (*datarefs);
   2043 
   2044 	return true;
   2045       }
   2046 
   2047     free_dependence_relations (*ddrs);
   2048     *ddrs = vNULL;
   2049     /* Try to compute data dependences with the outermost loop stripped.  */
   2050     loop = start_loop;
   2051     start_loop = start_loop->inner;
   2052   } while (start_loop && start_loop->inner);
   2053 
   2054   return false;
   2055 }
   2056 
   2057 /* Main entry for loop interchange pass.  */
   2058 
   2059 unsigned int
   2060 pass_linterchange::execute (function *fun)
   2061 {
   2062   if (number_of_loops (fun) <= 2)
   2063     return 0;
   2064 
   2065   bool changed_p = false;
   2066   struct loop *loop;
   2067   FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
   2068     {
   2069       vec<loop_p> loop_nest = vNULL;
   2070       vec<data_reference_p> datarefs = vNULL;
   2071       vec<ddr_p> ddrs = vNULL;
   2072       if (prepare_perfect_loop_nest (loop, &loop_nest, &datarefs, &ddrs))
   2073 	{
   2074 	  tree_loop_interchange loop_interchange (loop_nest);
   2075 	  changed_p |= loop_interchange.interchange (datarefs, ddrs);
   2076 	}
   2077       free_dependence_relations (ddrs);
   2078       free_data_refs_with_aux (datarefs);
   2079       loop_nest.release ();
   2080     }
   2081 
   2082   return changed_p ? (TODO_update_ssa_only_virtuals) : 0;
   2083 }
   2084 
   2085 } // anon namespace
   2086 
   2087 gimple_opt_pass *
   2088 make_pass_linterchange (gcc::context *ctxt)
   2089 {
   2090   return new pass_linterchange (ctxt);
   2091 }
   2092