tree-loop-distribution.cc revision 1.1.1.1
      1 /* Loop distribution.
      2    Copyright (C) 2006-2022 Free Software Foundation, Inc.
      3    Contributed by Georges-Andre Silber <Georges-Andre.Silber (at) ensmp.fr>
      4    and Sebastian Pop <sebastian.pop (at) amd.com>.
      5 
      6 This file is part of GCC.
      7 
      8 GCC is free software; you can redistribute it and/or modify it
      9 under the terms of the GNU General Public License as published by the
     10 Free Software Foundation; either version 3, or (at your option) any
     11 later version.
     12 
     13 GCC is distributed in the hope that it will be useful, but WITHOUT
     14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
     15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     16 for more details.
     17 
     18 You should have received a copy of the GNU General Public License
     19 along with GCC; see the file COPYING3.  If not see
     20 <http://www.gnu.org/licenses/>.  */
     21 
     22 /* This pass performs loop distribution: for example, the loop
     23 
     24    |DO I = 2, N
     25    |    A(I) = B(I) + C
     26    |    D(I) = A(I-1)*E
     27    |ENDDO
     28 
     29    is transformed to
     30 
     31    |DOALL I = 2, N
     32    |   A(I) = B(I) + C
     33    |ENDDO
     34    |
     35    |DOALL I = 2, N
     36    |   D(I) = A(I-1)*E
     37    |ENDDO
     38 
     39    Loop distribution is the dual of loop fusion.  It separates statements
     40    of a loop (or loop nest) into multiple loops (or loop nests) with the
     41    same loop header.  The major goal is to separate statements that can
     42    be vectorized from those that cannot.  This pass implements distribution
     43    in the following steps:
     44 
     45      1) Seed partitions with statements of specific types.  For now we
     46 	support two types of seed statements: statements defining a variable
     47 	used outside of the loop, and statements storing to memory.
     48      2) Build the reduced dependence graph (RDG) for the loop to be
     49 	distributed.  The vertices (RDG:V) model all statements in the loop;
     50 	the edges (RDG:E) model flow and control dependencies between them.
     51      3) Apart from RDG, compute data dependencies between memory references.
     52      4) Starting from a seed statement, build up a partition by adding
     53 	dependent statements according to the RDG's dependence information.
     54 	A partition is classified as parallel if it can be executed in
     55 	parallel, or as sequential if it cannot.  A parallel partition is
     56 	further classified into different builtin kinds if it can be
     57 	implemented as builtin function calls.
     58      5) Build the partition dependence graph (PG) based on data
     59 	dependencies.  The vertices (PG:V) model all partitions and the
     60 	edges (PG:E) model all data dependencies between every pair of
     61 	partitions.  In general, a data dependence is either known or
     62 	unknown at compilation time.  In C family languages there are
     63 	quite a number of compilation-time unknown dependencies, because
     64 	of the possible aliasing of data references.  We categorize PG's
     65 	edges into two types: "true" edges for compilation-time known
     66 	data dependencies, and "alias" edges for all other dependencies.
     67      6) Traverse the subgraph of PG as if no "alias" edges existed.  Merge
     68 	the partitions in each strongly connected component (SCC) accordingly.
     69 	Build a new PG for the merged partitions.
     70      7) Traverse PG again, this time with both "true" and "alias" edges
     71 	included.  We try to break SCCs by removing some edges.  Because
     72 	SCCs formed by "true" edges were all fused in step 6), we can break
     73 	SCCs by removing only "alias" edges.  Choosing an optimal edge set
     74 	is NP-hard; fortunately, a simple approximation is good enough for
     75 	us given the small problem scale.
     76      8) Collect all data dependencies of the removed "alias" edges.  Create
     77 	runtime alias checks for collected data dependencies.
     78      9) Version the loop under the condition of runtime alias checks (see
     79 	the sketch below).  Since loop distribution generally introduces
     80 	overhead, it is only useful if vectorization is achieved in at least
     81 	one distributed loop.  We version the loop with the internal function
     82 	call IFN_LOOP_DIST_ALIAS.  If no distributed loop can be vectorized,
     83 	we simply remove the distributed loops and recover the original one.
     84 
     85    TODO:
     86      1) We only distribute innermost two-level loop nests now.  We should
     87 	extend this to arbitrary loop nests in the future.
     88      2) We only fuse partitions in SCC now.  A better fusion algorithm is
     89 	desired to minimize loop overhead, maximize parallelism and maximize
     90 	data reuse.  */
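
        /* For illustration, a rough sketch (not the exact generated IL) of
           the runtime-alias versioning described in steps 5) - 9) above,
           assuming the arrays A and D from the example may alias:

           |IF (runtime alias check on A and D passes) THEN
           |   DOALL I = 2, N        <- distributed loops, vectorizable
           |      A(I) = B(I) + C
           |   ENDDO
           |   DOALL I = 2, N
           |      D(I) = A(I-1)*E
           |   ENDDO
           |ELSE
           |   DO I = 2, N           <- original loop kept as fallback
           |      A(I) = B(I) + C
           |      D(I) = A(I-1)*E
           |   ENDDO
           |ENDIF

           In the actual IL the condition is the internal function call
           IFN_LOOP_DIST_ALIAS, which is later folded to true or false
           depending on whether any distributed loop got vectorized.  */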
     91 
     92 #include "config.h"
     93 #include "system.h"
     94 #include "coretypes.h"
     95 #include "backend.h"
     96 #include "tree.h"
     97 #include "gimple.h"
     98 #include "cfghooks.h"
     99 #include "tree-pass.h"
    100 #include "ssa.h"
    101 #include "gimple-pretty-print.h"
    102 #include "fold-const.h"
    103 #include "cfganal.h"
    104 #include "gimple-iterator.h"
    105 #include "gimplify-me.h"
    106 #include "stor-layout.h"
    107 #include "tree-cfg.h"
    108 #include "tree-ssa-loop-manip.h"
    109 #include "tree-ssa-loop-ivopts.h"
    110 #include "tree-ssa-loop.h"
    111 #include "tree-into-ssa.h"
    112 #include "tree-ssa.h"
    113 #include "cfgloop.h"
    114 #include "tree-scalar-evolution.h"
    115 #include "tree-vectorizer.h"
    116 #include "tree-eh.h"
    117 #include "gimple-fold.h"
    118 #include "tree-affine.h"
    119 #include "intl.h"
    120 #include "rtl.h"
    121 #include "memmodel.h"
    122 #include "optabs.h"
    123 
    124 
    125 #define MAX_DATAREFS_NUM \
    126 	((unsigned) param_loop_max_datarefs_for_datadeps)
    127 
    128 /* Threshold controlling the number of distributed partitions.  Since it
    129    may become unnecessary if a memory stream cost model is invented in the
    130    future, we define it as a temporary macro rather than a parameter.  */
    131 #define NUM_PARTITION_THRESHOLD (4)
    132 
    133 /* Hashtable helpers.  */
    134 
    135 struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation>
    136 {
    137   static inline hashval_t hash (const data_dependence_relation *);
    138   static inline bool equal (const data_dependence_relation *,
    139 			    const data_dependence_relation *);
    140 };
    141 
    142 /* Hash function for data dependence.  */
    143 
    144 inline hashval_t
    145 ddr_hasher::hash (const data_dependence_relation *ddr)
    146 {
    147   inchash::hash h;
    148   h.add_ptr (DDR_A (ddr));
    149   h.add_ptr (DDR_B (ddr));
    150   return h.end ();
    151 }
    152 
    153 /* Hash table equality function for data dependence.  */
    154 
    155 inline bool
    156 ddr_hasher::equal (const data_dependence_relation *ddr1,
    157 		   const data_dependence_relation *ddr2)
    158 {
    159   return (DDR_A (ddr1) == DDR_A (ddr2) && DDR_B (ddr1) == DDR_B (ddr2));
    160 }
    161 
    162 
    163 
    164 #define DR_INDEX(dr)      ((uintptr_t) (dr)->aux)
    165 
    166 /* A Reduced Dependence Graph (RDG) vertex representing a statement.  */
    167 struct rdg_vertex
    168 {
    169   /* The statement represented by this vertex.  */
    170   gimple *stmt;
    171 
    172   /* Vector of data-references in this statement.  */
    173   vec<data_reference_p> datarefs;
    174 
    175   /* True when the statement contains a write to memory.  */
    176   bool has_mem_write;
    177 
    178   /* True when the statement contains a read from memory.  */
    179   bool has_mem_reads;
    180 };
    181 
    182 #define RDGV_STMT(V)     ((struct rdg_vertex *) ((V)->data))->stmt
    183 #define RDGV_DATAREFS(V) ((struct rdg_vertex *) ((V)->data))->datarefs
    184 #define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write
    185 #define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads
    186 #define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I]))
    187 #define RDG_DATAREFS(RDG, I) RDGV_DATAREFS (&(RDG->vertices[I]))
    188 #define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I]))
    189 #define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I]))
    190 
    191 /* Data dependence type.  */
    192 
    193 enum rdg_dep_type
    194 {
    195   /* Read After Write (RAW).  */
    196   flow_dd = 'f',
    197 
    198   /* Control dependence (execute conditional on).  */
    199   control_dd = 'c'
    200 };
    201 
    202 /* Dependence information attached to an edge of the RDG.  */
    203 
    204 struct rdg_edge
    205 {
    206   /* Type of the dependence.  */
    207   enum rdg_dep_type type;
    208 };
    209 
    210 #define RDGE_TYPE(E)        ((struct rdg_edge *) ((E)->data))->type
    211 
    212 /* Kind of distributed loop.  */
    213 enum partition_kind {
    214     PKIND_NORMAL,
    215     /* Partial memset means a partition can be distributed into a loop
    216        of memset calls, rather than a single memset call.  It's handled
    217        just like a normal partition, i.e., distributed as a separate
    218        loop; no memset call is generated.
    219 
    220        Note: This is a hackish fix trying to distribute a ZERO-ing stmt
    221        in a loop nest as deeply as possible (see the sketch after this
    222        enum); parloops then achieves better parallelization by working
    223        on a deeper loop nest.  This hack should be removed once distributed
    224        memset can be analyzed in data reference analysis.  See PR82604.  */
    225     PKIND_PARTIAL_MEMSET,
    226     PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
    227 };
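
        /* For illustration, a rough sketch of the PKIND_PARTIAL_MEMSET case:
           zero-ing a[i][j] over a nest could only become a loop of per-row
           memset calls, not a single memset, so the zero-ing partition is
           kept as a separate, deeply distributed loop nest instead:

           |for (i = 0; i < n; ++i)         |for (i = 0; i < n; ++i)
           |  for (j = 0; j < m; ++j)       |  for (j = 0; j < m; ++j)
           |    {                       =>  |    a[i][j] = 0;
           |      a[i][j] = 0;              |for (i = 0; i < n; ++i)
           |      b[i][j] = c[i][j];        |  for (j = 0; j < m; ++j)
           |    }                           |    b[i][j] = c[i][j];

           Keeping the ZERO-ing stmts in their own nest lets parloops
           parallelize the outer loop of each distributed nest.  */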
    228 
    229 /* Type of distributed loop.  */
    230 enum partition_type {
    231     /* The distributed loop can be executed in parallel.  */
    232     PTYPE_PARALLEL = 0,
    233     /* The distributed loop has to be executed sequentially.  */
    234     PTYPE_SEQUENTIAL
    235 };
    236 
    237 /* Builtin info for loop distribution.  */
    238 struct builtin_info
    239 {
    240   /* The data references a kind != PKIND_NORMAL partition is about.  */
    241   data_reference_p dst_dr;
    242   data_reference_p src_dr;
    243   /* Base address and size of memory objects operated by the builtin.  Note
    244      both dest and source memory objects must have the same size.  */
    245   tree dst_base;
    246   tree src_base;
    247   tree size;
    248   /* Base and offset part of dst_base after stripping constant offset.  This
    249      is only used in memset builtin distribution for now.  */
    250   tree dst_base_base;
    251   unsigned HOST_WIDE_INT dst_base_offset;
    252 };
    253 
    254 /* Partition for loop distribution.  */
    255 struct partition
    256 {
    257   /* Statements of the partition.  */
    258   bitmap stmts;
    259   /* True if the partition defines a variable used outside of the loop.  */
    260   bool reduction_p;
    261   location_t loc;
    262   enum partition_kind kind;
    263   enum partition_type type;
    264   /* Data references in the partition.  */
    265   bitmap datarefs;
    266   /* Information for a builtin partition.  */
    267   struct builtin_info *builtin;
    268 };
    269 
    270 /* Partitions are fused for different reasons.  */
    271 enum fuse_type
    272 {
    273   FUSE_NON_BUILTIN = 0,
    274   FUSE_REDUCTION = 1,
    275   FUSE_SHARE_REF = 2,
    276   FUSE_SAME_SCC = 3,
    277   FUSE_FINALIZE = 4
    278 };
    279 
    280 /* Descriptions of the different fusing reasons.  */
    281 static const char *fuse_message[] = {
    282   "they are non-builtins",
    283   "they have reductions",
    284   "they have shared memory refs",
    285   "they are in the same dependence scc",
    286   "there is no point in distributing the loop"};
    287 
    288 
    289 /* Dump vertex I in RDG to FILE.  */
    290 
    291 static void
    292 dump_rdg_vertex (FILE *file, struct graph *rdg, int i)
    293 {
    294   struct vertex *v = &(rdg->vertices[i]);
    295   struct graph_edge *e;
    296 
    297   fprintf (file, "(vertex %d: (%s%s) (in:", i,
    298 	   RDG_MEM_WRITE_STMT (rdg, i) ? "w" : "",
    299 	   RDG_MEM_READS_STMT (rdg, i) ? "r" : "");
    300 
    301   if (v->pred)
    302     for (e = v->pred; e; e = e->pred_next)
    303       fprintf (file, " %d", e->src);
    304 
    305   fprintf (file, ") (out:");
    306 
    307   if (v->succ)
    308     for (e = v->succ; e; e = e->succ_next)
    309       fprintf (file, " %d", e->dest);
    310 
    311   fprintf (file, ")\n");
    312   print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS);
    313   fprintf (file, ")\n");
    314 }
    315 
    316 /* Call dump_rdg_vertex on stderr.  */
    317 
    318 DEBUG_FUNCTION void
    319 debug_rdg_vertex (struct graph *rdg, int i)
    320 {
    321   dump_rdg_vertex (stderr, rdg, i);
    322 }
    323 
    324 /* Dump the reduced dependence graph RDG to FILE.  */
    325 
    326 static void
    327 dump_rdg (FILE *file, struct graph *rdg)
    328 {
    329   fprintf (file, "(rdg\n");
    330   for (int i = 0; i < rdg->n_vertices; i++)
    331     dump_rdg_vertex (file, rdg, i);
    332   fprintf (file, ")\n");
    333 }
    334 
    335 /* Call dump_rdg on stderr.  */
    336 
    337 DEBUG_FUNCTION void
    338 debug_rdg (struct graph *rdg)
    339 {
    340   dump_rdg (stderr, rdg);
    341 }
    342 
    343 static void
    344 dot_rdg_1 (FILE *file, struct graph *rdg)
    345 {
    346   int i;
    347   pretty_printer buffer;
    348   pp_needs_newline (&buffer) = false;
    349   buffer.buffer->stream = file;
    350 
    351   fprintf (file, "digraph RDG {\n");
    352 
    353   for (i = 0; i < rdg->n_vertices; i++)
    354     {
    355       struct vertex *v = &(rdg->vertices[i]);
    356       struct graph_edge *e;
    357 
    358       fprintf (file, "%d [label=\"[%d] ", i, i);
    359       pp_gimple_stmt_1 (&buffer, RDGV_STMT (v), 0, TDF_SLIM);
    360       pp_flush (&buffer);
    361       fprintf (file, "\"]\n");
    362 
    363       /* Highlight reads from memory.  */
    364       if (RDG_MEM_READS_STMT (rdg, i))
    365        fprintf (file, "%d [style=filled, fillcolor=green]\n", i);
    366 
    367       /* Highlight stores to memory.  */
    368       if (RDG_MEM_WRITE_STMT (rdg, i))
    369        fprintf (file, "%d [style=filled, fillcolor=red]\n", i);
    370 
    371       if (v->succ)
    372        for (e = v->succ; e; e = e->succ_next)
    373          switch (RDGE_TYPE (e))
    374            {
    375            case flow_dd:
    376              /* These are the most common dependences: don't label them. */
    377              fprintf (file, "%d -> %d \n", i, e->dest);
    378              break;
    379 
    380 	   case control_dd:
    381              fprintf (file, "%d -> %d [label=control] \n", i, e->dest);
    382              break;
    383 
    384            default:
    385              gcc_unreachable ();
    386            }
    387     }
    388 
    389   fprintf (file, "}\n\n");
    390 }
    391 
    392 /* Display the Reduced Dependence Graph using dotty.  */
    393 
    394 DEBUG_FUNCTION void
    395 dot_rdg (struct graph *rdg)
    396 {
    397   /* When debugging, you may want to enable the following code.  */
    398 #ifdef HAVE_POPEN
    399   FILE *file = popen ("dot -Tx11", "w");
    400   if (!file)
    401     return;
    402   dot_rdg_1 (file, rdg);
    403   fflush (file);
    404   close (fileno (file));
    405   pclose (file);
    406 #else
    407   dot_rdg_1 (stderr, rdg);
    408 #endif
    409 }
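
        /* For example, from a debugger session one might view the RDG with

           |(gdb) call dot_rdg (rdg)

           which pops up an X11 window when "dot" is available and HAVE_POPEN
           is defined; otherwise the graph is dumped to stderr.  */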
    410 
    411 /* Returns the index of STMT in RDG.  */
    412 
    413 static int
    414 rdg_vertex_for_stmt (struct graph *rdg ATTRIBUTE_UNUSED, gimple *stmt)
    415 {
    416   int index = gimple_uid (stmt);
    417   gcc_checking_assert (index == -1 || RDG_STMT (rdg, index) == stmt);
    418   return index;
    419 }
    420 
    421 /* Creates dependence edges in RDG for all the uses of DEF.  IDEF is
    422    the index of DEF in RDG.  */
    423 
    424 static void
    425 create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef)
    426 {
    427   use_operand_p imm_use_p;
    428   imm_use_iterator iterator;
    429 
    430   FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def)
    431     {
    432       struct graph_edge *e;
    433       int use = rdg_vertex_for_stmt (rdg, USE_STMT (imm_use_p));
    434 
    435       if (use < 0)
    436 	continue;
    437 
    438       e = add_edge (rdg, idef, use);
    439       e->data = XNEW (struct rdg_edge);
    440       RDGE_TYPE (e) = flow_dd;
    441     }
    442 }
    443 
    444 /* Creates edges for the control dependences of BB to the vertex V.  */
    445 
    446 static void
    447 create_edge_for_control_dependence (struct graph *rdg, basic_block bb,
    448 				    int v, control_dependences *cd)
    449 {
    450   bitmap_iterator bi;
    451   unsigned edge_n;
    452   EXECUTE_IF_SET_IN_BITMAP (cd->get_edges_dependent_on (bb->index),
    453 			    0, edge_n, bi)
    454     {
    455       basic_block cond_bb = cd->get_edge_src (edge_n);
    456       gimple *stmt = last_stmt (cond_bb);
    457       if (stmt && is_ctrl_stmt (stmt))
    458 	{
    459 	  struct graph_edge *e;
    460 	  int c = rdg_vertex_for_stmt (rdg, stmt);
    461 	  if (c < 0)
    462 	    continue;
    463 
    464 	  e = add_edge (rdg, c, v);
    465 	  e->data = XNEW (struct rdg_edge);
    466 	  RDGE_TYPE (e) = control_dd;
    467 	}
    468     }
    469 }
    470 
    471 /* Creates the flow dependence edges of the reduced dependence graph RDG.  */
    472 
    473 static void
    474 create_rdg_flow_edges (struct graph *rdg)
    475 {
    476   int i;
    477   def_operand_p def_p;
    478   ssa_op_iter iter;
    479 
    480   for (i = 0; i < rdg->n_vertices; i++)
    481     FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i),
    482 			      iter, SSA_OP_DEF)
    483       create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i);
    484 }
    485 
    486 /* Creates the control dependence edges of the RDG for LOOP from CD.  */
    487 
    488 static void
    489 create_rdg_cd_edges (struct graph *rdg, control_dependences *cd, loop_p loop)
    490 {
    491   int i;
    492 
    493   for (i = 0; i < rdg->n_vertices; i++)
    494     {
    495       gimple *stmt = RDG_STMT (rdg, i);
    496       if (gimple_code (stmt) == GIMPLE_PHI)
    497 	{
    498 	  edge_iterator ei;
    499 	  edge e;
    500 	  FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->preds)
    501 	    if (flow_bb_inside_loop_p (loop, e->src))
    502 	      create_edge_for_control_dependence (rdg, e->src, i, cd);
    503 	}
    504       else
    505 	create_edge_for_control_dependence (rdg, gimple_bb (stmt), i, cd);
    506     }
    507 }
    508 
    509 
    510 class loop_distribution
    511 {
    512   private:
    513   /* The loop (nest) to be distributed.  */
    514   vec<loop_p> loop_nest;
    515 
    516   /* Vector of data references in the loop to be distributed.  */
    517   vec<data_reference_p> datarefs_vec;
    518 
    519   /* True if there is a nonaddressable data reference in the above vector.  */
    520   bool has_nonaddressable_dataref_p;
    521 
    522   /* Store index of data reference in aux field.  */
    523 
    524   /* Hash table for data dependence relation in the loop to be distributed.  */
    525   hash_table<ddr_hasher> *ddrs_table;
    526 
    527   /* Array mapping basic block's index to its topological order.  */
    528   int *bb_top_order_index;
    529   /* And size of the array.  */
    530   int bb_top_order_index_size;
    531 
    532   /* Build the vertices of the reduced dependence graph RDG.  Return false
    533      if that failed.  */
    534   bool create_rdg_vertices (struct graph *rdg, const vec<gimple *> &stmts,
    535 			    loop_p loop);
    536 
    537   /* Initialize STMTS with all the statements of LOOP.  We use topological
    538      order to discover all statements.  The order is important because
    539      generate_loops_for_partition is using the same traversal for identifying
    540      statements in loop copies.  */
    541   void stmts_from_loop (class loop *loop, vec<gimple *> *stmts);
    542 
    543 
    544   /* Build the Reduced Dependence Graph (RDG) with one vertex per statement of
    545      LOOP, and one edge per flow dependence or control dependence from control
    546      dependence CD.  During visiting each statement, data references are also
    547      collected and recorded in global data DATAREFS_VEC.  */
    548   struct graph * build_rdg (class loop *loop, control_dependences *cd);
    549 
    550 /* Merge PARTITION into the partition DEST.  RDG is the reduced dependence
    551    graph; if it is non-NULL we update the type of the result partition.  */
    552   void partition_merge_into (struct graph *rdg,
    553 			     partition *dest, partition *partition,
    554 			     enum fuse_type ft);
    555 
    556 
    557   /* Return the data dependence relation for data references A and B.  The
    558      two data references must be in lexicographic order w.r.t. the reduced
    559      dependence graph RDG.  We first try to find the ddr in the global ddr
    560      hash table.  If it doesn't exist, compute the ddr and cache it.  */
    561   data_dependence_relation * get_data_dependence (struct graph *rdg,
    562 						  data_reference_p a,
    563 						  data_reference_p b);
    564 
    565 
    566   /* In the reduced dependence graph RDG for loop distribution, return true
    567      if the dependence between references DR1 and DR2 leads to a dependence
    568      cycle and such a cycle cannot be resolved by a runtime alias check.  */
    569   bool data_dep_in_cycle_p (struct graph *rdg, data_reference_p dr1,
    570 			    data_reference_p dr2);
    571 
    572 
    573   /* Given reduced dependence graph RDG, PARTITION1 and PARTITION2, update
    574      PARTITION1's type after merging PARTITION2 into PARTITION1.  */
    575   void update_type_for_merge (struct graph *rdg,
    576 			      partition *partition1, partition *partition2);
    577 
    578 
    579   /* Returns a partition with all the statements needed for computing
    580      the vertex V of the RDG, also including the loop exit conditions.  */
    581   partition *build_rdg_partition_for_vertex (struct graph *rdg, int v);
    582 
    583   /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG,
    584      classify whether they form a builtin memcpy or memmove call.  */
    585   void classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition,
    586 			      data_reference_p dst_dr, data_reference_p src_dr);
    587 
    588   /* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP.
    589      For the moment we detect memset, memcpy and memmove patterns.  Bitmap
    590      STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions.
    591      Returns true if there is a reduction in all partitions and we
    592      possibly did not mark PARTITION as having one for this reason.  */
    593 
    594   bool
    595   classify_partition (loop_p loop,
    596 		      struct graph *rdg, partition *partition,
    597 		      bitmap stmt_in_all_partitions);
    598 
    599 
    600   /* Returns true when PARTITION1 and PARTITION2 access the same memory
    601      object in RDG.  */
    602   bool share_memory_accesses (struct graph *rdg,
    603 			      partition *partition1, partition *partition2);
    604 
    605   /* For each seed statement in STARTING_STMTS, this function builds a
    606      partition for it by adding dependent statements according to RDG.
    607      All partitions are recorded in PARTITIONS.  */
    608   void rdg_build_partitions (struct graph *rdg,
    609 			     vec<gimple *> starting_stmts,
    610 			     vec<partition *> *partitions);
    611 
    612   /* Compute the partition dependence created by the data references in
    613      DRS1 and DRS2, and modify and return DIR according to that.  If
    614      ALIAS_DDRS is not NULL, we record dependences introduced by possible
    615      aliasing between two data references in ALIAS_DDRS; otherwise, we
    616      simply ignore such dependences as if they didn't exist at all.  */
    617   int pg_add_dependence_edges (struct graph *rdg, int dir, bitmap drs1,
    618 			       bitmap drs2, vec<ddr_p> *alias_ddrs);
    619 
    620 
    621   /* Build and return the partition dependence graph for PARTITIONS.  RDG
    622      is the reduced dependence graph for the loop to be distributed.  If
    623      IGNORE_ALIAS_P is true, data dependences caused by possible aliasing
    624      between references are ignored, as if they didn't exist at all;
    625      otherwise all dependences are considered.  */
    626   struct graph *build_partition_graph (struct graph *rdg,
    627 				       vec<struct partition *> *partitions,
    628 				       bool ignore_alias_p);
    629 
    630   /* Given the reduced dependence graph RDG, merge strongly connected
    631      components of PARTITIONS.  If IGNORE_ALIAS_P is true, data dependences
    632      caused by possible aliasing between references are ignored, as if they
    633      didn't exist at all; otherwise all dependences are considered.  */
    634   void merge_dep_scc_partitions (struct graph *rdg, vec<struct partition *>
    635 				 *partitions, bool ignore_alias_p);
    636 
    637 /* This is the main function breaking strongly connected components in
    638    PARTITIONS, given the reduced dependence graph RDG.  Store data
    639    dependence relations for the runtime alias check in ALIAS_DDRS.  */
    640   void break_alias_scc_partitions (struct graph *rdg, vec<struct partition *>
    641 				   *partitions, vec<ddr_p> *alias_ddrs);
    642 
    643 
    644   /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
    645      ALIAS_DDRS contains ddrs which need runtime alias check.  */
    646   void finalize_partitions (class loop *loop, vec<struct partition *>
    647 			    *partitions, vec<ddr_p> *alias_ddrs);
    648 
    649   /* Distributes the code from LOOP in such a way that producer statements
    650      are placed before consumer statements.  Tries to separate only the
    651      statements from STMTS into separate loops.  Returns the number of
    652      distributed loops.  Set NB_CALLS to number of generated builtin calls.
    653      Set *DESTROY_P to whether LOOP needs to be destroyed.  */
    654   int distribute_loop (class loop *loop, const vec<gimple *> &stmts,
    655 		       control_dependences *cd, int *nb_calls, bool *destroy_p,
    656 		       bool only_patterns_p);
    657 
    658   /* Transform loops which mimic the effects of builtins rawmemchr or strlen and
    659      replace them accordingly.  */
    660   bool transform_reduction_loop (loop_p loop);
    661 
    662   /* Compute topological order for basic blocks.  Topological order is
    663      needed because data dependence is computed for data references in
    664      lexicographical order.  */
    665   void bb_top_order_init (void);
    666 
    667   void bb_top_order_destroy (void);
    668 
    669   public:
    670 
    671   /* Getter for bb_top_order.  */
    672 
    673   inline int get_bb_top_order_index_size (void)
    674     {
    675       return bb_top_order_index_size;
    676     }
    677 
    678   inline int get_bb_top_order_index (int i)
    679     {
    680       return bb_top_order_index[i];
    681     }
    682 
    683   unsigned int execute (function *fun);
    684 };
    685 
    686 
    687 /* If X has a smaller topological sort number than Y, returns -1;
    688    if greater, returns 1.  */
    689 static int
    690 bb_top_order_cmp_r (const void *x, const void *y, void *loop)
    691 {
    692   loop_distribution *_loop =
    693     (loop_distribution *) loop;
    694 
    695   basic_block bb1 = *(const basic_block *) x;
    696   basic_block bb2 = *(const basic_block *) y;
    697 
    698   int bb_top_order_index_size = _loop->get_bb_top_order_index_size ();
    699 
    700   gcc_assert (bb1->index < bb_top_order_index_size
    701 	      && bb2->index < bb_top_order_index_size);
    702   gcc_assert (bb1 == bb2
    703 	      || _loop->get_bb_top_order_index(bb1->index)
    704 		 != _loop->get_bb_top_order_index(bb2->index));
    705 
    706   return (_loop->get_bb_top_order_index(bb1->index) -
    707 	  _loop->get_bb_top_order_index(bb2->index));
    708 }
    709 
    710 bool
    711 loop_distribution::create_rdg_vertices (struct graph *rdg,
    712 					const vec<gimple *> &stmts,
    713 					loop_p loop)
    714 {
    715   int i;
    716   gimple *stmt;
    717 
    718   FOR_EACH_VEC_ELT (stmts, i, stmt)
    719     {
    720       struct vertex *v = &(rdg->vertices[i]);
    721 
    722       /* Record statement to vertex mapping.  */
    723       gimple_set_uid (stmt, i);
    724 
    725       v->data = XNEW (struct rdg_vertex);
    726       RDGV_STMT (v) = stmt;
    727       RDGV_DATAREFS (v).create (0);
    728       RDGV_HAS_MEM_WRITE (v) = false;
    729       RDGV_HAS_MEM_READS (v) = false;
    730       if (gimple_code (stmt) == GIMPLE_PHI)
    731 	continue;
    732 
    733       unsigned drp = datarefs_vec.length ();
    734       if (!find_data_references_in_stmt (loop, stmt, &datarefs_vec))
    735 	return false;
    736       for (unsigned j = drp; j < datarefs_vec.length (); ++j)
    737 	{
    738 	  data_reference_p dr = datarefs_vec[j];
    739 	  if (DR_IS_READ (dr))
    740 	    RDGV_HAS_MEM_READS (v) = true;
    741 	  else
    742 	    RDGV_HAS_MEM_WRITE (v) = true;
    743 	  RDGV_DATAREFS (v).safe_push (dr);
    744 	  has_nonaddressable_dataref_p |= may_be_nonaddressable_p (dr->ref);
    745 	}
    746     }
    747   return true;
    748 }
    749 
    750 void
    751 loop_distribution::stmts_from_loop (class loop *loop, vec<gimple *> *stmts)
    752 {
    753   unsigned int i;
    754   basic_block *bbs = get_loop_body_in_custom_order (loop, this, bb_top_order_cmp_r);
    755 
    756   for (i = 0; i < loop->num_nodes; i++)
    757     {
    758       basic_block bb = bbs[i];
    759 
    760       for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
    761 	   gsi_next (&bsi))
    762 	if (!virtual_operand_p (gimple_phi_result (bsi.phi ())))
    763 	  stmts->safe_push (bsi.phi ());
    764 
    765       for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
    766 	   gsi_next (&bsi))
    767 	{
    768 	  gimple *stmt = gsi_stmt (bsi);
    769 	  if (gimple_code (stmt) != GIMPLE_LABEL && !is_gimple_debug (stmt))
    770 	    stmts->safe_push (stmt);
    771 	}
    772     }
    773 
    774   free (bbs);
    775 }
    776 
    777 /* Free the reduced dependence graph RDG.  */
    778 
    779 static void
    780 free_rdg (struct graph *rdg)
    781 {
    782   int i;
    783 
    784   for (i = 0; i < rdg->n_vertices; i++)
    785     {
    786       struct vertex *v = &(rdg->vertices[i]);
    787       struct graph_edge *e;
    788 
    789       for (e = v->succ; e; e = e->succ_next)
    790 	free (e->data);
    791 
    792       if (v->data)
    793 	{
    794 	  gimple_set_uid (RDGV_STMT (v), -1);
    795 	  (RDGV_DATAREFS (v)).release ();
    796 	  free (v->data);
    797 	}
    798     }
    799 
    800   free_graph (rdg);
    801 }
    802 
    803 struct graph *
    804 loop_distribution::build_rdg (class loop *loop, control_dependences *cd)
    805 {
    806   struct graph *rdg;
    807 
    808   /* Create the RDG vertices from the stmts of the loop nest.  */
    809   auto_vec<gimple *, 10> stmts;
    810   stmts_from_loop (loop, &stmts);
    811   rdg = new_graph (stmts.length ());
    812   if (!create_rdg_vertices (rdg, stmts, loop))
    813     {
    814       free_rdg (rdg);
    815       return NULL;
    816     }
    817   stmts.release ();
    818 
    819   create_rdg_flow_edges (rdg);
    820   if (cd)
    821     create_rdg_cd_edges (rdg, cd, loop);
    822 
    823   return rdg;
    824 }
    825 
    826 
    827 /* Allocate and initialize an empty partition.  */
    828 
    829 static partition *
    830 partition_alloc (void)
    831 {
    832   partition *partition = XCNEW (struct partition);
    833   partition->stmts = BITMAP_ALLOC (NULL);
    834   partition->reduction_p = false;
    835   partition->loc = UNKNOWN_LOCATION;
    836   partition->kind = PKIND_NORMAL;
    837   partition->type = PTYPE_PARALLEL;
    838   partition->datarefs = BITMAP_ALLOC (NULL);
    839   return partition;
    840 }
    841 
    842 /* Free PARTITION.  */
    843 
    844 static void
    845 partition_free (partition *partition)
    846 {
    847   BITMAP_FREE (partition->stmts);
    848   BITMAP_FREE (partition->datarefs);
    849   if (partition->builtin)
    850     free (partition->builtin);
    851 
    852   free (partition);
    853 }
    854 
    855 /* Returns true if the partition can be generated as a builtin.  */
    856 
    857 static bool
    858 partition_builtin_p (partition *partition)
    859 {
    860   return partition->kind > PKIND_PARTIAL_MEMSET;
    861 }
    862 
    863 /* Returns true if the partition contains a reduction.  */
    864 
    865 static bool
    866 partition_reduction_p (partition *partition)
    867 {
    868   return partition->reduction_p;
    869 }
    870 
    871 void
    872 loop_distribution::partition_merge_into (struct graph *rdg,
    873 		      partition *dest, partition *partition, enum fuse_type ft)
    874 {
    875   if (dump_file && (dump_flags & TDF_DETAILS))
    876     {
    877       fprintf (dump_file, "Fuse partitions because %s:\n", fuse_message[ft]);
    878       fprintf (dump_file, "  Part 1: ");
    879       dump_bitmap (dump_file, dest->stmts);
    880       fprintf (dump_file, "  Part 2: ");
    881       dump_bitmap (dump_file, partition->stmts);
    882     }
    883 
    884   dest->kind = PKIND_NORMAL;
    885   if (dest->type == PTYPE_PARALLEL)
    886     dest->type = partition->type;
    887 
    888   bitmap_ior_into (dest->stmts, partition->stmts);
    889   if (partition_reduction_p (partition))
    890     dest->reduction_p = true;
    891 
    892   /* Further check if any data dependence prevents us from executing the
    893      new partition in parallel.  */
    894   if (dest->type == PTYPE_PARALLEL && rdg != NULL)
    895     update_type_for_merge (rdg, dest, partition);
    896 
    897   bitmap_ior_into (dest->datarefs, partition->datarefs);
    898 }
    899 
    900 
    901 /* Returns true when DEF is an SSA_NAME defined in LOOP and used after
    902    the LOOP.  */
    903 
    904 static bool
    905 ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
    906 {
    907   imm_use_iterator imm_iter;
    908   use_operand_p use_p;
    909 
    910   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
    911     {
    912       if (is_gimple_debug (USE_STMT (use_p)))
    913 	continue;
    914 
    915       basic_block use_bb = gimple_bb (USE_STMT (use_p));
    916       if (!flow_bb_inside_loop_p (loop, use_bb))
    917 	return true;
    918     }
    919 
    920   return false;
    921 }
    922 
    923 /* Returns true when STMT defines a scalar variable used after the
    924    loop LOOP.  */
    925 
    926 static bool
    927 stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt)
    928 {
    929   def_operand_p def_p;
    930   ssa_op_iter op_iter;
    931 
    932   if (gimple_code (stmt) == GIMPLE_PHI)
    933     return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt), loop);
    934 
    935   FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
    936     if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p), loop))
    937       return true;
    938 
    939   return false;
    940 }
    941 
    942 /* Return a copy of LOOP placed before LOOP.  */
    943 
    944 static class loop *
    945 copy_loop_before (class loop *loop)
    946 {
    947   class loop *res;
    948   edge preheader = loop_preheader_edge (loop);
    949 
    950   initialize_original_copy_tables ();
    951   res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader);
    952   gcc_assert (res != NULL);
    953   free_original_copy_tables ();
    954   delete_update_ssa ();
    955 
    956   return res;
    957 }
    958 
    959 /* Creates an empty basic block after LOOP.  */
    960 
    961 static void
    962 create_bb_after_loop (class loop *loop)
    963 {
    964   edge exit = single_exit (loop);
    965 
    966   if (!exit)
    967     return;
    968 
    969   split_edge (exit);
    970 }
    971 
    972 /* Generate code for PARTITION from the code in LOOP.  The loop is
    973    copied when COPY_P is true.  All the statements not flagged in the
    974    PARTITION bitmap are removed from the loop or from its copy.  The
    975    statements are indexed in sequence inside a basic block, and the
    976    basic blocks of a loop are taken in dom order.  */
    977 
    978 static void
    979 generate_loops_for_partition (class loop *loop, partition *partition,
    980 			      bool copy_p)
    981 {
    982   unsigned i;
    983   basic_block *bbs;
    984 
    985   if (copy_p)
    986     {
    987       int orig_loop_num = loop->orig_loop_num;
    988       loop = copy_loop_before (loop);
    989       gcc_assert (loop != NULL);
    990       loop->orig_loop_num = orig_loop_num;
    991       create_preheader (loop, CP_SIMPLE_PREHEADERS);
    992       create_bb_after_loop (loop);
    993     }
    994   else
    995     {
    996       /* Origin number is set to the new versioned loop's num.  */
    997       gcc_assert (loop->orig_loop_num != loop->num);
    998     }
    999 
   1000   /* Remove stmts not in the PARTITION bitmap.  */
   1001   bbs = get_loop_body_in_dom_order (loop);
   1002 
   1003   if (MAY_HAVE_DEBUG_BIND_STMTS)
   1004     for (i = 0; i < loop->num_nodes; i++)
   1005       {
   1006 	basic_block bb = bbs[i];
   1007 
   1008 	for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
   1009 	     gsi_next (&bsi))
   1010 	  {
   1011 	    gphi *phi = bsi.phi ();
   1012 	    if (!virtual_operand_p (gimple_phi_result (phi))
   1013 		&& !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
   1014 	      reset_debug_uses (phi);
   1015 	  }
   1016 
   1017 	for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
   1018 	  {
   1019 	    gimple *stmt = gsi_stmt (bsi);
   1020 	    if (gimple_code (stmt) != GIMPLE_LABEL
   1021 		&& !is_gimple_debug (stmt)
   1022 		&& !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
   1023 	      reset_debug_uses (stmt);
   1024 	  }
   1025       }
   1026 
   1027   for (i = 0; i < loop->num_nodes; i++)
   1028     {
   1029       basic_block bb = bbs[i];
   1030       edge inner_exit = NULL;
   1031 
   1032       if (loop != bb->loop_father)
   1033 	inner_exit = single_exit (bb->loop_father);
   1034 
   1035       for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
   1036 	{
   1037 	  gphi *phi = bsi.phi ();
   1038 	  if (!virtual_operand_p (gimple_phi_result (phi))
   1039 	      && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
   1040 	    remove_phi_node (&bsi, true);
   1041 	  else
   1042 	    gsi_next (&bsi);
   1043 	}
   1044 
   1045       for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
   1046 	{
   1047 	  gimple *stmt = gsi_stmt (bsi);
   1048 	  if (gimple_code (stmt) != GIMPLE_LABEL
   1049 	      && !is_gimple_debug (stmt)
   1050 	      && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
   1051 	    {
   1052 	      /* When distributing a loop nest, if bb is the inner loop's
   1053 		 exit_bb, we choose its exit edge/path in order to avoid
   1054 		 generating an infinite loop.  In all other cases, we choose
   1055 		 an arbitrary path through the empty CFG part that this
   1056 		 unnecessary control stmt controls.  */
   1057 	      if (gcond *cond_stmt = dyn_cast <gcond *> (stmt))
   1058 		{
   1059 		  if (inner_exit && inner_exit->flags & EDGE_TRUE_VALUE)
   1060 		    gimple_cond_make_true (cond_stmt);
   1061 		  else
   1062 		    gimple_cond_make_false (cond_stmt);
   1063 		  update_stmt (stmt);
   1064 		}
   1065 	      else if (gimple_code (stmt) == GIMPLE_SWITCH)
   1066 		{
   1067 		  gswitch *switch_stmt = as_a <gswitch *> (stmt);
   1068 		  gimple_switch_set_index
   1069 		      (switch_stmt, CASE_LOW (gimple_switch_label (switch_stmt, 1)));
   1070 		  update_stmt (stmt);
   1071 		}
   1072 	      else
   1073 		{
   1074 		  unlink_stmt_vdef (stmt);
   1075 		  gsi_remove (&bsi, true);
   1076 		  release_defs (stmt);
   1077 		  continue;
   1078 		}
   1079 	    }
   1080 	  gsi_next (&bsi);
   1081 	}
   1082     }
   1083 
   1084   free (bbs);
   1085 }
   1086 
   1087 /* If VAL's memory representation contains the same value in all bytes,
   1088    return that value, otherwise return -1.
   1089    E.g. for 0x24242424 return 0x24, for IEEE double
   1090    747708026454360457216.0 return 0x44, etc.  */
   1091 
   1092 static int
   1093 const_with_all_bytes_same (tree val)
   1094 {
   1095   unsigned char buf[64];
   1096   int i, len;
   1097 
   1098   if (integer_zerop (val)
   1099       || (TREE_CODE (val) == CONSTRUCTOR
   1100           && !TREE_CLOBBER_P (val)
   1101           && CONSTRUCTOR_NELTS (val) == 0))
   1102     return 0;
   1103 
   1104   if (real_zerop (val))
   1105     {
   1106       /* Only return 0 for +0.0, not for -0.0, which doesn't have
   1107 	 an all bytes same memory representation.  Don't transform
   1108 	 -0.0 stores into +0.0 even for !HONOR_SIGNED_ZEROS.  */
   1109       switch (TREE_CODE (val))
   1110 	{
   1111 	case REAL_CST:
   1112 	  if (!real_isneg (TREE_REAL_CST_PTR (val)))
   1113 	    return 0;
   1114 	  break;
   1115 	case COMPLEX_CST:
   1116 	  if (!const_with_all_bytes_same (TREE_REALPART (val))
   1117 	      && !const_with_all_bytes_same (TREE_IMAGPART (val)))
   1118 	    return 0;
   1119 	  break;
   1120 	case VECTOR_CST:
   1121 	  {
   1122 	    unsigned int count = vector_cst_encoded_nelts (val);
   1123 	    unsigned int j;
   1124 	    for (j = 0; j < count; ++j)
   1125 	      if (const_with_all_bytes_same (VECTOR_CST_ENCODED_ELT (val, j)))
   1126 		break;
   1127 	    if (j == count)
   1128 	      return 0;
   1129 	    break;
   1130 	  }
   1131 	default:
   1132 	  break;
   1133 	}
   1134     }
   1135 
   1136   if (CHAR_BIT != 8 || BITS_PER_UNIT != 8)
   1137     return -1;
   1138 
   1139   len = native_encode_expr (val, buf, sizeof (buf));
   1140   if (len == 0)
   1141     return -1;
   1142   for (i = 1; i < len; i++)
   1143     if (buf[i] != buf[0])
   1144       return -1;
   1145   return buf[0];
   1146 }
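
        /* For illustration, a sketch of how the result is used by
           generate_memset_builtin below: given a store loop like

           |int a[N];
           |for (int i = 0; i < N; ++i)
           |  a[i] = 0x24242424;

           const_with_all_bytes_same returns 0x24, so the loop can become
           memset (a, 0x24, N * sizeof (int)).  A return value of -1 means
           there is no single fill byte and no memset value can be derived.  */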
   1147 
   1148 /* Generate a call to memset for PARTITION in LOOP.  */
   1149 
   1150 static void
   1151 generate_memset_builtin (class loop *loop, partition *partition)
   1152 {
   1153   gimple_stmt_iterator gsi;
   1154   tree mem, fn, nb_bytes;
   1155   tree val;
   1156   struct builtin_info *builtin = partition->builtin;
   1157   gimple *fn_call;
   1158 
   1159   /* The new statements will be placed before LOOP.  */
   1160   gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
   1161 
   1162   nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
   1163   nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
   1164 				       false, GSI_CONTINUE_LINKING);
   1165   mem = rewrite_to_non_trapping_overflow (builtin->dst_base);
   1166   mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE,
   1167 				  false, GSI_CONTINUE_LINKING);
   1168 
   1169   /* This exactly matches the pattern recognition in classify_partition.  */
   1170   val = gimple_assign_rhs1 (DR_STMT (builtin->dst_dr));
   1171   /* Handle constants like 0x15151515 and similarly
   1172      floating point constants etc. where all bytes are the same.  */
   1173   int bytev = const_with_all_bytes_same (val);
   1174   if (bytev != -1)
   1175     val = build_int_cst (integer_type_node, bytev);
   1176   else if (TREE_CODE (val) == INTEGER_CST)
   1177     val = fold_convert (integer_type_node, val);
   1178   else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
   1179     {
   1180       tree tem = make_ssa_name (integer_type_node);
   1181       gimple *cstmt = gimple_build_assign (tem, NOP_EXPR, val);
   1182       gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING);
   1183       val = tem;
   1184     }
   1185 
   1186   fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET));
   1187   fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes);
   1188   gimple_set_location (fn_call, partition->loc);
   1189   gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
   1190   fold_stmt (&gsi);
   1191 
   1192   if (dump_file && (dump_flags & TDF_DETAILS))
   1193     {
   1194       fprintf (dump_file, "generated memset");
   1195       if (bytev == 0)
   1196 	fprintf (dump_file, " zero\n");
   1197       else
   1198 	fprintf (dump_file, "\n");
   1199     }
   1200 }
   1201 
   1202 /* Generate a call to memcpy for PARTITION in LOOP.  */
   1203 
   1204 static void
   1205 generate_memcpy_builtin (class loop *loop, partition *partition)
   1206 {
   1207   gimple_stmt_iterator gsi;
   1208   gimple *fn_call;
   1209   tree dest, src, fn, nb_bytes;
   1210   enum built_in_function kind;
   1211   struct builtin_info *builtin = partition->builtin;
   1212 
   1213   /* The new statements will be placed before LOOP.  */
   1214   gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
   1215 
   1216   nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
   1217   nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
   1218 				       false, GSI_CONTINUE_LINKING);
   1219   dest = rewrite_to_non_trapping_overflow (builtin->dst_base);
   1220   src = rewrite_to_non_trapping_overflow (builtin->src_base);
   1221   if (partition->kind == PKIND_MEMCPY
   1222       || ! ptr_derefs_may_alias_p (dest, src))
   1223     kind = BUILT_IN_MEMCPY;
   1224   else
   1225     kind = BUILT_IN_MEMMOVE;
   1226   /* Try harder if we're copying a constant size.  */
   1227   if (kind == BUILT_IN_MEMMOVE && poly_int_tree_p (nb_bytes))
   1228     {
   1229       aff_tree asrc, adest;
   1230       tree_to_aff_combination (src, ptr_type_node, &asrc);
   1231       tree_to_aff_combination (dest, ptr_type_node, &adest);
   1232       aff_combination_scale (&adest, -1);
   1233       aff_combination_add (&asrc, &adest);
   1234       if (aff_comb_cannot_overlap_p (&asrc, wi::to_poly_widest (nb_bytes),
   1235 				     wi::to_poly_widest (nb_bytes)))
   1236 	kind = BUILT_IN_MEMCPY;
   1237     }
   1238 
   1239   dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE,
   1240 				   false, GSI_CONTINUE_LINKING);
   1241   src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE,
   1242 				  false, GSI_CONTINUE_LINKING);
   1243   fn = build_fold_addr_expr (builtin_decl_implicit (kind));
   1244   fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes);
   1245   gimple_set_location (fn_call, partition->loc);
   1246   gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
   1247   fold_stmt (&gsi);
   1248 
   1249   if (dump_file && (dump_flags & TDF_DETAILS))
   1250     {
   1251       if (kind == BUILT_IN_MEMCPY)
   1252 	fprintf (dump_file, "generated memcpy\n");
   1253       else
   1254 	fprintf (dump_file, "generated memmove\n");
   1255     }
   1256 }
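
        /* For illustration, a rough sketch of the kind selection above:

           |for (i = 0; i < n; ++i)
           |  p[i] = q[i];       <- PKIND_MEMCPY, or pointers that provably
           |                        don't alias: emit memcpy.
           |for (i = 0; i < n; ++i)
           |  a[i] = a[i + k];   <- possible overlap: emit memmove, unless
                                    the size is constant and the affine
                                    combination check proves the accessed
                                    ranges cannot overlap (then memcpy).  */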
   1257 
   1258 /* Remove and destroy the loop LOOP.  */
   1259 
   1260 static void
   1261 destroy_loop (class loop *loop)
   1262 {
   1263   unsigned nbbs = loop->num_nodes;
   1264   edge exit = single_exit (loop);
   1265   basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest;
   1266   basic_block *bbs;
   1267   unsigned i;
   1268 
   1269   bbs = get_loop_body_in_dom_order (loop);
   1270 
   1271   gimple_stmt_iterator dst_gsi = gsi_after_labels (exit->dest);
   1272   bool safe_p = single_pred_p (exit->dest);
   1273   for (unsigned i = 0; i < nbbs; ++i)
   1274     {
   1275       /* We have made sure not to leave any dangling uses of SSA
   1276          names defined in the loop, with the exception of virtuals.
   1277 	 Make sure we replace all uses of virtual defs that will remain
   1278 	 outside of the loop with the bare symbol, as delete_basic_block
   1279 	 will release them.  */
   1280       for (gphi_iterator gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi);
   1281 	   gsi_next (&gsi))
   1282 	{
   1283 	  gphi *phi = gsi.phi ();
   1284 	  if (virtual_operand_p (gimple_phi_result (phi)))
   1285 	    mark_virtual_phi_result_for_renaming (phi);
   1286 	}
   1287       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi);)
   1288 	{
   1289 	  gimple *stmt = gsi_stmt (gsi);
   1290 	  tree vdef = gimple_vdef (stmt);
   1291 	  if (vdef && TREE_CODE (vdef) == SSA_NAME)
   1292 	    mark_virtual_operand_for_renaming (vdef);
   1293 	  /* Also move and eventually reset debug stmts.  We can leave
   1294 	     constant values in place in case the stmt dominates the exit.
   1295 	     ???  Non-constant values from the last iteration can be
   1296 	     replaced with final values if we can compute them.  */
   1297 	  if (gimple_debug_bind_p (stmt))
   1298 	    {
   1299 	      tree val = gimple_debug_bind_get_value (stmt);
   1300 	      gsi_move_before (&gsi, &dst_gsi);
   1301 	      if (val
   1302 		  && (!safe_p
   1303 		      || !is_gimple_min_invariant (val)
   1304 		      || !dominated_by_p (CDI_DOMINATORS, exit->src, bbs[i])))
   1305 		{
   1306 		  gimple_debug_bind_reset_value (stmt);
   1307 		  update_stmt (stmt);
   1308 		}
   1309 	    }
   1310 	  else
   1311 	    gsi_next (&gsi);
   1312 	}
   1313     }
   1314 
   1315   redirect_edge_pred (exit, src);
   1316   exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
   1317   exit->flags |= EDGE_FALLTHRU;
   1318   cancel_loop_tree (loop);
   1319   rescan_loop_exit (exit, false, true);
   1320 
   1321   i = nbbs;
   1322   do
   1323     {
   1324       --i;
   1325       delete_basic_block (bbs[i]);
   1326     }
   1327   while (i != 0);
   1328 
   1329   free (bbs);
   1330 
   1331   set_immediate_dominator (CDI_DOMINATORS, dest,
   1332 			   recompute_dominator (CDI_DOMINATORS, dest));
   1333 }
   1334 
   1335 /* Generates code for PARTITION.  Return whether LOOP needs to be destroyed.  */
   1336 
   1337 static bool
   1338 generate_code_for_partition (class loop *loop,
   1339 			     partition *partition, bool copy_p)
   1340 {
   1341   switch (partition->kind)
   1342     {
   1343     case PKIND_NORMAL:
   1344     case PKIND_PARTIAL_MEMSET:
   1345       /* Reductions all have to be in the last partition.  */
   1346       gcc_assert (!partition_reduction_p (partition)
   1347 		  || !copy_p);
   1348       generate_loops_for_partition (loop, partition, copy_p);
   1349       return false;
   1350 
   1351     case PKIND_MEMSET:
   1352       generate_memset_builtin (loop, partition);
   1353       break;
   1354 
   1355     case PKIND_MEMCPY:
   1356     case PKIND_MEMMOVE:
   1357       generate_memcpy_builtin (loop, partition);
   1358       break;
   1359 
   1360     default:
   1361       gcc_unreachable ();
   1362     }
   1363 
   1364   /* Common tail for partitions we turn into a call.  If this was the last
   1365      partition for which we generate code, we have to destroy the loop.  */
   1366   if (!copy_p)
   1367     return true;
   1368   return false;
   1369 }
   1370 
   1371 data_dependence_relation *
   1372 loop_distribution::get_data_dependence (struct graph *rdg, data_reference_p a,
   1373 					data_reference_p b)
   1374 {
   1375   struct data_dependence_relation ent, **slot;
   1376   struct data_dependence_relation *ddr;
   1377 
   1378   gcc_assert (DR_IS_WRITE (a) || DR_IS_WRITE (b));
   1379   gcc_assert (rdg_vertex_for_stmt (rdg, DR_STMT (a))
   1380 	      <= rdg_vertex_for_stmt (rdg, DR_STMT (b)));
   1381   ent.a = a;
   1382   ent.b = b;
   1383   slot = ddrs_table->find_slot (&ent, INSERT);
   1384   if (*slot == NULL)
   1385     {
   1386       ddr = initialize_data_dependence_relation (a, b, loop_nest);
   1387       compute_affine_dependence (ddr, loop_nest[0]);
   1388       *slot = ddr;
   1389     }
   1390 
   1391   return *slot;
   1392 }
   1393 
   1394 bool
   1395 loop_distribution::data_dep_in_cycle_p (struct graph *rdg,
   1396 					data_reference_p dr1,
   1397 					data_reference_p dr2)
   1398 {
   1399   struct data_dependence_relation *ddr;
   1400 
   1401   /* Re-shuffle data-refs to be in topological order.  */
   1402   if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
   1403       > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
   1404     std::swap (dr1, dr2);
   1405 
   1406   ddr = get_data_dependence (rdg, dr1, dr2);
   1407 
   1408   /* There is no data dependence.  */
   1409   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
   1410     return false;
   1411   /* For an unknown data dependence, or a known data dependence which
   1412      can't be expressed as a classic distance vector, we check if it can
   1413      be resolved by a runtime alias check.  If so, we consider that the
   1414      dependence won't introduce a data dependence cycle.  */
   1415   else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
   1416 	   || DDR_NUM_DIST_VECTS (ddr) == 0)
   1417     return !runtime_alias_check_p (ddr, NULL, true);
   1418   else if (DDR_NUM_DIST_VECTS (ddr) > 1)
   1419     return true;
   1420   else if (DDR_REVERSED_P (ddr)
   1421 	   || lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
   1422     return false;
   1423 
   1424   return true;
   1425 }
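
        /* For illustration, rough examples of the checks above:

           |a[i] = a[i-1] + 1;    <- flow dependence with distance 1: a
                                     cycle, the partition stays sequential.
           |a[i] = a[i] + b[i];   <- zero distance vector: no cycle.
           |a[i] = *p + 1;        <- unknown dependence between a and *p:
                                     no cycle only if a runtime alias check
                                     can resolve it.  */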
   1426 
   1427 void
   1428 loop_distribution::update_type_for_merge (struct graph *rdg,
   1429 					   partition *partition1,
   1430 					   partition *partition2)
   1431 {
   1432   unsigned i, j;
   1433   bitmap_iterator bi, bj;
   1434   data_reference_p dr1, dr2;
   1435 
   1436   EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
   1437     {
   1438       unsigned start = (partition1 == partition2) ? i + 1 : 0;
   1439 
   1440       dr1 = datarefs_vec[i];
   1441       EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, start, j, bj)
   1442 	{
   1443 	  dr2 = datarefs_vec[j];
   1444 	  if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
   1445 	    continue;
   1446 
   1447 	  /* Partition can only be executed sequentially if there is any
   1448 	     data dependence cycle.  */
   1449 	  if (data_dep_in_cycle_p (rdg, dr1, dr2))
   1450 	    {
   1451 	      partition1->type = PTYPE_SEQUENTIAL;
   1452 	      return;
   1453 	    }
   1454 	}
   1455     }
   1456 }
   1457 
   1458 partition *
   1459 loop_distribution::build_rdg_partition_for_vertex (struct graph *rdg, int v)
   1460 {
   1461   partition *partition = partition_alloc ();
   1462   auto_vec<int, 3> nodes;
   1463   unsigned i, j;
   1464   int x;
   1465   data_reference_p dr;
   1466 
   1467   graphds_dfs (rdg, &v, 1, &nodes, false, NULL);
   1468 
   1469   FOR_EACH_VEC_ELT (nodes, i, x)
   1470     {
   1471       bitmap_set_bit (partition->stmts, x);
   1472 
   1473       for (j = 0; RDG_DATAREFS (rdg, x).iterate (j, &dr); ++j)
   1474 	{
   1475 	  unsigned idx = (unsigned) DR_INDEX (dr);
   1476 	  gcc_assert (idx < datarefs_vec.length ());
   1477 
   1478 	  /* Partition can only be executed sequentially if there is any
   1479 	     unknown data reference.  */
   1480 	  if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr)
   1481 	      || !DR_INIT (dr) || !DR_STEP (dr))
   1482 	    partition->type = PTYPE_SEQUENTIAL;
   1483 
   1484 	  bitmap_set_bit (partition->datarefs, idx);
   1485 	}
   1486     }
   1487 
   1488   if (partition->type == PTYPE_SEQUENTIAL)
   1489     return partition;
   1490 
    1491   /* Further check if any data dependence prevents us from executing the
    1492      partition in parallel.  */
   1493   update_type_for_merge (rdg, partition, partition);
   1494 
   1495   return partition;
   1496 }
   1497 
    1498 /* Given PARTITION of LOOP and RDG, record the single load/store data
    1499    references for a builtin partition in SRC_DR/DST_DR; return false if
    1500    there are no such data references.  */
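         /* As an illustration, for a loop whose body is the single copy

              a[i] = b[i];

            the store to a[i] is recorded in *DST_DR and the load from b[i]
            in *SRC_DR.  */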
   1501 
   1502 static bool
   1503 find_single_drs (class loop *loop, struct graph *rdg, const bitmap &partition_stmts,
   1504 		 data_reference_p *dst_dr, data_reference_p *src_dr)
   1505 {
   1506   unsigned i;
   1507   data_reference_p single_ld = NULL, single_st = NULL;
   1508   bitmap_iterator bi;
   1509 
   1510   EXECUTE_IF_SET_IN_BITMAP (partition_stmts, 0, i, bi)
   1511     {
   1512       gimple *stmt = RDG_STMT (rdg, i);
   1513       data_reference_p dr;
   1514 
   1515       if (gimple_code (stmt) == GIMPLE_PHI)
   1516 	continue;
   1517 
   1518       /* Any scalar stmts are ok.  */
   1519       if (!gimple_vuse (stmt))
   1520 	continue;
   1521 
   1522       /* Otherwise just regular loads/stores.  */
   1523       if (!gimple_assign_single_p (stmt))
   1524 	return false;
   1525 
   1526       /* But exactly one store and/or load.  */
   1527       for (unsigned j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j)
   1528 	{
   1529 	  tree type = TREE_TYPE (DR_REF (dr));
   1530 
   1531 	  /* The memset, memcpy and memmove library calls are only
   1532 	     able to deal with generic address space.  */
   1533 	  if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type)))
   1534 	    return false;
   1535 
   1536 	  if (DR_IS_READ (dr))
   1537 	    {
   1538 	      if (single_ld != NULL)
   1539 		return false;
   1540 	      single_ld = dr;
   1541 	    }
   1542 	  else
   1543 	    {
   1544 	      if (single_st != NULL)
   1545 		return false;
   1546 	      single_st = dr;
   1547 	    }
   1548 	}
   1549     }
   1550 
   1551   if (!single_ld && !single_st)
   1552     return false;
   1553 
   1554   basic_block bb_ld = NULL;
   1555   basic_block bb_st = NULL;
   1556 
   1557   if (single_ld)
   1558     {
   1559       /* Bail out if this is a bitfield memory reference.  */
   1560       if (TREE_CODE (DR_REF (single_ld)) == COMPONENT_REF
   1561 	  && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld), 1)))
   1562 	return false;
   1563 
   1564       /* Data reference must be executed exactly once per iteration of each
   1565 	 loop in the loop nest.  We only need to check dominance information
   1566 	 against the outermost one in a perfect loop nest because a bb can't
   1567 	 dominate outermost loop's latch without dominating inner loop's.  */
   1568       bb_ld = gimple_bb (DR_STMT (single_ld));
   1569       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_ld))
   1570 	return false;
   1571     }
   1572 
   1573   if (single_st)
   1574     {
   1575       /* Bail out if this is a bitfield memory reference.  */
   1576       if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF
   1577 	  && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1)))
   1578 	return false;
   1579 
   1580       /* Data reference must be executed exactly once per iteration.
   1581 	 Same as single_ld, we only need to check against the outermost
   1582 	 loop.  */
   1583       bb_st = gimple_bb (DR_STMT (single_st));
   1584       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_st))
   1585 	return false;
   1586     }
   1587 
   1588   if (single_ld && single_st)
   1589     {
   1590       /* Load and store must be in the same loop nest.  */
   1591       if (bb_st->loop_father != bb_ld->loop_father)
   1592 	return false;
   1593 
   1594       edge e = single_exit (bb_st->loop_father);
   1595       bool dom_ld = dominated_by_p (CDI_DOMINATORS, e->src, bb_ld);
   1596       bool dom_st = dominated_by_p (CDI_DOMINATORS, e->src, bb_st);
   1597       if (dom_ld != dom_st)
   1598 	return false;
   1599     }
   1600 
   1601   *src_dr = single_ld;
   1602   *dst_dr = single_st;
   1603   return true;
   1604 }
   1605 
    1606 /* Given data reference DR in LOOP_NEST, this function checks the enclosing
    1607    loops from inner to outer to see if the loop's step equals the access
    1608    size at each level of the nest.  Return 2 if we can prove this at all
    1609    loop levels; record the access base and size in BASE and SIZE; save the
    1610    loop's step at each level in STEPS if it is not null.  For example:
   1611 
   1612      int arr[100][100][100];
   1613      for (i = 0; i < 100; i++)       ;steps[2] = 40000
   1614        for (j = 100; j > 0; j--)     ;steps[1] = -400
   1615 	 for (k = 0; k < 100; k++)   ;steps[0] = 4
   1616 	   arr[i][j - 1][k] = 0;     ;base = &arr, size = 4000000
   1617 
    1618    Return 1 if we can prove the equality at the innermost loop, but not
    1619    at all loop levels.  In this case, no information is recorded.
    1620 
    1621    Return 0 if no equality can be proven at any loop level.  */
   1622 
   1623 static int
   1624 compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
   1625 		      tree *size, vec<tree> *steps = NULL)
   1626 {
   1627   location_t loc = gimple_location (DR_STMT (dr));
   1628   basic_block bb = gimple_bb (DR_STMT (dr));
   1629   class loop *loop = bb->loop_father;
   1630   tree ref = DR_REF (dr);
   1631   tree access_base = build_fold_addr_expr (ref);
   1632   tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref));
   1633   int res = 0;
   1634 
   1635   do {
   1636       tree scev_fn = analyze_scalar_evolution (loop, access_base);
   1637       if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC)
   1638 	return res;
   1639 
   1640       access_base = CHREC_LEFT (scev_fn);
   1641       if (tree_contains_chrecs (access_base, NULL))
   1642 	return res;
   1643 
   1644       tree scev_step = CHREC_RIGHT (scev_fn);
   1645       /* Only support constant steps.  */
   1646       if (TREE_CODE (scev_step) != INTEGER_CST)
   1647 	return res;
   1648 
   1649       enum ev_direction access_dir = scev_direction (scev_fn);
   1650       if (access_dir == EV_DIR_UNKNOWN)
   1651 	return res;
   1652 
   1653       if (steps != NULL)
   1654 	steps->safe_push (scev_step);
   1655 
   1656       scev_step = fold_convert_loc (loc, sizetype, scev_step);
   1657       /* Compute absolute value of scev step.  */
   1658       if (access_dir == EV_DIR_DECREASES)
   1659 	scev_step = fold_build1_loc (loc, NEGATE_EXPR, sizetype, scev_step);
   1660 
    1661       /* At each loop level, the scev step must equal the access size; in
    1662 	 other words, DR must access consecutive memory across iterations.  */
   1663       if (!operand_equal_p (scev_step, access_size, 0))
   1664 	return res;
   1665 
   1666       /* Access stride can be computed for data reference at least for the
   1667 	 innermost loop.  */
   1668       res = 1;
   1669 
    1670       /* Compute the number of times DR is executed in the loop.  */
   1671       tree niters = number_of_latch_executions (loop);
   1672       niters = fold_convert_loc (loc, sizetype, niters);
   1673       if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, bb))
   1674 	niters = size_binop_loc (loc, PLUS_EXPR, niters, size_one_node);
   1675 
    1676       /* Compute DR's overall access size in the loop.  */
   1677       access_size = fold_build2_loc (loc, MULT_EXPR, sizetype,
   1678 				     niters, scev_step);
   1679       /* Adjust base address in case of negative step.  */
   1680       if (access_dir == EV_DIR_DECREASES)
   1681 	{
   1682 	  tree adj = fold_build2_loc (loc, MINUS_EXPR, sizetype,
   1683 				      scev_step, access_size);
   1684 	  access_base = fold_build_pointer_plus_loc (loc, access_base, adj);
   1685 	}
   1686   } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL);
   1687 
   1688   *base = access_base;
   1689   *size = access_size;
    1690   /* The access stride can be computed at each loop level.  */
   1691   return 2;
   1692 }
   1693 
    1694 /* Allocate and return a builtin_info struct, recording DST_DR, SRC_DR,
    1695    DST_BASE, SRC_BASE and SIZE in the allocated struct.  */
   1696 
   1697 static struct builtin_info *
   1698 alloc_builtin (data_reference_p dst_dr, data_reference_p src_dr,
   1699 	       tree dst_base, tree src_base, tree size)
   1700 {
   1701   struct builtin_info *builtin = XNEW (struct builtin_info);
   1702   builtin->dst_dr = dst_dr;
   1703   builtin->src_dr = src_dr;
   1704   builtin->dst_base = dst_base;
   1705   builtin->src_base = src_base;
   1706   builtin->size = size;
   1707   return builtin;
   1708 }
   1709 
    1710 /* Given data reference DR in loop nest LOOP, classify whether it forms a
    1711    builtin memset call.  */
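         /* A typical candidate, as a hypothetical illustration, is

              for (i = 0; i < n; i++)
                a[i] = 0;

            which can be implemented as __builtin_memset (&a[0], 0, n * 4)
            for 4-byte elements, provided compute_access_range can prove
            the whole range is accessed.  */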
   1712 
   1713 static void
   1714 classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr)
   1715 {
   1716   gimple *stmt = DR_STMT (dr);
   1717   tree base, size, rhs = gimple_assign_rhs1 (stmt);
   1718 
   1719   if (const_with_all_bytes_same (rhs) == -1
   1720       && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs))
   1721 	  || (TYPE_MODE (TREE_TYPE (rhs))
   1722 	      != TYPE_MODE (unsigned_char_type_node))))
   1723     return;
   1724 
   1725   if (TREE_CODE (rhs) == SSA_NAME
   1726       && !SSA_NAME_IS_DEFAULT_DEF (rhs)
   1727       && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
   1728     return;
   1729 
   1730   int res = compute_access_range (loop, dr, &base, &size);
   1731   if (res == 0)
   1732     return;
   1733   if (res == 1)
   1734     {
   1735       partition->kind = PKIND_PARTIAL_MEMSET;
   1736       return;
   1737     }
   1738 
   1739   poly_uint64 base_offset;
   1740   unsigned HOST_WIDE_INT const_base_offset;
   1741   tree base_base = strip_offset (base, &base_offset);
   1742   if (!base_offset.is_constant (&const_base_offset))
   1743     return;
   1744 
   1745   struct builtin_info *builtin;
   1746   builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size);
   1747   builtin->dst_base_base = base_base;
   1748   builtin->dst_base_offset = const_base_offset;
   1749   partition->builtin = builtin;
   1750   partition->kind = PKIND_MEMSET;
   1751 }
   1752 
    1753 /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG,
    1754    classify whether they form a builtin memcpy or memmove call.  */
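         /* As a hypothetical illustration,

              for (i = 0; i < n; i++)
                a[i] = b[i];

            is classified as a memmove builtin when both access ranges are
            computable, the steps match at every loop level, and the store
            never overwrites a source element before it has been loaded.  */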
   1755 
   1756 void
   1757 loop_distribution::classify_builtin_ldst (loop_p loop, struct graph *rdg,
   1758 					  partition *partition,
   1759 					  data_reference_p dst_dr,
   1760 					  data_reference_p src_dr)
   1761 {
   1762   tree base, size, src_base, src_size;
   1763   auto_vec<tree> dst_steps, src_steps;
   1764 
   1765   /* Compute access range of both load and store.  */
   1766   int res = compute_access_range (loop, dst_dr, &base, &size, &dst_steps);
   1767   if (res != 2)
   1768     return;
   1769   res = compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps);
   1770   if (res != 2)
   1771     return;
   1772 
   1773   /* They must have the same access size.  */
   1774   if (!operand_equal_p (size, src_size, 0))
   1775     return;
   1776 
   1777   /* They must have the same storage order.  */
   1778   if (reverse_storage_order_for_component_p (DR_REF (dst_dr))
   1779       != reverse_storage_order_for_component_p (DR_REF (src_dr)))
   1780     return;
   1781 
    1782   /* Load and store in the loop nest must access memory in the same way,
    1783      i.e., they must have the same steps in each loop of the nest.  */
   1784   if (dst_steps.length () != src_steps.length ())
   1785     return;
   1786   for (unsigned i = 0; i < dst_steps.length (); ++i)
   1787     if (!operand_equal_p (dst_steps[i], src_steps[i], 0))
   1788       return;
   1789 
    1790   /* Now check whether there is a dependence.  */
   1791   ddr_p ddr = get_data_dependence (rdg, src_dr, dst_dr);
   1792 
   1793   /* Classify as memmove if no dependence between load and store.  */
   1794   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
   1795     {
   1796       partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
   1797       partition->kind = PKIND_MEMMOVE;
   1798       return;
   1799     }
   1800 
   1801   /* Can't do memmove in case of unknown dependence or dependence without
   1802      classical distance vector.  */
   1803   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
   1804       || DDR_NUM_DIST_VECTS (ddr) == 0)
   1805     return;
   1806 
   1807   unsigned i;
   1808   lambda_vector dist_v;
   1809   int num_lev = (DDR_LOOP_NEST (ddr)).length ();
   1810   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
   1811     {
   1812       unsigned dep_lev = dependence_level (dist_v, num_lev);
   1813       /* Can't do memmove if load depends on store.  */
   1814       if (dep_lev > 0 && dist_v[dep_lev - 1] > 0 && !DDR_REVERSED_P (ddr))
   1815 	return;
   1816     }
   1817 
   1818   partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
   1819   partition->kind = PKIND_MEMMOVE;
   1820   return;
   1821 }
   1822 
   1823 bool
   1824 loop_distribution::classify_partition (loop_p loop,
   1825 				       struct graph *rdg, partition *partition,
   1826 				       bitmap stmt_in_all_partitions)
   1827 {
   1828   bitmap_iterator bi;
   1829   unsigned i;
   1830   data_reference_p single_ld = NULL, single_st = NULL;
   1831   bool volatiles_p = false, has_reduction = false;
   1832 
   1833   EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
   1834     {
   1835       gimple *stmt = RDG_STMT (rdg, i);
   1836 
   1837       if (gimple_has_volatile_ops (stmt))
   1838 	volatiles_p = true;
   1839 
    1840       /* If the stmt is not included in all partitions and has uses
    1841 	 outside of the loop, then mark the partition as a reduction.  */
   1842       if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
   1843 	{
    1844 	  /* Due to a limitation in the transform phase we have to fuse all
    1845 	     reduction partitions.  As a result, this could cancel valid
    1846 	     loop distribution, especially for loops whose induction
    1847 	     variable is used outside of the loop.  To work around this, we
    1848 	     skip marking a partition as a reduction if the reduction stmt
    1849 	     belongs to all partitions; in that case the reduction will be
    1850 	     computed correctly however partitions are fused/distributed.  */
   1851 	  if (!bitmap_bit_p (stmt_in_all_partitions, i))
   1852 	    partition->reduction_p = true;
   1853 	  else
   1854 	    has_reduction = true;
   1855 	}
   1856     }
   1857 
    1858   /* Simple workaround to prevent classifying the partition as a builtin
    1859      if it contains any use outside of the loop.  For the case where all
    1860      partitions have the reduction, this simple workaround is delayed
    1861      so that it only affects the last partition.  */
   1862   if (partition->reduction_p)
   1863      return has_reduction;
   1864 
   1865   /* Perform general partition disqualification for builtins.  */
   1866   if (volatiles_p
   1867       || !flag_tree_loop_distribute_patterns)
   1868     return has_reduction;
   1869 
   1870   /* Find single load/store data references for builtin partition.  */
   1871   if (!find_single_drs (loop, rdg, partition->stmts, &single_st, &single_ld)
   1872       || !single_st)
   1873     return has_reduction;
   1874 
   1875   if (single_ld && single_st)
   1876     {
   1877       gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld);
   1878       /* Direct aggregate copy or via an SSA name temporary.  */
   1879       if (load != store
   1880 	  && gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
   1881 	return has_reduction;
   1882     }
   1883 
   1884   partition->loc = gimple_location (DR_STMT (single_st));
   1885 
   1886   /* Classify the builtin kind.  */
   1887   if (single_ld == NULL)
   1888     classify_builtin_st (loop, partition, single_st);
   1889   else
   1890     classify_builtin_ldst (loop, rdg, partition, single_st, single_ld);
   1891   return has_reduction;
   1892 }
   1893 
   1894 bool
   1895 loop_distribution::share_memory_accesses (struct graph *rdg,
   1896 		       partition *partition1, partition *partition2)
   1897 {
   1898   unsigned i, j;
   1899   bitmap_iterator bi, bj;
   1900   data_reference_p dr1, dr2;
   1901 
    1902   /* First check whether the intersection of the two partitions contains
    1903      any loads or stores.  Common loads are the situation that happens
    1904      most often.  */
   1905   EXECUTE_IF_AND_IN_BITMAP (partition1->stmts, partition2->stmts, 0, i, bi)
   1906     if (RDG_MEM_WRITE_STMT (rdg, i)
   1907 	|| RDG_MEM_READS_STMT (rdg, i))
   1908       return true;
   1909 
   1910   /* Then check whether the two partitions access the same memory object.  */
   1911   EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
   1912     {
   1913       dr1 = datarefs_vec[i];
   1914 
   1915       if (!DR_BASE_ADDRESS (dr1)
   1916 	  || !DR_OFFSET (dr1) || !DR_INIT (dr1) || !DR_STEP (dr1))
   1917 	continue;
   1918 
   1919       EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, 0, j, bj)
   1920 	{
   1921 	  dr2 = datarefs_vec[j];
   1922 
   1923 	  if (!DR_BASE_ADDRESS (dr2)
   1924 	      || !DR_OFFSET (dr2) || !DR_INIT (dr2) || !DR_STEP (dr2))
   1925 	    continue;
   1926 
   1927 	  if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0)
   1928 	      && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0)
   1929 	      && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0)
   1930 	      && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0))
   1931 	    return true;
   1932 	}
   1933     }
   1934 
   1935   return false;
   1936 }
   1937 
    1938 /* For each seed statement in STARTING_STMTS, this function builds a
    1939    partition for it by adding dependent statements according to the RDG.
    1940    All partitions are recorded in PARTITIONS.  */
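         /* For example, for a hypothetical loop body

              tmp_1 = b[i_2];
              tmp_3 = tmp_1 + 1;
              a[i_2] = tmp_3;

            seeding from the store to a[i_2] also pulls the load of b[i_2]
            and the addition into the same partition, because the store
            depends on both in the RDG.  */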
   1941 
   1942 void
   1943 loop_distribution::rdg_build_partitions (struct graph *rdg,
   1944 					 vec<gimple *> starting_stmts,
   1945 					 vec<partition *> *partitions)
   1946 {
   1947   auto_bitmap processed;
   1948   int i;
   1949   gimple *stmt;
   1950 
   1951   FOR_EACH_VEC_ELT (starting_stmts, i, stmt)
   1952     {
   1953       int v = rdg_vertex_for_stmt (rdg, stmt);
   1954 
   1955       if (dump_file && (dump_flags & TDF_DETAILS))
   1956 	fprintf (dump_file,
   1957 		 "ldist asked to generate code for vertex %d\n", v);
   1958 
    1959       /* If the vertex is already contained in another partition, so
    1960          is the partition rooted at it.  */
   1961       if (bitmap_bit_p (processed, v))
   1962 	continue;
   1963 
   1964       partition *partition = build_rdg_partition_for_vertex (rdg, v);
   1965       bitmap_ior_into (processed, partition->stmts);
   1966 
   1967       if (dump_file && (dump_flags & TDF_DETAILS))
   1968 	{
   1969 	  fprintf (dump_file, "ldist creates useful %s partition:\n",
   1970 		   partition->type == PTYPE_PARALLEL ? "parallel" : "sequent");
   1971 	  bitmap_print (dump_file, partition->stmts, "  ", "\n");
   1972 	}
   1973 
   1974       partitions->safe_push (partition);
   1975     }
   1976 
   1977   /* All vertices should have been assigned to at least one partition now,
   1978      other than vertices belonging to dead code.  */
   1979 }
   1980 
   1981 /* Dump to FILE the PARTITIONS.  */
   1982 
   1983 static void
   1984 dump_rdg_partitions (FILE *file, const vec<partition *> &partitions)
   1985 {
   1986   int i;
   1987   partition *partition;
   1988 
   1989   FOR_EACH_VEC_ELT (partitions, i, partition)
   1990     debug_bitmap_file (file, partition->stmts);
   1991 }
   1992 
   1993 /* Debug PARTITIONS.  */
   1994 extern void debug_rdg_partitions (const vec<partition *> &);
   1995 
   1996 DEBUG_FUNCTION void
   1997 debug_rdg_partitions (const vec<partition *> &partitions)
   1998 {
   1999   dump_rdg_partitions (stderr, partitions);
   2000 }
   2001 
   2002 /* Returns the number of read and write operations in the RDG.  */
   2003 
   2004 static int
   2005 number_of_rw_in_rdg (struct graph *rdg)
   2006 {
   2007   int i, res = 0;
   2008 
   2009   for (i = 0; i < rdg->n_vertices; i++)
   2010     {
   2011       if (RDG_MEM_WRITE_STMT (rdg, i))
   2012 	++res;
   2013 
   2014       if (RDG_MEM_READS_STMT (rdg, i))
   2015 	++res;
   2016     }
   2017 
   2018   return res;
   2019 }
   2020 
   2021 /* Returns the number of read and write operations in a PARTITION of
   2022    the RDG.  */
   2023 
   2024 static int
   2025 number_of_rw_in_partition (struct graph *rdg, partition *partition)
   2026 {
   2027   int res = 0;
   2028   unsigned i;
   2029   bitmap_iterator ii;
   2030 
   2031   EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, ii)
   2032     {
   2033       if (RDG_MEM_WRITE_STMT (rdg, i))
   2034 	++res;
   2035 
   2036       if (RDG_MEM_READS_STMT (rdg, i))
   2037 	++res;
   2038     }
   2039 
   2040   return res;
   2041 }
   2042 
   2043 /* Returns true when one of the PARTITIONS contains all the read or
   2044    write operations of RDG.  */
   2045 
   2046 static bool
   2047 partition_contains_all_rw (struct graph *rdg,
   2048 			   const vec<partition *> &partitions)
   2049 {
   2050   int i;
   2051   partition *partition;
   2052   int nrw = number_of_rw_in_rdg (rdg);
   2053 
   2054   FOR_EACH_VEC_ELT (partitions, i, partition)
   2055     if (nrw == number_of_rw_in_partition (rdg, partition))
   2056       return true;
   2057 
   2058   return false;
   2059 }
   2060 
   2061 int
   2062 loop_distribution::pg_add_dependence_edges (struct graph *rdg, int dir,
   2063 			 bitmap drs1, bitmap drs2, vec<ddr_p> *alias_ddrs)
   2064 {
   2065   unsigned i, j;
   2066   bitmap_iterator bi, bj;
   2067   data_reference_p dr1, dr2, saved_dr1;
   2068 
   2069   /* dependence direction - 0 is no dependence, -1 is back,
   2070      1 is forth, 2 is both (we can stop then, merging will occur).  */
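           /* As a hypothetical illustration: if DRS1 contains a store to a[i]
              and DRS2 a later load of a[i], the direction from DRS1 to DRS2
              is 1; if each set also depends on the other, the result is 2 and
              the corresponding partitions will have to be merged.  */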
   2071   EXECUTE_IF_SET_IN_BITMAP (drs1, 0, i, bi)
   2072     {
   2073       dr1 = datarefs_vec[i];
   2074 
   2075       EXECUTE_IF_SET_IN_BITMAP (drs2, 0, j, bj)
   2076 	{
   2077 	  int res, this_dir = 1;
   2078 	  ddr_p ddr;
   2079 
   2080 	  dr2 = datarefs_vec[j];
   2081 
   2082 	  /* Skip all <read, read> data dependence.  */
   2083 	  if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
   2084 	    continue;
   2085 
   2086 	  saved_dr1 = dr1;
   2087 	  /* Re-shuffle data-refs to be in topological order.  */
   2088 	  if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
   2089 	      > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
   2090 	    {
   2091 	      std::swap (dr1, dr2);
   2092 	      this_dir = -this_dir;
   2093 	    }
   2094 	  ddr = get_data_dependence (rdg, dr1, dr2);
   2095 	  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
   2096 	    {
   2097 	      this_dir = 0;
   2098 	      res = data_ref_compare_tree (DR_BASE_ADDRESS (dr1),
   2099 					   DR_BASE_ADDRESS (dr2));
    2100 	      /* Be conservative.  If the data references are not well
    2101 		 analyzed, or the two data references have the same base
    2102 		 address and offset, add a dependence and consider them as
    2103 		 aliasing each other.  In other words, the dependence cannot
    2104 		 be resolved by a runtime alias check.  */
   2105 	      if (!DR_BASE_ADDRESS (dr1) || !DR_BASE_ADDRESS (dr2)
   2106 		  || !DR_OFFSET (dr1) || !DR_OFFSET (dr2)
   2107 		  || !DR_INIT (dr1) || !DR_INIT (dr2)
   2108 		  || !DR_STEP (dr1) || !tree_fits_uhwi_p (DR_STEP (dr1))
   2109 		  || !DR_STEP (dr2) || !tree_fits_uhwi_p (DR_STEP (dr2))
   2110 		  || res == 0)
   2111 		this_dir = 2;
   2112 	      /* Data dependence could be resolved by runtime alias check,
   2113 		 record it in ALIAS_DDRS.  */
   2114 	      else if (alias_ddrs != NULL)
   2115 		alias_ddrs->safe_push (ddr);
   2116 	      /* Or simply ignore it.  */
   2117 	    }
   2118 	  else if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE)
   2119 	    {
    2120 	      /* Known dependences can still be unordered throughout the
    2121 		 iteration space, see gcc.dg/tree-ssa/ldist-16.c and
    2122 		 gcc.dg/tree-ssa/pr94969.c.  */
   2123 	      if (DDR_NUM_DIST_VECTS (ddr) != 1)
   2124 		this_dir = 2;
   2125 	      else
   2126 		{
    2127 		  /* If the overlap is exact, preserve stmt order.  */
   2128 		  if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0),
   2129 					   DDR_NB_LOOPS (ddr)))
   2130 		    ;
    2131 		  /* Else, as the distance vector is lexicographically
    2132 		     positive, swap the dependence direction.  */
   2133 		  else
   2134 		    {
   2135 		      if (DDR_REVERSED_P (ddr))
   2136 			this_dir = -this_dir;
   2137 		      this_dir = -this_dir;
   2138 		    }
    2139 		  /* When the dependence distance of the innermost common
    2140 		     loop of the DRs is zero we have a conflict.  This is
    2141 		     due to wonky dependence analysis which sometimes
    2142 		     ends up using a zero distance in place of unknown.  */
   2143 		  auto l1 = gimple_bb (DR_STMT (dr1))->loop_father;
   2144 		  auto l2 = gimple_bb (DR_STMT (dr2))->loop_father;
   2145 		  int idx = index_in_loop_nest (find_common_loop (l1, l2)->num,
   2146 						DDR_LOOP_NEST (ddr));
   2147 		  if (DDR_DIST_VECT (ddr, 0)[idx] == 0
   2148 		      /* Unless it is the outermost loop which is the one
   2149 			 we eventually distribute.  */
   2150 		      && idx != 0)
   2151 		    this_dir = 2;
   2152 		}
   2153 	    }
   2154 	  else
   2155 	    this_dir = 0;
   2156 	  if (this_dir == 2)
   2157 	    return 2;
   2158 	  else if (dir == 0)
   2159 	    dir = this_dir;
   2160 	  else if (this_dir != 0 && dir != this_dir)
   2161 	    return 2;
   2162 	  /* Shuffle "back" dr1.  */
   2163 	  dr1 = saved_dr1;
   2164 	}
   2165     }
   2166   return dir;
   2167 }
   2168 
   2169 /* Compare postorder number of the partition graph vertices V1 and V2.  */
   2170 
   2171 static int
   2172 pgcmp (const void *v1_, const void *v2_)
   2173 {
   2174   const vertex *v1 = (const vertex *)v1_;
   2175   const vertex *v2 = (const vertex *)v2_;
   2176   return v2->post - v1->post;
   2177 }
   2178 
   2179 /* Data attached to vertices of partition dependence graph.  */
   2180 struct pg_vdata
   2181 {
   2182   /* ID of the corresponding partition.  */
   2183   int id;
   2184   /* The partition.  */
   2185   struct partition *partition;
   2186 };
   2187 
   2188 /* Data attached to edges of partition dependence graph.  */
   2189 struct pg_edata
   2190 {
   2191   /* If the dependence edge can be resolved by runtime alias check,
   2192      this vector contains data dependence relations for runtime alias
   2193      check.  On the other hand, if the dependence edge is introduced
   2194      because of compilation time known data dependence, this vector
   2195      contains nothing.  */
   2196   vec<ddr_p> alias_ddrs;
   2197 };
   2198 
   2199 /* Callback data for traversing edges in graph.  */
   2200 struct pg_edge_callback_data
   2201 {
    2202   /* Bitmap of strongly connected components that should be merged.  */
   2203   bitmap sccs_to_merge;
    2204   /* Array containing component information for all vertices.  */
   2205   int *vertices_component;
    2206   /* Vector to record all data dependence relations which are needed
    2207      to break strongly connected components by runtime alias checks.  */
   2208   vec<ddr_p> *alias_ddrs;
   2209 };
   2210 
    2211 /* Initialize the vertices' data for partition dependence graph PG with
    2212    PARTITIONS.  */
   2213 
   2214 static void
   2215 init_partition_graph_vertices (struct graph *pg,
   2216 			       vec<struct partition *> *partitions)
   2217 {
   2218   int i;
   2219   partition *partition;
   2220   struct pg_vdata *data;
   2221 
   2222   for (i = 0; partitions->iterate (i, &partition); ++i)
   2223     {
   2224       data = new pg_vdata;
   2225       pg->vertices[i].data = data;
   2226       data->id = i;
   2227       data->partition = partition;
   2228     }
   2229 }
   2230 
    2231 /* Add edge <I, J> to partition dependence graph PG.  Attach a vector of
    2232    data dependence relations to the edge if DDRS isn't NULL.  */
   2233 
   2234 static void
   2235 add_partition_graph_edge (struct graph *pg, int i, int j, vec<ddr_p> *ddrs)
   2236 {
   2237   struct graph_edge *e = add_edge (pg, i, j);
   2238 
    2239   /* If data dependence relations are attached to the edge, this
    2240      dependence edge can be resolved by runtime alias checks.  */
   2241   if (ddrs != NULL)
   2242     {
   2243       struct pg_edata *data = new pg_edata;
   2244 
   2245       gcc_assert (ddrs->length () > 0);
   2246       e->data = data;
   2247       data->alias_ddrs = vNULL;
   2248       data->alias_ddrs.safe_splice (*ddrs);
   2249     }
   2250 }
   2251 
    2252 /* Callback function for the graph traversal algorithm.  It returns true
    2253    if edge E should be skipped when traversing the graph.  */
   2254 
   2255 static bool
   2256 pg_skip_alias_edge (struct graph_edge *e)
   2257 {
   2258   struct pg_edata *data = (struct pg_edata *)e->data;
   2259   return (data != NULL && data->alias_ddrs.length () > 0);
   2260 }
   2261 
   2262 /* Callback function freeing data attached to edge E of graph.  */
   2263 
   2264 static void
   2265 free_partition_graph_edata_cb (struct graph *, struct graph_edge *e, void *)
   2266 {
   2267   if (e->data != NULL)
   2268     {
   2269       struct pg_edata *data = (struct pg_edata *)e->data;
   2270       data->alias_ddrs.release ();
   2271       delete data;
   2272     }
   2273 }
   2274 
    2275 /* Free data attached to the vertices of partition dependence graph PG.  */
   2276 
   2277 static void
   2278 free_partition_graph_vdata (struct graph *pg)
   2279 {
   2280   int i;
   2281   struct pg_vdata *data;
   2282 
   2283   for (i = 0; i < pg->n_vertices; ++i)
   2284     {
   2285       data = (struct pg_vdata *)pg->vertices[i].data;
   2286       delete data;
   2287     }
   2288 }
   2289 
    2290 /* Build and return the partition dependence graph for PARTITIONS.  RDG is
    2291    the reduced dependence graph for the loop to be distributed.  If
    2292    IGNORE_ALIAS_P is true, data dependence caused by possible aliasing
    2293    between references is ignored, as if it didn't exist at all; otherwise
    2294    all dependences are considered.  */
   2295 
   2296 struct graph *
   2297 loop_distribution::build_partition_graph (struct graph *rdg,
   2298 					  vec<struct partition *> *partitions,
   2299 					  bool ignore_alias_p)
   2300 {
   2301   int i, j;
   2302   struct partition *partition1, *partition2;
   2303   graph *pg = new_graph (partitions->length ());
   2304   auto_vec<ddr_p> alias_ddrs, *alias_ddrs_p;
   2305 
   2306   alias_ddrs_p = ignore_alias_p ? NULL : &alias_ddrs;
   2307 
   2308   init_partition_graph_vertices (pg, partitions);
   2309 
   2310   for (i = 0; partitions->iterate (i, &partition1); ++i)
   2311     {
   2312       for (j = i + 1; partitions->iterate (j, &partition2); ++j)
   2313 	{
   2314 	  /* dependence direction - 0 is no dependence, -1 is back,
   2315 	     1 is forth, 2 is both (we can stop then, merging will occur).  */
   2316 	  int dir = 0;
   2317 
    2318 	  /* If the first partition has a reduction, add a back edge; if the
    2319 	     second partition has a reduction, add a forth edge.  This makes
    2320 	     sure that the reduction partition will be sorted as the last one.  */
   2321 	  if (partition_reduction_p (partition1))
   2322 	    dir = -1;
   2323 	  else if (partition_reduction_p (partition2))
   2324 	    dir = 1;
   2325 
   2326 	  /* Cleanup the temporary vector.  */
   2327 	  alias_ddrs.truncate (0);
   2328 
   2329 	  dir = pg_add_dependence_edges (rdg, dir, partition1->datarefs,
   2330 					 partition2->datarefs, alias_ddrs_p);
   2331 
    2332 	  /* Add an edge to the partition graph if there exists a dependence.
    2333 	     There are two types of edges.  One type is caused by a
    2334 	     compilation time known dependence and cannot be resolved by a
    2335 	     runtime alias check.  The other type can be resolved by a
    2336 	     runtime alias check.  */
   2337 	  if (dir == 1 || dir == 2
   2338 	      || alias_ddrs.length () > 0)
   2339 	    {
   2340 	      /* Attach data dependence relations to edge that can be resolved
   2341 		 by runtime alias check.  */
   2342 	      bool alias_edge_p = (dir != 1 && dir != 2);
   2343 	      add_partition_graph_edge (pg, i, j,
   2344 					(alias_edge_p) ? &alias_ddrs : NULL);
   2345 	    }
   2346 	  if (dir == -1 || dir == 2
   2347 	      || alias_ddrs.length () > 0)
   2348 	    {
   2349 	      /* Attach data dependence relations to edge that can be resolved
   2350 		 by runtime alias check.  */
   2351 	      bool alias_edge_p = (dir != -1 && dir != 2);
   2352 	      add_partition_graph_edge (pg, j, i,
   2353 					(alias_edge_p) ? &alias_ddrs : NULL);
   2354 	    }
   2355 	}
   2356     }
   2357   return pg;
   2358 }
   2359 
   2360 /* Sort partitions in PG in descending post order and store them in
   2361    PARTITIONS.  */
   2362 
   2363 static void
   2364 sort_partitions_by_post_order (struct graph *pg,
   2365 			       vec<struct partition *> *partitions)
   2366 {
   2367   int i;
   2368   struct pg_vdata *data;
   2369 
   2370   /* Now order the remaining nodes in descending postorder.  */
   2371   qsort (pg->vertices, pg->n_vertices, sizeof (vertex), pgcmp);
   2372   partitions->truncate (0);
   2373   for (i = 0; i < pg->n_vertices; ++i)
   2374     {
   2375       data = (struct pg_vdata *)pg->vertices[i].data;
   2376       if (data->partition)
   2377 	partitions->safe_push (data->partition);
   2378     }
   2379 }
   2380 
   2381 void
   2382 loop_distribution::merge_dep_scc_partitions (struct graph *rdg,
   2383 					     vec<struct partition *> *partitions,
   2384 					     bool ignore_alias_p)
   2385 {
   2386   struct partition *partition1, *partition2;
   2387   struct pg_vdata *data;
   2388   graph *pg = build_partition_graph (rdg, partitions, ignore_alias_p);
   2389   int i, j, num_sccs = graphds_scc (pg, NULL);
   2390 
    2391   /* A strongly connected component means a dependence cycle, so we cannot
    2392      distribute its partitions.  Fuse them together instead.  */
   2393   if ((unsigned) num_sccs < partitions->length ())
   2394     {
   2395       for (i = 0; i < num_sccs; ++i)
   2396 	{
   2397 	  for (j = 0; partitions->iterate (j, &partition1); ++j)
   2398 	    if (pg->vertices[j].component == i)
   2399 	      break;
   2400 	  for (j = j + 1; partitions->iterate (j, &partition2); ++j)
   2401 	    if (pg->vertices[j].component == i)
   2402 	      {
   2403 		partition_merge_into (NULL, partition1,
   2404 				      partition2, FUSE_SAME_SCC);
   2405 		partition1->type = PTYPE_SEQUENTIAL;
   2406 		(*partitions)[j] = NULL;
   2407 		partition_free (partition2);
   2408 		data = (struct pg_vdata *)pg->vertices[j].data;
   2409 		data->partition = NULL;
   2410 	      }
   2411 	}
   2412     }
   2413 
   2414   sort_partitions_by_post_order (pg, partitions);
   2415   gcc_assert (partitions->length () == (unsigned)num_sccs);
   2416   free_partition_graph_vdata (pg);
   2417   for_each_edge (pg, free_partition_graph_edata_cb, NULL);
   2418   free_graph (pg);
   2419 }
   2420 
   2421 /* Callback function for traversing edge E in graph G.  DATA is private
   2422    callback data.  */
   2423 
   2424 static void
   2425 pg_collect_alias_ddrs (struct graph *g, struct graph_edge *e, void *data)
   2426 {
   2427   int i, j, component;
   2428   struct pg_edge_callback_data *cbdata;
   2429   struct pg_edata *edata = (struct pg_edata *) e->data;
   2430 
    2431   /* If the edge doesn't have attached data dependences, it represents
    2432      compilation time known dependences.  This type of dependence cannot
    2433      be resolved by a runtime alias check.  */
   2434   if (edata == NULL || edata->alias_ddrs.length () == 0)
   2435     return;
   2436 
   2437   cbdata = (struct pg_edge_callback_data *) data;
   2438   i = e->src;
   2439   j = e->dest;
   2440   component = cbdata->vertices_component[i];
    2441   /* Vertices are topologically sorted according to compilation time
    2442      known dependences, so we can break strongly connected components
    2443      by removing edges of the opposite direction, i.e., edges pointing
    2444      from the vertex with the smaller post number to the vertex with
    2445      the bigger post number.  */
   2446   if (g->vertices[i].post < g->vertices[j].post
   2447       /* We only need to remove edges connecting vertices in the same
   2448 	 strong connected component to break it.  */
   2449       && component == cbdata->vertices_component[j]
   2450       /* Check if we want to break the strong connected component or not.  */
   2451       && !bitmap_bit_p (cbdata->sccs_to_merge, component))
   2452     cbdata->alias_ddrs->safe_splice (edata->alias_ddrs);
   2453 }
   2454 
   2455 /* Callback function for traversing edge E.  DATA is private
   2456    callback data.  */
   2457 
   2458 static void
   2459 pg_unmark_merged_alias_ddrs (struct graph *, struct graph_edge *e, void *data)
   2460 {
   2461   int i, j, component;
   2462   struct pg_edge_callback_data *cbdata;
   2463   struct pg_edata *edata = (struct pg_edata *) e->data;
   2464 
   2465   if (edata == NULL || edata->alias_ddrs.length () == 0)
   2466     return;
   2467 
   2468   cbdata = (struct pg_edge_callback_data *) data;
   2469   i = e->src;
   2470   j = e->dest;
   2471   component = cbdata->vertices_component[i];
   2472   /* Make sure to not skip vertices inside SCCs we are going to merge.  */
   2473   if (component == cbdata->vertices_component[j]
   2474       && bitmap_bit_p (cbdata->sccs_to_merge, component))
   2475     {
   2476       edata->alias_ddrs.release ();
   2477       delete edata;
   2478       e->data = NULL;
   2479     }
   2480 }
   2481 
    2482 /* This is the main function breaking strongly connected components in
    2483    PARTITIONS, given reduced dependence graph RDG.  Store data dependence
    2484    relations for the runtime alias check in ALIAS_DDRS.  */
   2485 void
   2486 loop_distribution::break_alias_scc_partitions (struct graph *rdg,
   2487 					       vec<struct partition *> *partitions,
   2488 					       vec<ddr_p> *alias_ddrs)
   2489 {
   2490   int i, j, k, num_sccs, num_sccs_no_alias = 0;
   2491   /* Build partition dependence graph.  */
   2492   graph *pg = build_partition_graph (rdg, partitions, false);
   2493 
   2494   alias_ddrs->truncate (0);
   2495   /* Find strong connected components in the graph, with all dependence edges
   2496      considered.  */
   2497   num_sccs = graphds_scc (pg, NULL);
    2498   /* All SCCs can now be broken by runtime alias checks because SCCs caused
    2499      by compilation time known dependences were merged before this point.  */
   2500   if ((unsigned) num_sccs < partitions->length ())
   2501     {
   2502       struct pg_edge_callback_data cbdata;
   2503       auto_bitmap sccs_to_merge;
   2504       auto_vec<enum partition_type> scc_types;
   2505       struct partition *partition, *first;
   2506 
    2507       /* If all partitions in an SCC have the same type, we can simply merge
    2508 	 the SCC.  This loop finds such SCCs and records them in the bitmap.  */
   2509       bitmap_set_range (sccs_to_merge, 0, (unsigned) num_sccs);
   2510       for (i = 0; i < num_sccs; ++i)
   2511 	{
   2512 	  for (j = 0; partitions->iterate (j, &first); ++j)
   2513 	    if (pg->vertices[j].component == i)
   2514 	      break;
   2515 
   2516 	  bool same_type = true, all_builtins = partition_builtin_p (first);
   2517 	  for (++j; partitions->iterate (j, &partition); ++j)
   2518 	    {
   2519 	      if (pg->vertices[j].component != i)
   2520 		continue;
   2521 
   2522 	      if (first->type != partition->type)
   2523 		{
   2524 		  same_type = false;
   2525 		  break;
   2526 		}
   2527 	      all_builtins &= partition_builtin_p (partition);
   2528 	    }
    2529 	  /* Merge an SCC if all partitions in it have the same type, even
    2530 	     though the resulting partition is sequential, because the
    2531 	     vectorizer can do a better runtime alias check.  One exception
    2532 	     is when all partitions in the SCC are builtins.  */
   2533 	  if (!same_type || all_builtins)
   2534 	    bitmap_clear_bit (sccs_to_merge, i);
   2535 	}
   2536 
   2537       /* Initialize callback data for traversing.  */
   2538       cbdata.sccs_to_merge = sccs_to_merge;
   2539       cbdata.alias_ddrs = alias_ddrs;
   2540       cbdata.vertices_component = XNEWVEC (int, pg->n_vertices);
    2541       /* Record the component information, which will be corrupted by the
    2542 	 next graph SCC finding call.  */
   2543       for (i = 0; i < pg->n_vertices; ++i)
   2544 	cbdata.vertices_component[i] = pg->vertices[i].component;
   2545 
   2546       /* Collect data dependences for runtime alias checks to break SCCs.  */
   2547       if (bitmap_count_bits (sccs_to_merge) != (unsigned) num_sccs)
   2548 	{
    2549 	  /* For SCCs we want to merge, clear all alias_ddrs for edges
    2550 	     inside the component.  */
   2551 	  for_each_edge (pg, pg_unmark_merged_alias_ddrs, &cbdata);
   2552 
   2553 	  /* Run SCC finding algorithm again, with alias dependence edges
   2554 	     skipped.  This is to topologically sort partitions according to
   2555 	     compilation time known dependence.  Note the topological order
   2556 	     is stored in the form of pg's post order number.  */
   2557 	  num_sccs_no_alias = graphds_scc (pg, NULL, pg_skip_alias_edge);
   2558 	  /* We cannot assert partitions->length () == num_sccs_no_alias
   2559 	     since we are not ignoring alias edges in cycles we are
   2560 	     going to merge.  That's required to compute correct postorder.  */
    2561 	  /* With the topological order, we can construct two subgraphs L
    2562 	     and R.  L contains edge <x, y> where x < y in terms of post
    2563 	     order, while R contains edge <x, y> where x > y.  Edges for
    2564 	     compilation time known dependences all fall in R, so we break
    2565 	     SCCs by removing all (alias) edges in subgraph L.  */
   2566 	  for_each_edge (pg, pg_collect_alias_ddrs, &cbdata);
   2567 	}
   2568 
    2569       /* For each SCC that doesn't need to be broken, merge it.  */
   2570       for (i = 0; i < num_sccs; ++i)
   2571 	{
   2572 	  if (!bitmap_bit_p (sccs_to_merge, i))
   2573 	    continue;
   2574 
   2575 	  for (j = 0; partitions->iterate (j, &first); ++j)
   2576 	    if (cbdata.vertices_component[j] == i)
   2577 	      break;
   2578 	  for (k = j + 1; partitions->iterate (k, &partition); ++k)
   2579 	    {
   2580 	      struct pg_vdata *data;
   2581 
   2582 	      if (cbdata.vertices_component[k] != i)
   2583 		continue;
   2584 
   2585 	      partition_merge_into (NULL, first, partition, FUSE_SAME_SCC);
   2586 	      (*partitions)[k] = NULL;
   2587 	      partition_free (partition);
   2588 	      data = (struct pg_vdata *)pg->vertices[k].data;
   2589 	      gcc_assert (data->id == k);
   2590 	      data->partition = NULL;
   2591 	      /* The result partition of merged SCC must be sequential.  */
   2592 	      first->type = PTYPE_SEQUENTIAL;
   2593 	    }
   2594 	}
    2595       /* If the reduction partition's SCC is broken by runtime alias
    2596 	 checks, we force a negative post order on it, making sure it
    2597 	 will be scheduled last.  */
   2598       if (num_sccs_no_alias > 0)
   2599 	{
   2600 	  j = -1;
   2601 	  for (i = 0; i < pg->n_vertices; ++i)
   2602 	    {
   2603 	      struct pg_vdata *data = (struct pg_vdata *)pg->vertices[i].data;
   2604 	      if (data->partition && partition_reduction_p (data->partition))
   2605 		{
   2606 		  gcc_assert (j == -1);
   2607 		  j = i;
   2608 		}
   2609 	    }
   2610 	  if (j >= 0)
   2611 	    pg->vertices[j].post = -1;
   2612 	}
   2613 
   2614       free (cbdata.vertices_component);
   2615     }
   2616 
   2617   sort_partitions_by_post_order (pg, partitions);
   2618   free_partition_graph_vdata (pg);
   2619   for_each_edge (pg, free_partition_graph_edata_cb, NULL);
   2620   free_graph (pg);
   2621 
   2622   if (dump_file && (dump_flags & TDF_DETAILS))
   2623     {
   2624       fprintf (dump_file, "Possible alias data dependence to break:\n");
   2625       dump_data_dependence_relations (dump_file, *alias_ddrs);
   2626     }
   2627 }
   2628 
   2629 /* Compute and return an expression whose value is the segment length which
   2630    will be accessed by DR in NITERS iterations.  */
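         /* For example (hypothetical numbers), with DR_STEP 4 and NITERS 100
            the segment length is 4 * (100 - 1) == 396 bytes; the final access
            itself is covered by the access size recorded alongside the
            segment.  */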
   2631 
   2632 static tree
   2633 data_ref_segment_size (struct data_reference *dr, tree niters)
   2634 {
   2635   niters = size_binop (MINUS_EXPR,
   2636 		       fold_convert (sizetype, niters),
   2637 		       size_one_node);
   2638   return size_binop (MULT_EXPR,
   2639 		     fold_convert (sizetype, DR_STEP (dr)),
   2640 		     fold_convert (sizetype, niters));
   2641 }
   2642 
    2643 /* Return true if LOOP's latch is dominated by the statement for data
    2644    reference DR.  */
   2645 
   2646 static inline bool
   2647 latch_dominated_by_data_ref (class loop *loop, data_reference *dr)
   2648 {
   2649   return dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
   2650 			 gimple_bb (DR_STMT (dr)));
   2651 }
   2652 
   2653 /* Compute alias check pairs and store them in COMP_ALIAS_PAIRS for LOOP's
   2654    data dependence relations ALIAS_DDRS.  */
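         /* Each resulting pair is later expanded into a range-overlap test,
            roughly of the shape (an illustration, not the exact generated
            code)

              addr_a + seg_len_a <= addr_b || addr_b + seg_len_b <= addr_a

            by create_runtime_alias_checks; prune_runtime_alias_test_list may
            first merge or drop redundant pairs.  */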
   2655 
   2656 static void
   2657 compute_alias_check_pairs (class loop *loop, vec<ddr_p> *alias_ddrs,
   2658 			   vec<dr_with_seg_len_pair_t> *comp_alias_pairs)
   2659 {
   2660   unsigned int i;
   2661   unsigned HOST_WIDE_INT factor = 1;
   2662   tree niters_plus_one, niters = number_of_latch_executions (loop);
   2663 
   2664   gcc_assert (niters != NULL_TREE && niters != chrec_dont_know);
   2665   niters = fold_convert (sizetype, niters);
   2666   niters_plus_one = size_binop (PLUS_EXPR, niters, size_one_node);
   2667 
   2668   if (dump_file && (dump_flags & TDF_DETAILS))
   2669     fprintf (dump_file, "Creating alias check pairs:\n");
   2670 
   2671   /* Iterate all data dependence relations and compute alias check pairs.  */
   2672   for (i = 0; i < alias_ddrs->length (); i++)
   2673     {
   2674       ddr_p ddr = (*alias_ddrs)[i];
   2675       struct data_reference *dr_a = DDR_A (ddr);
   2676       struct data_reference *dr_b = DDR_B (ddr);
   2677       tree seg_length_a, seg_length_b;
   2678 
   2679       if (latch_dominated_by_data_ref (loop, dr_a))
   2680 	seg_length_a = data_ref_segment_size (dr_a, niters_plus_one);
   2681       else
   2682 	seg_length_a = data_ref_segment_size (dr_a, niters);
   2683 
   2684       if (latch_dominated_by_data_ref (loop, dr_b))
   2685 	seg_length_b = data_ref_segment_size (dr_b, niters_plus_one);
   2686       else
   2687 	seg_length_b = data_ref_segment_size (dr_b, niters);
   2688 
   2689       unsigned HOST_WIDE_INT access_size_a
   2690 	= tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a))));
   2691       unsigned HOST_WIDE_INT access_size_b
   2692 	= tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b))));
   2693       unsigned int align_a = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_a)));
   2694       unsigned int align_b = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_b)));
   2695 
   2696       dr_with_seg_len_pair_t dr_with_seg_len_pair
   2697 	(dr_with_seg_len (dr_a, seg_length_a, access_size_a, align_a),
   2698 	 dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b),
   2699 	 /* ??? Would WELL_ORDERED be safe?  */
   2700 	 dr_with_seg_len_pair_t::REORDERED);
   2701 
   2702       comp_alias_pairs->safe_push (dr_with_seg_len_pair);
   2703     }
   2704 
   2705   if (tree_fits_uhwi_p (niters))
   2706     factor = tree_to_uhwi (niters);
   2707 
   2708   /* Prune alias check pairs.  */
   2709   prune_runtime_alias_test_list (comp_alias_pairs, factor);
   2710   if (dump_file && (dump_flags & TDF_DETAILS))
   2711     fprintf (dump_file,
   2712 	     "Improved number of alias checks from %d to %d\n",
   2713 	     alias_ddrs->length (), comp_alias_pairs->length ());
   2714 }
   2715 
    2716 /* Given data dependence relations in ALIAS_DDRS, generate runtime alias
    2717    checks and version LOOP under the condition of these runtime checks.  */
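         /* Schematically, the result is

              if (runtime alias checks pass)
                LOOP;    <- this copy will be distributed
              else
                LOOP';   <- versioned copy, kept and marked dont_vectorize

            where the condition may be wrapped in an IFN_LOOP_DIST_ALIAS call
            so that the vectorizer can still cancel an unprofitable
            distribution later.  */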
   2718 
   2719 static void
   2720 version_loop_by_alias_check (vec<struct partition *> *partitions,
   2721 			     class loop *loop, vec<ddr_p> *alias_ddrs)
   2722 {
   2723   profile_probability prob;
   2724   basic_block cond_bb;
   2725   class loop *nloop;
   2726   tree lhs, arg0, cond_expr = NULL_TREE;
   2727   gimple_seq cond_stmts = NULL;
   2728   gimple *call_stmt = NULL;
   2729   auto_vec<dr_with_seg_len_pair_t> comp_alias_pairs;
   2730 
   2731   /* Generate code for runtime alias checks if necessary.  */
   2732   gcc_assert (alias_ddrs->length () > 0);
   2733 
   2734   if (dump_file && (dump_flags & TDF_DETAILS))
   2735     fprintf (dump_file,
   2736 	     "Version loop <%d> with runtime alias check\n", loop->num);
   2737 
   2738   compute_alias_check_pairs (loop, alias_ddrs, &comp_alias_pairs);
   2739   create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr);
   2740   cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts,
   2741 				      is_gimple_val, NULL_TREE);
   2742 
   2743   /* Depend on vectorizer to fold IFN_LOOP_DIST_ALIAS.  */
   2744   bool cancelable_p = flag_tree_loop_vectorize;
   2745   if (cancelable_p)
   2746     {
   2747       unsigned i = 0;
   2748       struct partition *partition;
   2749       for (; partitions->iterate (i, &partition); ++i)
   2750 	if (!partition_builtin_p (partition))
   2751 	  break;
   2752 
    2753      /* If all partitions are builtins, distribution would be profitable
    2754 	and we don't want to cancel the runtime alias checks.  */
   2755       if (i == partitions->length ())
   2756 	cancelable_p = false;
   2757     }
   2758 
   2759   /* Generate internal function call for loop distribution alias check if the
   2760      runtime alias check should be cancelable.  */
   2761   if (cancelable_p)
   2762     {
   2763       call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS,
   2764 					      2, NULL_TREE, cond_expr);
   2765       lhs = make_ssa_name (boolean_type_node);
   2766       gimple_call_set_lhs (call_stmt, lhs);
   2767     }
   2768   else
   2769     lhs = cond_expr;
   2770 
   2771   prob = profile_probability::guessed_always ().apply_scale (9, 10);
   2772   initialize_original_copy_tables ();
   2773   nloop = loop_version (loop, lhs, &cond_bb, prob, prob.invert (),
   2774 			prob, prob.invert (), true);
   2775   free_original_copy_tables ();
   2776   /* Record the original loop number in newly generated loops.  In case of
   2777      distribution, the original loop will be distributed and the new loop
   2778      is kept.  */
   2779   loop->orig_loop_num = nloop->num;
   2780   nloop->orig_loop_num = nloop->num;
   2781   nloop->dont_vectorize = true;
   2782   nloop->force_vectorize = false;
   2783 
   2784   if (call_stmt)
   2785     {
   2786       /* Record new loop's num in IFN_LOOP_DIST_ALIAS because the original
   2787 	 loop could be destroyed.  */
   2788       arg0 = build_int_cst (integer_type_node, loop->orig_loop_num);
   2789       gimple_call_set_arg (call_stmt, 0, arg0);
   2790       gimple_seq_add_stmt_without_update (&cond_stmts, call_stmt);
   2791     }
   2792 
   2793   if (cond_stmts)
   2794     {
   2795       gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb);
   2796       gsi_insert_seq_before (&cond_gsi, cond_stmts, GSI_SAME_STMT);
   2797     }
   2798   update_ssa (TODO_update_ssa);
   2799 }
   2800 
    2801 /* Return true if loop versioning is needed to distribute PARTITIONS.
    2802    ALIAS_DDRS are data dependence relations for the runtime alias check.  */
   2803 
   2804 static inline bool
   2805 version_for_distribution_p (vec<struct partition *> *partitions,
   2806 			    vec<ddr_p> *alias_ddrs)
   2807 {
   2808   /* No need to version loop if we have only one partition.  */
   2809   if (partitions->length () == 1)
   2810     return false;
   2811 
   2812   /* Need to version loop if runtime alias check is necessary.  */
   2813   return (alias_ddrs->length () > 0);
   2814 }
   2815 
   2816 /* Compare base offset of builtin mem* partitions P1 and P2.  */
   2817 
   2818 static int
   2819 offset_cmp (const void *vp1, const void *vp2)
   2820 {
   2821   struct partition *p1 = *(struct partition *const *) vp1;
   2822   struct partition *p2 = *(struct partition *const *) vp2;
   2823   unsigned HOST_WIDE_INT o1 = p1->builtin->dst_base_offset;
   2824   unsigned HOST_WIDE_INT o2 = p2->builtin->dst_base_offset;
   2825   return (o2 < o1) - (o1 < o2);
   2826 }
   2827 
    2828 /* Fuse adjacent memset builtin PARTITIONS if possible.  This is a special
    2829    case optimization transforming the code below:
   2830 
   2831      __builtin_memset (&obj, 0, 100);
   2832      _1 = &obj + 100;
   2833      __builtin_memset (_1, 0, 200);
   2834      _2 = &obj + 300;
   2835      __builtin_memset (_2, 0, 100);
   2836 
   2837    into:
   2838 
   2839      __builtin_memset (&obj, 0, 400);
   2840 
    2841    Note we don't have dependence information between different partitions
    2842    at this point; as a result, we can't handle nonadjacent memset builtin
    2843    partitions since a dependence might be broken.  */
   2844 
   2845 static void
   2846 fuse_memset_builtins (vec<struct partition *> *partitions)
   2847 {
   2848   unsigned i, j;
   2849   struct partition *part1, *part2;
   2850   tree rhs1, rhs2;
   2851 
   2852   for (i = 0; partitions->iterate (i, &part1);)
   2853     {
   2854       if (part1->kind != PKIND_MEMSET)
   2855 	{
   2856 	  i++;
   2857 	  continue;
   2858 	}
   2859 
   2860       /* Find sub-array of memset builtins of the same base.  Index range
   2861 	 of the sub-array is [i, j) with "j > i".  */
   2862       for (j = i + 1; partitions->iterate (j, &part2); ++j)
   2863 	{
   2864 	  if (part2->kind != PKIND_MEMSET
   2865 	      || !operand_equal_p (part1->builtin->dst_base_base,
   2866 				   part2->builtin->dst_base_base, 0))
   2867 	    break;
   2868 
   2869 	  /* Memset calls setting different values can't be merged.  */
   2870 	  rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
   2871 	  rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
   2872 	  if (!operand_equal_p (rhs1, rhs2, 0))
   2873 	    break;
   2874 	}
   2875 
   2876       /* Stable sort is required in order to avoid breaking dependence.  */
   2877       gcc_stablesort (&(*partitions)[i], j - i, sizeof (*partitions)[i],
   2878 		      offset_cmp);
   2879       /* Continue with next partition.  */
   2880       i = j;
   2881     }
   2882 
   2883   /* Merge all consecutive memset builtin partitions.  */
   2884   for (i = 0; i < partitions->length () - 1;)
   2885     {
   2886       part1 = (*partitions)[i];
   2887       if (part1->kind != PKIND_MEMSET)
   2888 	{
   2889 	  i++;
   2890 	  continue;
   2891 	}
   2892 
   2893       part2 = (*partitions)[i + 1];
   2894       /* Only merge memset partitions of the same base and with constant
   2895 	 access sizes.  */
   2896       if (part2->kind != PKIND_MEMSET
   2897 	  || TREE_CODE (part1->builtin->size) != INTEGER_CST
   2898 	  || TREE_CODE (part2->builtin->size) != INTEGER_CST
   2899 	  || !operand_equal_p (part1->builtin->dst_base_base,
   2900 			       part2->builtin->dst_base_base, 0))
   2901 	{
   2902 	  i++;
   2903 	  continue;
   2904 	}
   2905       rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
   2906       rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
   2907       int bytev1 = const_with_all_bytes_same (rhs1);
   2908       int bytev2 = const_with_all_bytes_same (rhs2);
   2909       /* Only merge memset partitions of the same value.  */
   2910       if (bytev1 != bytev2 || bytev1 == -1)
   2911 	{
   2912 	  i++;
   2913 	  continue;
   2914 	}
   2915       wide_int end1 = wi::add (part1->builtin->dst_base_offset,
   2916 			       wi::to_wide (part1->builtin->size));
   2917       /* Only merge adjacent memset partitions.  */
   2918       if (wi::ne_p (end1, part2->builtin->dst_base_offset))
   2919 	{
   2920 	  i++;
   2921 	  continue;
   2922 	}
   2923       /* Merge partitions[i] and partitions[i+1].  */
   2924       part1->builtin->size = fold_build2 (PLUS_EXPR, sizetype,
   2925 					  part1->builtin->size,
   2926 					  part2->builtin->size);
   2927       partition_free (part2);
   2928       partitions->ordered_remove (i + 1);
   2929     }
   2930 }
   2931 
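        /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
           ALIAS_DDRS contains the data dependence relations that would need a
           runtime alias check; if any are present, or if there is only one
           partition, the PARTITIONS vector is left untouched.  */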
   2932 void
   2933 loop_distribution::finalize_partitions (class loop *loop,
   2934 					vec<struct partition *> *partitions,
   2935 					vec<ddr_p> *alias_ddrs)
   2936 {
   2937   unsigned i;
   2938   struct partition *partition, *a;
   2939 
   2940   if (partitions->length () == 1
   2941       || alias_ddrs->length () > 0)
   2942     return;
   2943 
   2944   unsigned num_builtin = 0, num_normal = 0, num_partial_memset = 0;
   2945   bool same_type_p = true;
   2946   enum partition_type type = ((*partitions)[0])->type;
   2947   for (i = 0; partitions->iterate (i, &partition); ++i)
   2948     {
   2949       same_type_p &= (type == partition->type);
   2950       if (partition_builtin_p (partition))
   2951 	{
   2952 	  num_builtin++;
   2953 	  continue;
   2954 	}
   2955       num_normal++;
   2956       if (partition->kind == PKIND_PARTIAL_MEMSET)
   2957 	num_partial_memset++;
   2958     }
   2959 
   2960   /* Don't distribute the current loop into too many loops given we don't
   2961      have a memory stream cost model.  Be even more conservative in case
   2962      of loop nest distribution.  */
   2963   if ((same_type_p && num_builtin == 0
   2964        && (loop->inner == NULL || num_normal != 2 || num_partial_memset != 1))
   2965       || (loop->inner != NULL
   2966 	  && i >= NUM_PARTITION_THRESHOLD && num_normal > 1)
   2967       || (loop->inner == NULL
   2968 	  && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin))
   2969     {
   2970       a = (*partitions)[0];
   2971       for (i = 1; partitions->iterate (i, &partition); ++i)
   2972 	{
   2973 	  partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
   2974 	  partition_free (partition);
   2975 	}
   2976       partitions->truncate (1);
   2977     }
   2978 
   2979   /* Fuse memset builtins if possible.  */
   2980   if (partitions->length () > 1)
   2981     fuse_memset_builtins (partitions);
   2982 }
   2983 
   2984 /* Distributes the code from LOOP in such a way that producer statements
   2985    are placed before consumer statements.  Tries to separate only the
   2986    statements from STMTS into separate loops.  Returns the number of
   2987    distributed loops.  Sets *NB_CALLS to the number of generated builtin
   2988    calls and *DESTROY_P to whether LOOP needs to be destroyed.  */
   2989 
   2990 int
   2991 loop_distribution::distribute_loop (class loop *loop,
   2992 		 const vec<gimple *> &stmts,
   2993 		 control_dependences *cd, int *nb_calls, bool *destroy_p,
   2994 		 bool only_patterns_p)
   2995 {
   2996   ddrs_table = new hash_table<ddr_hasher> (389);
   2997   struct graph *rdg;
   2998   partition *partition;
   2999   int i, nbp;
   3000 
   3001   *destroy_p = false;
   3002   *nb_calls = 0;
   3003   loop_nest.create (0);
   3004   if (!find_loop_nest (loop, &loop_nest))
   3005     {
   3006       loop_nest.release ();
   3007       delete ddrs_table;
   3008       return 0;
   3009     }
   3010 
   3011   datarefs_vec.create (20);
   3012   has_nonaddressable_dataref_p = false;
   3013   rdg = build_rdg (loop, cd);
   3014   if (!rdg)
   3015     {
   3016       if (dump_file && (dump_flags & TDF_DETAILS))
   3017 	fprintf (dump_file,
   3018 		 "Loop %d not distributed: failed to build the RDG.\n",
   3019 		 loop->num);
   3020 
   3021       loop_nest.release ();
   3022       free_data_refs (datarefs_vec);
   3023       delete ddrs_table;
   3024       return 0;
   3025     }
   3026 
   3027   if (datarefs_vec.length () > MAX_DATAREFS_NUM)
   3028     {
   3029       if (dump_file && (dump_flags & TDF_DETAILS))
   3030 	fprintf (dump_file,
   3031 		 "Loop %d not distributed: too many memory references.\n",
   3032 		 loop->num);
   3033 
   3034       free_rdg (rdg);
   3035       loop_nest.release ();
   3036       free_data_refs (datarefs_vec);
   3037       delete ddrs_table;
   3038       return 0;
   3039     }
   3040 
   3041   data_reference_p dref;
   3042   for (i = 0; datarefs_vec.iterate (i, &dref); ++i)
   3043     dref->aux = (void *) (uintptr_t) i;
   3044 
   3045   if (dump_file && (dump_flags & TDF_DETAILS))
   3046     dump_rdg (dump_file, rdg);
   3047 
   3048   auto_vec<struct partition *, 3> partitions;
   3049   rdg_build_partitions (rdg, stmts, &partitions);
   3050 
   3051   auto_vec<ddr_p> alias_ddrs;
   3052 
   3053   auto_bitmap stmt_in_all_partitions;
   3054   bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts);
   3055   for (i = 1; partitions.iterate (i, &partition); ++i)
   3056     bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts);
   3057 
   3058   bool any_builtin = false;
   3059   bool reduction_in_all = false;
   3060   FOR_EACH_VEC_ELT (partitions, i, partition)
   3061     {
   3062       reduction_in_all
   3063 	|= classify_partition (loop, rdg, partition, stmt_in_all_partitions);
   3064       any_builtin |= partition_builtin_p (partition);
   3065     }
   3066 
   3067   /* If we are only distributing patterns but did not detect any,
   3068      simply bail out.  */
   3069   if (only_patterns_p
   3070       && !any_builtin)
   3071     {
   3072       nbp = 0;
   3073       goto ldist_done;
   3074     }
   3075 
   3076   /* If we are only distributing patterns, fuse all partitions that
   3077      were not classified as builtins.  This also avoids chopping
   3078      a loop into pieces, separated by builtin calls.  That is, we
   3079      want either no loop body or a single one remaining.  */
   3080   struct partition *into;
   3081   if (only_patterns_p)
   3082     {
   3083       for (i = 0; partitions.iterate (i, &into); ++i)
   3084 	if (!partition_builtin_p (into))
   3085 	  break;
   3086       for (++i; partitions.iterate (i, &partition); ++i)
   3087 	if (!partition_builtin_p (partition))
   3088 	  {
   3089 	    partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN);
   3090 	    partitions.unordered_remove (i);
   3091 	    partition_free (partition);
   3092 	    i--;
   3093 	  }
   3094     }
   3095 
   3096   /* Due to limitations in the transform phase we have to fuse all
   3097      reduction partitions into the last partition so the existing
   3098      loop will contain all loop-closed PHI nodes.  */
   3099   for (i = 0; partitions.iterate (i, &into); ++i)
   3100     if (partition_reduction_p (into))
   3101       break;
   3102   for (i = i + 1; partitions.iterate (i, &partition); ++i)
   3103     if (partition_reduction_p (partition))
   3104       {
   3105 	partition_merge_into (rdg, into, partition, FUSE_REDUCTION);
   3106 	partitions.unordered_remove (i);
   3107 	partition_free (partition);
   3108 	i--;
   3109       }
   3110 
   3111   /* Apply our simple cost model - fuse partitions with similar
   3112      memory accesses.  */
   3113   for (i = 0; partitions.iterate (i, &into); ++i)
   3114     {
   3115       bool changed = false;
   3116       if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET)
   3117 	continue;
   3118       for (int j = i + 1;
   3119 	   partitions.iterate (j, &partition); ++j)
   3120 	{
   3121 	  if (share_memory_accesses (rdg, into, partition))
   3122 	    {
   3123 	      partition_merge_into (rdg, into, partition, FUSE_SHARE_REF);
   3124 	      partitions.unordered_remove (j);
   3125 	      partition_free (partition);
   3126 	      j--;
   3127 	      changed = true;
   3128 	    }
   3129 	}
   3130       /* Suppose 0 and 2 have similar accesses but 0 and 1 do not; step 1
   3131          then fuses "0 1 2" into "0,2 1".  If 1 and 2 also have similar
   3132	 accesses, the next iteration would fail to consider merging
   3133	 1 into "0,2".  So try again if we did any merging into 0.  */
   3134       if (changed)
   3135 	i--;
   3136     }
   3137 
   3138   /* Put a non-builtin partition last if we need to preserve a reduction.
   3139      ???  This is a workaround that makes sort_partitions_by_post_order do
   3140      the correct thing while in reality it should sort each component
   3141      separately and then put the component with a reduction or a non-builtin
   3142      last.  */
   3143   if (reduction_in_all
   3144       && partition_builtin_p (partitions.last()))
   3145     FOR_EACH_VEC_ELT (partitions, i, partition)
   3146       if (!partition_builtin_p (partition))
   3147 	{
   3148 	  partitions.unordered_remove (i);
   3149 	  partitions.quick_push (partition);
   3150 	  break;
   3151 	}
   3152 
   3153   /* Build the partition dependency graph and fuse partitions in strong
   3154      connected component.  */
   3155   if (partitions.length () > 1)
   3156     {
   3157       /* Don't support loop nest distribution under runtime alias check
   3158	 since it's not likely to enable many vectorization opportunities.
   3159	 Likewise if the loop has a data reference that may not be addressable,
   3160	 since the alias check needs to take and compare object addresses.  */
   3161       if (loop->inner || has_nonaddressable_dataref_p)
   3162 	merge_dep_scc_partitions (rdg, &partitions, false);
   3163       else
   3164 	{
   3165 	  merge_dep_scc_partitions (rdg, &partitions, true);
   3166 	  if (partitions.length () > 1)
   3167 	    break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
   3168 	}
   3169     }
   3170 
   3171   finalize_partitions (loop, &partitions, &alias_ddrs);
   3172 
   3173   /* If there is a reduction in all partitions make sure the last one
   3174      is not classified for builtin code generation.  */
   3175   if (reduction_in_all)
   3176     {
   3177       partition = partitions.last ();
   3178       if (only_patterns_p
   3179 	  && partition_builtin_p (partition)
   3180 	  && !partition_builtin_p (partitions[0]))
   3181 	{
   3182 	  nbp = 0;
   3183 	  goto ldist_done;
   3184 	}
   3185       partition->kind = PKIND_NORMAL;
   3186     }
   3187 
   3188   nbp = partitions.length ();
   3189   if (nbp == 0
   3190       || (nbp == 1 && !partition_builtin_p (partitions[0]))
   3191       || (nbp > 1 && partition_contains_all_rw (rdg, partitions)))
   3192     {
   3193       nbp = 0;
   3194       goto ldist_done;
   3195     }
   3196 
   3197   if (version_for_distribution_p (&partitions, &alias_ddrs))
   3198     version_loop_by_alias_check (&partitions, loop, &alias_ddrs);
   3199 
   3200   if (dump_file && (dump_flags & TDF_DETAILS))
   3201     {
   3202       fprintf (dump_file,
   3203 	       "distribute loop <%d> into partitions:\n", loop->num);
   3204       dump_rdg_partitions (dump_file, partitions);
   3205     }
   3206 
   3207   FOR_EACH_VEC_ELT (partitions, i, partition)
   3208     {
   3209       if (partition_builtin_p (partition))
   3210 	(*nb_calls)++;
   3211       *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1);
   3212     }
   3213 
   3214  ldist_done:
   3215   loop_nest.release ();
   3216   free_data_refs (datarefs_vec);
   3217   for (hash_table<ddr_hasher>::iterator iter = ddrs_table->begin ();
   3218        iter != ddrs_table->end (); ++iter)
   3219     {
   3220       free_dependence_relation (*iter);
   3221       *iter = NULL;
   3222     }
   3223   delete ddrs_table;
   3224 
   3225   FOR_EACH_VEC_ELT (partitions, i, partition)
   3226     partition_free (partition);
   3227 
   3228   free_rdg (rdg);
   3229   return nbp - *nb_calls;
   3230 }
   3231 
   3232 
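        /* Compute a topological order (reverse post order) for the basic
           blocks of the function and record each block's position in
           bb_top_order_index, so that statements of a loop can be visited
           in a consistent lexicographical order.  */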
   3233 void loop_distribution::bb_top_order_init (void)
   3234 {
   3235   int rpo_num;
   3236   int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (cfun) - NUM_FIXED_BLOCKS);
   3237   edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
   3238   bitmap exit_bbs = BITMAP_ALLOC (NULL);
   3239 
   3240   bb_top_order_index = XNEWVEC (int, last_basic_block_for_fn (cfun));
   3241   bb_top_order_index_size = last_basic_block_for_fn (cfun);
   3242 
   3243   entry->flags &= ~EDGE_DFS_BACK;
   3244   bitmap_set_bit (exit_bbs, EXIT_BLOCK);
   3245   rpo_num = rev_post_order_and_mark_dfs_back_seme (cfun, entry, exit_bbs, true,
   3246 						   rpo, NULL);
   3247   BITMAP_FREE (exit_bbs);
   3248 
   3249   for (int i = 0; i < rpo_num; i++)
   3250     bb_top_order_index[rpo[i]] = i;
   3251 
   3252   free (rpo);
   3253 }
   3254 
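        /* Release the mapping computed by bb_top_order_init.  */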
   3255 void loop_distribution::bb_top_order_destroy ()
   3256 {
   3257   free (bb_top_order_index);
   3258   bb_top_order_index = NULL;
   3259   bb_top_order_index_size = 0;
   3260 }
   3261 
   3262 
   3263 /* Given LOOP, this function records seed statements for distribution in
   3264    WORK_LIST.  Return false if there is nothing for distribution.  */
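
        /* As an illustrative example, in

             for (i = 0; i < n; ++i)
               {
                 a[i] = b[i];
                 sum += c[i];
               }

           the store to a[i] is seeded because it writes memory, and the
           statement updating sum is seeded if sum is used after the loop,
           because it then defines a value live outside the loop.  */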
   3265 
   3266 static bool
   3267 find_seed_stmts_for_distribution (class loop *loop, vec<gimple *> *work_list)
   3268 {
   3269   basic_block *bbs = get_loop_body_in_dom_order (loop);
   3270 
   3271   /* Initialize the worklist with stmts we seed the partitions with.  */
   3272   for (unsigned i = 0; i < loop->num_nodes; ++i)
   3273     {
   3274       /* In irreducible sub-regions we don't know how to redirect
   3275 	 conditions, so fail.  See PR100492.  */
   3276       if (bbs[i]->flags & BB_IRREDUCIBLE_LOOP)
   3277 	{
   3278 	  if (dump_file && (dump_flags & TDF_DETAILS))
   3279 	    fprintf (dump_file, "loop %d contains an irreducible region.\n",
   3280 		     loop->num);
   3281 	  work_list->truncate (0);
   3282 	  break;
   3283 	}
   3284       for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
   3285 	   !gsi_end_p (gsi); gsi_next (&gsi))
   3286 	{
   3287 	  gphi *phi = gsi.phi ();
   3288 	  if (virtual_operand_p (gimple_phi_result (phi)))
   3289 	    continue;
   3290 	  /* Distribute stmts which have defs that are used outside of
   3291 	     the loop.  */
   3292 	  if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
   3293 	    continue;
   3294 	  work_list->safe_push (phi);
   3295 	}
   3296       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
   3297 	   !gsi_end_p (gsi); gsi_next (&gsi))
   3298 	{
   3299 	  gimple *stmt = gsi_stmt (gsi);
   3300 
   3301 	  /* Ignore clobbers, they do not have true side effects.  */
   3302 	  if (gimple_clobber_p (stmt))
   3303 	    continue;
   3304 
   3305 	  /* If there is a stmt with side-effects bail out - we
   3306 	     cannot and should not distribute this loop.  */
   3307 	  if (gimple_has_side_effects (stmt))
   3308 	    {
   3309 	      free (bbs);
   3310 	      return false;
   3311 	    }
   3312 
   3313 	  /* Distribute stmts which have defs that are used outside of
   3314 	     the loop.  */
   3315 	  if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
   3316 	    ;
   3317 	  /* Otherwise only distribute stores for now.  */
   3318 	  else if (!gimple_vdef (stmt))
   3319 	    continue;
   3320 
   3321 	  work_list->safe_push (stmt);
   3322 	}
   3323     }
   3324   bool res = work_list->length () > 0;
   3325   if (res && !can_copy_bbs_p (bbs, loop->num_nodes))
   3326     {
   3327       if (dump_file && (dump_flags & TDF_DETAILS))
   3328 	fprintf (dump_file, "cannot copy loop %d.\n", loop->num);
   3329       res = false;
   3330     }
   3331   free (bbs);
   3332   return res;
   3333 }
   3334 
   3335 /* A helper for the generate_{rawmemchr,strlen}_builtin functions.  Places
   3336    the new statements SEQ before LOOP and replaces the old reduction
   3337    variable REDUCTION_VAR_OLD with the new one REDUCTION_VAR_NEW.  */
   3338 
   3339 static void
   3340 generate_reduction_builtin_1 (loop_p loop, gimple_seq &seq,
   3341 			      tree reduction_var_old, tree reduction_var_new,
   3342 			      const char *info, machine_mode load_mode)
   3343 {
   3344   gcc_assert (flag_tree_loop_distribute_patterns);
   3345 
   3346   /* Place new statements before LOOP.  */
   3347   gimple_stmt_iterator gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
   3348   gsi_insert_seq_after (&gsi, seq, GSI_CONTINUE_LINKING);
   3349 
   3350   /* Replace old reduction variable with new one.  */
   3351   imm_use_iterator iter;
   3352   gimple *stmt;
   3353   use_operand_p use_p;
   3354   FOR_EACH_IMM_USE_STMT (stmt, iter, reduction_var_old)
   3355     {
   3356       FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
   3357 	SET_USE (use_p, reduction_var_new);
   3358 
   3359       update_stmt (stmt);
   3360     }
   3361 
   3362   if (dump_file && (dump_flags & TDF_DETAILS))
   3363     fprintf (dump_file, info, GET_MODE_NAME (load_mode));
   3364 }
   3365 
   3366 /* Generate a call to rawmemchr and place it before LOOP.  REDUCTION_VAR is
   3367    replaced with a fresh SSA name representing the result of the call.  */
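
        /* An illustrative sketch of the result (the SSA names are invented):
           for a loop scanning a string for the byte value 42, the preheader
           gains

             _5 = .RAWMEMCHR (mem_2, 42);

           and every use of the old reduction variable is rewritten to _5.  */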
   3368 
   3369 static void
   3370 generate_rawmemchr_builtin (loop_p loop, tree reduction_var,
   3371 			    data_reference_p store_dr, tree base, tree pattern,
   3372 			    location_t loc)
   3373 {
   3374   gimple_seq seq = NULL;
   3375 
   3376   tree mem = force_gimple_operand (base, &seq, true, NULL_TREE);
   3377   gimple *fn_call = gimple_build_call_internal (IFN_RAWMEMCHR, 2, mem, pattern);
   3378   tree reduction_var_new = copy_ssa_name (reduction_var);
   3379   gimple_call_set_lhs (fn_call, reduction_var_new);
   3380   gimple_set_location (fn_call, loc);
   3381   gimple_seq_add_stmt (&seq, fn_call);
   3382 
   3383   if (store_dr)
   3384     {
   3385       gassign *g = gimple_build_assign (DR_REF (store_dr), reduction_var_new);
   3386       gimple_seq_add_stmt (&seq, g);
   3387     }
   3388 
   3389   generate_reduction_builtin_1 (loop, seq, reduction_var, reduction_var_new,
   3390 				"generated rawmemchr%s\n",
   3391 				TYPE_MODE (TREE_TYPE (TREE_TYPE (base))));
   3392 }
   3393 
   3394 /* Helper for generate_strlen_builtin and generate_strlen_builtin_using_rawmemchr.  */
   3395 
   3396 static void
   3397 generate_strlen_builtin_1 (loop_p loop, gimple_seq &seq,
   3398 			   tree reduction_var_old, tree reduction_var_new,
   3399 			   machine_mode mode, tree start_len)
   3400 {
   3401   /* REDUCTION_VAR_NEW has either size type or ptrdiff type and must be
   3402      converted if the types of the old and new reduction variables differ.  */
   3403   reduction_var_new = gimple_convert (&seq, TREE_TYPE (reduction_var_old),
   3404 				      reduction_var_new);
   3405 
   3406   /* Loops of the form `for (i=42; s[i]; ++i);` have an additional start
   3407      length.  */
   3408   if (!integer_zerop (start_len))
   3409     {
   3410       tree lhs = make_ssa_name (TREE_TYPE (reduction_var_new));
   3411       gimple *g = gimple_build_assign (lhs, PLUS_EXPR, reduction_var_new,
   3412 				       start_len);
   3413       gimple_seq_add_stmt (&seq, g);
   3414       reduction_var_new = lhs;
   3415     }
   3416 
   3417   generate_reduction_builtin_1 (loop, seq, reduction_var_old, reduction_var_new,
   3418 				"generated strlen%s\n", mode);
   3419 }
   3420 
   3421 /* Generate a call to strlen and place it before LOOP.  REDUCTION_VAR is
   3422    replaced with a fresh SSA name representing the result of the call.  */
   3423 
   3424 static void
   3425 generate_strlen_builtin (loop_p loop, tree reduction_var, tree base,
   3426 			 tree start_len, location_t loc)
   3427 {
   3428   gimple_seq seq = NULL;
   3429 
   3430   tree reduction_var_new = make_ssa_name (size_type_node);
   3431 
   3432   tree mem = force_gimple_operand (base, &seq, true, NULL_TREE);
   3433   tree fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_STRLEN));
   3434   gimple *fn_call = gimple_build_call (fn, 1, mem);
   3435   gimple_call_set_lhs (fn_call, reduction_var_new);
   3436   gimple_set_location (fn_call, loc);
   3437   gimple_seq_add_stmt (&seq, fn_call);
   3438 
   3439   generate_strlen_builtin_1 (loop, seq, reduction_var, reduction_var_new,
   3440 			     QImode, start_len);
   3441 }
   3442 
   3443 /* Generate code that mimics the behaviour of strlen, but this time over
   3444    an array of elements with a mode different from QI.  REDUCTION_VAR is
   3445    replaced with a fresh SSA name representing the result, i.e., the length.  */
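
        /* Over an array of uint16_t, for example, the emitted sequence is
           roughly (an illustrative sketch, SSA names invented):

             end_4 = .RAWMEMCHR (start_2, 0);
             diff_5 = end_4 - start_2;
             count_6 = diff_5 / 2;

           where 2 is sizeof (uint16_t), so count_6 is the number of elements
           scanned.  */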
   3446 
   3447 static void
   3448 generate_strlen_builtin_using_rawmemchr (loop_p loop, tree reduction_var,
   3449 					 tree base, tree load_type,
   3450 					 tree start_len, location_t loc)
   3451 {
   3452   gimple_seq seq = NULL;
   3453 
   3454   tree start = force_gimple_operand (base, &seq, true, NULL_TREE);
   3455   tree zero = build_zero_cst (load_type);
   3456   gimple *fn_call = gimple_build_call_internal (IFN_RAWMEMCHR, 2, start, zero);
   3457   tree end = make_ssa_name (TREE_TYPE (base));
   3458   gimple_call_set_lhs (fn_call, end);
   3459   gimple_set_location (fn_call, loc);
   3460   gimple_seq_add_stmt (&seq, fn_call);
   3461 
   3462   /* Determine the number of elements between START and END by
   3463      evaluating (END - START) / sizeof (*START).  */
   3464   tree diff = make_ssa_name (ptrdiff_type_node);
   3465   gimple *diff_stmt = gimple_build_assign (diff, POINTER_DIFF_EXPR, end, start);
   3466   gimple_seq_add_stmt (&seq, diff_stmt);
   3467   /* Let SIZE be the size of each character.  */
   3468   tree size = gimple_convert (&seq, ptrdiff_type_node,
   3469 			      TYPE_SIZE_UNIT (load_type));
   3470   tree count = make_ssa_name (ptrdiff_type_node);
   3471   gimple *count_stmt = gimple_build_assign (count, TRUNC_DIV_EXPR, diff, size);
   3472   gimple_seq_add_stmt (&seq, count_stmt);
   3473 
   3474   generate_strlen_builtin_1 (loop, seq, reduction_var, count,
   3475 			     TYPE_MODE (load_type),
   3476 			     start_len);
   3477 }
   3478 
   3479 /* Return true if we can count at least as many characters by taking pointer
   3480    difference as we can count via reduction_var without an overflow.  Thus
   3481    check whether 2^n < 2^(m-1) / s, where n = TYPE_PRECISION (reduction_var_type),
   3482    m = TYPE_PRECISION (ptrdiff_type_node), and s is the size of each character.  */
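        /* As a worked example, assuming a typical 64-bit target where
           ptrdiff_type_node has 64-bit precision: for a 32-bit reduction
           variable counting 4-byte characters, 2^32 < 2^63 / 4 = 2^61 holds,
           so the reduction variable overflows strictly before the pointer
           difference can.  */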
   3483 static bool
   3484 reduction_var_overflows_first (tree reduction_var_type, tree load_type)
   3485 {
   3486   widest_int n2 = wi::lshift (1, TYPE_PRECISION (reduction_var_type));
   3487   widest_int m2 = wi::lshift (1, TYPE_PRECISION (ptrdiff_type_node) - 1);
   3488   widest_int s = wi::to_widest (TYPE_SIZE_UNIT (load_type));
   3489   return wi::ltu_p (n2, wi::udiv_trunc (m2, s));
   3490 }
   3491 
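        /* Helper of determine_reduction_stmt.  Scan the PHIs and statements
           of LOOP's blocks BBS and return the single statement whose result
           is used outside of LOOP, or NULL if there is none or more than one,
           if the loop contains more than 16 non-debug statements, or if a
           statement has volatile operands.  */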
   3492 static gimple *
   3493 determine_reduction_stmt_1 (const loop_p loop, const basic_block *bbs)
   3494 {
   3495   gimple *reduction_stmt = NULL;
   3496 
   3497   for (unsigned i = 0, ninsns = 0; i < loop->num_nodes; ++i)
   3498     {
   3499       basic_block bb = bbs[i];
   3500 
   3501       for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
   3502 	   gsi_next (&bsi))
   3503 	{
   3504 	  gphi *phi = bsi.phi ();
   3505 	  if (virtual_operand_p (gimple_phi_result (phi)))
   3506 	    continue;
   3507 	  if (stmt_has_scalar_dependences_outside_loop (loop, phi))
   3508 	    {
   3509 	      if (reduction_stmt)
   3510 		return NULL;
   3511 	      reduction_stmt = phi;
   3512 	    }
   3513 	}
   3514 
   3515       for (gimple_stmt_iterator bsi = gsi_start_nondebug_bb (bb);
   3516 	   !gsi_end_p (bsi); gsi_next_nondebug (&bsi), ++ninsns)
   3517 	{
   3518 	  /* Bail out early for loops which are unlikely to match.  */
   3519 	  if (ninsns > 16)
   3520 	    return NULL;
   3521 	  gimple *stmt = gsi_stmt (bsi);
   3522 	  if (gimple_clobber_p (stmt))
   3523 	    continue;
   3524 	  if (gimple_code (stmt) == GIMPLE_LABEL)
   3525 	    continue;
   3526 	  if (gimple_has_volatile_ops (stmt))
   3527 	    return NULL;
   3528 	  if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
   3529 	    {
   3530 	      if (reduction_stmt)
   3531 		return NULL;
   3532 	      reduction_stmt = stmt;
   3533 	    }
   3534 	}
   3535     }
   3536 
   3537   return reduction_stmt;
   3538 }
   3539 
   3540 /* If LOOP has a single non-volatile reduction statement, then return a pointer
   3541    to it.  Otherwise return NULL.  */
   3542 static gimple *
   3543 determine_reduction_stmt (const loop_p loop)
   3544 {
   3545   basic_block *bbs = get_loop_body (loop);
   3546   gimple *reduction_stmt = determine_reduction_stmt_1 (loop, bbs);
   3547   XDELETEVEC (bbs);
   3548   return reduction_stmt;
   3549 }
   3550 
   3551 /* Transform loops which mimic the effects of builtins rawmemchr or strlen and
   3552    replace them accordingly.  For example, a loop of the form
   3553 
   3554      for (; *p != 42; ++p);
   3555 
   3556    is replaced by
   3557 
   3558      p = rawmemchr<MODE> (p, 42);
   3559 
   3560    under the assumption that rawmemchr is available for a particular MODE.
   3561    Another example is
   3562 
   3563      int i;
   3564      for (i = 42; s[i]; ++i);
   3565 
   3566    which is replaced by
   3567 
   3568      i = (int)strlen (&s[42]) + 42;
   3569 
   3570    for some character array S.  In case array S is not of type character array
   3571    we end up with
   3572 
   3573      i = (int)(rawmemchr<MODE> (&s[42], 0) - &s[42]) + 42;
   3574 
   3575    assuming that rawmemchr is available for a particular MODE.  */
   3576 
   3577 bool
   3578 loop_distribution::transform_reduction_loop (loop_p loop)
   3579 {
   3580   gimple *reduction_stmt;
   3581   data_reference_p load_dr = NULL, store_dr = NULL;
   3582 
   3583   edge e = single_exit (loop);
   3584   gcond *cond = safe_dyn_cast <gcond *> (last_stmt (e->src));
   3585   if (!cond)
   3586     return false;
   3587   /* Ensure the loop condition is an (in)equality test and the loop is exited
   3588      either when the inequality test fails or when the equality test succeeds.  */
   3589   if (!(e->flags & EDGE_FALSE_VALUE && gimple_cond_code (cond) == NE_EXPR)
   3590       && !(e->flags & EDGE_TRUE_VALUE && gimple_cond_code (cond) == EQ_EXPR))
   3591     return false;
   3592   /* A limitation of the current implementation is that we only support
   3593      constant patterns in (in)equality tests.  */
   3594   tree pattern = gimple_cond_rhs (cond);
   3595   if (TREE_CODE (pattern) != INTEGER_CST)
   3596     return false;
   3597 
   3598   reduction_stmt = determine_reduction_stmt (loop);
   3599 
   3600   /* A limitation of the current implementation is that we require a reduction
   3601      statement.  Therefore, loops without a reduction statement as in the
   3602      following are not recognized:
   3603      int *p;
   3604      void foo (void) { for (; *p; ++p); } */
   3605   if (reduction_stmt == NULL)
   3606     return false;
   3607 
   3608   /* Reduction variables are guaranteed to be SSA names.  */
   3609   tree reduction_var;
   3610   switch (gimple_code (reduction_stmt))
   3611     {
   3612     case GIMPLE_ASSIGN:
   3613     case GIMPLE_PHI:
   3614       reduction_var = gimple_get_lhs (reduction_stmt);
   3615       break;
   3616     default:
   3617       /* Bail out e.g. for GIMPLE_CALL.  */
   3618       return false;
   3619     }
   3620 
   3621   struct graph *rdg = build_rdg (loop, NULL);
   3622   if (rdg == NULL)
   3623     {
   3624       if (dump_file && (dump_flags & TDF_DETAILS))
   3625 	fprintf (dump_file,
   3626 		 "Loop %d not transformed: failed to build the RDG.\n",
   3627 		 loop->num);
   3628 
   3629       return false;
   3630     }
   3631   auto_bitmap partition_stmts;
   3632   bitmap_set_range (partition_stmts, 0, rdg->n_vertices);
   3633   find_single_drs (loop, rdg, partition_stmts, &store_dr, &load_dr);
   3634   free_rdg (rdg);
   3635 
   3636   /* Bail out if there is no single load.  */
   3637   if (load_dr == NULL)
   3638     return false;
   3639 
   3640   /* Reaching this point we have a loop with a single reduction variable,
   3641      a single load, and an optional single store.  */
   3642 
   3643   tree load_ref = DR_REF (load_dr);
   3644   tree load_type = TREE_TYPE (load_ref);
   3645   tree load_access_base = build_fold_addr_expr (load_ref);
   3646   tree load_access_size = TYPE_SIZE_UNIT (load_type);
   3647   affine_iv load_iv, reduction_iv;
   3648 
   3649   if (!INTEGRAL_TYPE_P (load_type)
   3650       || !type_has_mode_precision_p (load_type))
   3651     return false;
   3652 
   3653   /* We already ensured that the loop condition tests for (in)equality where the
   3654      rhs is a constant pattern.  Now ensure that the lhs is the result of the
   3655      load.  */
   3656   if (gimple_cond_lhs (cond) != gimple_assign_lhs (DR_STMT (load_dr)))
   3657     return false;
   3658 
   3659   /* Bail out if no affine induction variable with constant step can be
   3660      determined.  */
   3661   if (!simple_iv (loop, loop, load_access_base, &load_iv, false))
   3662     return false;
   3663 
   3664   /* Bail out if memory accesses are not consecutive or not growing.  */
   3665   if (!operand_equal_p (load_iv.step, load_access_size, 0))
   3666     return false;
   3667 
   3668   if (!simple_iv (loop, loop, reduction_var, &reduction_iv, false))
   3669     return false;
   3670 
   3671   /* Handle rawmemchr like loops.  */
   3672   if (operand_equal_p (load_iv.base, reduction_iv.base)
   3673       && operand_equal_p (load_iv.step, reduction_iv.step))
   3674     {
   3675       if (store_dr)
   3676 	{
   3677 	  /* Ensure that we store to X and load from X + 1.  */
   3678 	  if (TREE_CODE (load_iv.base) != POINTER_PLUS_EXPR
   3679 	      || !integer_onep (TREE_OPERAND (load_iv.base, 1)))
   3680 	    return false;
   3681 	  tree ptr_base = TREE_OPERAND (load_iv.base, 0);
   3682 	  if (TREE_CODE (ptr_base) != SSA_NAME)
   3683 	    return false;
   3684 	  gimple *def = SSA_NAME_DEF_STMT (ptr_base);
   3685 	  if (!gimple_assign_single_p (def)
   3686 	      || gimple_assign_rhs1 (def) != DR_REF (store_dr))
   3687 	    return false;
   3688 	  /* Ensure that the reduction value is stored.  */
   3689 	  if (gimple_assign_rhs1 (DR_STMT (store_dr)) != reduction_var)
   3690 	    return false;
   3691 	}
   3692       /* Bail out if the target does not provide rawmemchr for the load mode.  */
   3693       machine_mode mode = TYPE_MODE (load_type);
   3694       if (direct_optab_handler (rawmemchr_optab, mode) == CODE_FOR_nothing)
   3695 	return false;
   3696       location_t loc = gimple_location (DR_STMT (load_dr));
   3697       generate_rawmemchr_builtin (loop, reduction_var, store_dr, load_iv.base,
   3698 				  pattern, loc);
   3699       return true;
   3700     }
   3701 
   3702   /* Handle strlen like loops.  */
   3703   if (store_dr == NULL
   3704       && integer_zerop (pattern)
   3705       && INTEGRAL_TYPE_P (TREE_TYPE (reduction_var))
   3706       && TREE_CODE (reduction_iv.base) == INTEGER_CST
   3707       && TREE_CODE (reduction_iv.step) == INTEGER_CST
   3708       && integer_onep (reduction_iv.step))
   3709     {
   3710       location_t loc = gimple_location (DR_STMT (load_dr));
   3711       tree reduction_var_type = TREE_TYPE (reduction_var);
   3712       /* While determining the length of a string an overflow might occur.
   3713 	 If an overflow only occurs in the loop implementation and not in the
   3714 	 strlen implementation, then either the overflow is undefined or the
   3715 	 truncated result of strlen equals the one of the loop.  Otherwise if
   3716 	 an overflow may also occur in the strlen implementation, then
   3717 	 replacing a loop by a call to strlen is sound whenever we ensure that
   3718 	 if an overflow occurs in the strlen implementation, then also an
   3719 	 overflow occurs in the loop implementation which is undefined.  It
   3720 	 seems reasonable to relax this and assume that the strlen
   3721 	 implementation cannot overflow in case sizetype is big enough in the
   3722 	 sense that an overflow can only happen for string objects which are
   3723 	 bigger than half of the address space; at least for 32-bit targets and
   3724 	 up.
   3725 
   3726 	 For strlen which makes use of rawmemchr the maximal length of a string
   3727 	 which can be determined without an overflow is PTRDIFF_MAX / S where
   3728 	 each character has size S.  Since an overflow for ptrdiff type is
   3729 	 undefined we have to make sure that if an overflow occurs, then an
   3730 	 overflow occurs in the loop implementation, too, and this is
   3731 	 undefined, too.  As before, we relax this and assume that no
   3732 	 string object is larger than half of the address space; at least for
   3733 	 32-bit targets and up.  */
   3734       if (TYPE_MODE (load_type) == TYPE_MODE (char_type_node)
   3735 	  && TYPE_PRECISION (load_type) == TYPE_PRECISION (char_type_node)
   3736 	  && ((TYPE_PRECISION (sizetype) >= TYPE_PRECISION (ptr_type_node) - 1
   3737 	       && TYPE_PRECISION (ptr_type_node) >= 32)
   3738 	      || (TYPE_OVERFLOW_UNDEFINED (reduction_var_type)
   3739 		  && TYPE_PRECISION (reduction_var_type) <= TYPE_PRECISION (sizetype)))
   3740 	  && builtin_decl_implicit (BUILT_IN_STRLEN))
   3741 	generate_strlen_builtin (loop, reduction_var, load_iv.base,
   3742 				 reduction_iv.base, loc);
   3743       else if (direct_optab_handler (rawmemchr_optab, TYPE_MODE (load_type))
   3744 	       != CODE_FOR_nothing
   3745 	       && ((TYPE_PRECISION (ptrdiff_type_node) == TYPE_PRECISION (ptr_type_node)
   3746 		    && TYPE_PRECISION (ptrdiff_type_node) >= 32)
   3747 		   || (TYPE_OVERFLOW_UNDEFINED (reduction_var_type)
   3748 		       && reduction_var_overflows_first (reduction_var_type, load_type))))
   3749 	generate_strlen_builtin_using_rawmemchr (loop, reduction_var,
   3750 						 load_iv.base,
   3751 						 load_type,
   3752 						 reduction_iv.base, loc);
   3753       else
   3754 	return false;
   3755       return true;
   3756     }
   3757 
   3758   return false;
   3759 }
   3760 
   3761 /* Given innermost LOOP, return the outermost enclosing loop that forms a
   3762    perfect loop nest.  */
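
        /* For example (illustrative), given the innermost J loop of

             for (i = 0; i < n; ++i)
               for (j = 0; j < m; ++j)
                 a[i][j] = 0;

           this returns the I loop, provided the outer loop has a single exit
           and a latch execution count computable at compile time.  */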
   3763 
   3764 static class loop *
   3765 prepare_perfect_loop_nest (class loop *loop)
   3766 {
   3767   class loop *outer = loop_outer (loop);
   3768   tree niters = number_of_latch_executions (loop);
   3769 
   3770   /* TODO: We only support the innermost 3-level loop nest distribution
   3771      because of compilation time issues for now.  This should be relaxed
   3772      in the future.  Note we only allow 3-level loop nest distribution
   3773      when parallelizing loops.  */
   3774   while ((loop->inner == NULL
   3775 	  || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1))
   3776 	 && loop_outer (outer)
   3777 	 && outer->inner == loop && loop->next == NULL
   3778 	 && single_exit (outer)
   3779 	 && !chrec_contains_symbols_defined_in_loop (niters, outer->num)
   3780 	 && (niters = number_of_latch_executions (outer)) != NULL_TREE
   3781 	 && niters != chrec_dont_know)
   3782     {
   3783       loop = outer;
   3784       outer = loop_outer (loop);
   3785     }
   3786 
   3787   return loop;
   3788 }
   3789 
   3790 
   3791 unsigned int
   3792 loop_distribution::execute (function *fun)
   3793 {
   3794   bool changed = false;
   3795   basic_block bb;
   3796   control_dependences *cd = NULL;
   3797   auto_vec<loop_p> loops_to_be_destroyed;
   3798 
   3799   if (number_of_loops (fun) <= 1)
   3800     return 0;
   3801 
   3802   bb_top_order_init ();
   3803 
   3804   FOR_ALL_BB_FN (bb, fun)
   3805     {
   3806       gimple_stmt_iterator gsi;
   3807       for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
   3808 	gimple_set_uid (gsi_stmt (gsi), -1);
   3809       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
   3810 	gimple_set_uid (gsi_stmt (gsi), -1);
   3811     }
   3812 
   3813   /* We can at the moment only distribute non-nested loops, thus restrict
   3814      walking to innermost loops.  */
   3815   for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST))
   3816     {
   3817       /* Don't distribute loops with multiple exit edges, or cold loops
   3818          when not doing pattern detection.  */
   3819       if (!single_exit (loop)
   3820 	  || (!flag_tree_loop_distribute_patterns
   3821 	      && !optimize_loop_for_speed_p (loop)))
   3822 	continue;
   3823 
   3824       /* If niters is unknown don't distribute loop but rather try to transform
   3825 	 it to a call to a builtin.  */
   3826       tree niters = number_of_latch_executions (loop);
   3827       if (niters == NULL_TREE || niters == chrec_dont_know)
   3828 	{
   3829 	  datarefs_vec.create (20);
   3830 	  if (flag_tree_loop_distribute_patterns
   3831 	      && transform_reduction_loop (loop))
   3832 	    {
   3833 	      changed = true;
   3834 	      loops_to_be_destroyed.safe_push (loop);
   3835 	      if (dump_enabled_p ())
   3836 		{
   3837 		  dump_user_location_t loc = find_loop_location (loop);
   3838 		  dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
   3839 				   loc, "Loop %d transformed into a builtin.\n",
   3840 				   loop->num);
   3841 		}
   3842 	    }
   3843 	  free_data_refs (datarefs_vec);
   3844 	  continue;
   3845 	}
   3846 
   3847       /* Get the perfect loop nest for distribution.  */
   3848       loop = prepare_perfect_loop_nest (loop);
   3849       for (; loop; loop = loop->inner)
   3850 	{
   3851 	  auto_vec<gimple *> work_list;
   3852 	  if (!find_seed_stmts_for_distribution (loop, &work_list))
   3853 	    break;
   3854 
   3855 	  const char *str = loop->inner ? " nest" : "";
   3856 	  dump_user_location_t loc = find_loop_location (loop);
   3857 	  if (!cd)
   3858 	    {
   3859 	      calculate_dominance_info (CDI_DOMINATORS);
   3860 	      calculate_dominance_info (CDI_POST_DOMINATORS);
   3861 	      cd = new control_dependences ();
   3862 	      free_dominance_info (CDI_POST_DOMINATORS);
   3863 	    }
   3864 
   3865 	  bool destroy_p;
   3866 	  int nb_generated_loops, nb_generated_calls;
   3867 	  nb_generated_loops
   3868 	    = distribute_loop (loop, work_list, cd, &nb_generated_calls,
   3869 			       &destroy_p, (!optimize_loop_for_speed_p (loop)
   3870 					    || !flag_tree_loop_distribution));
   3871 	  if (destroy_p)
   3872 	    loops_to_be_destroyed.safe_push (loop);
   3873 
   3874 	  if (nb_generated_loops + nb_generated_calls > 0)
   3875 	    {
   3876 	      changed = true;
   3877 	      if (dump_enabled_p ())
   3878 		dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
   3879 				 loc, "Loop%s %d distributed: split to %d loops "
   3880 				 "and %d library calls.\n", str, loop->num,
   3881 				 nb_generated_loops, nb_generated_calls);
   3882 
   3883 	      break;
   3884 	    }
   3885 
   3886 	  if (dump_file && (dump_flags & TDF_DETAILS))
   3887 	    fprintf (dump_file, "Loop%s %d not distributed.\n", str, loop->num);
   3888 	}
   3889     }
   3890 
   3891   if (cd)
   3892     delete cd;
   3893 
   3894   if (bb_top_order_index != NULL)
   3895     bb_top_order_destroy ();
   3896 
   3897   if (changed)
   3898     {
   3899       /* Destroy loop bodies that could not be reused.  Do this late as we
   3900 	 otherwise can end up referring to stale data in control dependences.  */
   3901       unsigned i;
   3902       class loop *loop;
   3903       FOR_EACH_VEC_ELT (loops_to_be_destroyed, i, loop)
   3904 	destroy_loop (loop);
   3905 
   3906       /* Cached scalar evolutions now may refer to wrong or non-existing
   3907 	 loops.  */
   3908       scev_reset ();
   3909       mark_virtual_operands_for_renaming (fun);
   3910       rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
   3911     }
   3912 
   3913   checking_verify_loop_structure ();
   3914 
   3915   return changed ? TODO_cleanup_cfg : 0;
   3916 }
   3917 
   3918 
   3919 /* Distribute all loops in the current function.  */
   3920 
   3921 namespace {
   3922 
   3923 const pass_data pass_data_loop_distribution =
   3924 {
   3925   GIMPLE_PASS, /* type */
   3926   "ldist", /* name */
   3927   OPTGROUP_LOOP, /* optinfo_flags */
   3928   TV_TREE_LOOP_DISTRIBUTION, /* tv_id */
   3929   ( PROP_cfg | PROP_ssa ), /* properties_required */
   3930   0, /* properties_provided */
   3931   0, /* properties_destroyed */
   3932   0, /* todo_flags_start */
   3933   0, /* todo_flags_finish */
   3934 };
   3935 
   3936 class pass_loop_distribution : public gimple_opt_pass
   3937 {
   3938 public:
   3939   pass_loop_distribution (gcc::context *ctxt)
   3940     : gimple_opt_pass (pass_data_loop_distribution, ctxt)
   3941   {}
   3942 
   3943   /* opt_pass methods: */
   3944   virtual bool gate (function *)
   3945     {
   3946       return flag_tree_loop_distribution
   3947 	|| flag_tree_loop_distribute_patterns;
   3948     }
   3949 
   3950   virtual unsigned int execute (function *);
   3951 
   3952 }; // class pass_loop_distribution
   3953 
   3954 unsigned int
   3955 pass_loop_distribution::execute (function *fun)
   3956 {
   3957   return loop_distribution ().execute (fun);
   3958 }
   3959 
   3960 } // anon namespace
   3961 
   3962 gimple_opt_pass *
   3963 make_pass_loop_distribution (gcc::context *ctxt)
   3964 {
   3965   return new pass_loop_distribution (ctxt);
   3966 }
   3967