Home | History | Annotate | Line # | Download | only in gcc
omp-oacc-neuter-broadcast.cc revision 1.1.1.1
      1 /* OpenACC worker partitioning via middle end neutering/broadcasting scheme
      2 
      3    Copyright (C) 2015-2022 Free Software Foundation, Inc.
      4 
      5    This file is part of GCC.
      6 
      7    GCC is free software; you can redistribute it and/or modify it
      8    under the terms of the GNU General Public License as published
      9    by the Free Software Foundation; either version 3, or (at your
     10    option) any later version.
     11 
     12    GCC is distributed in the hope that it will be useful, but WITHOUT
     13    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     14    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
     15    License for more details.
     16 
     17    You should have received a copy of the GNU General Public License
     18    along with GCC; see the file COPYING3.  If not see
     19    <http://www.gnu.org/licenses/>.  */
     20 
     21 #include "config.h"
     22 #include "system.h"
     23 #include "coretypes.h"
     24 #include "backend.h"
     25 #include "rtl.h"
     26 #include "tree.h"
     27 #include "gimple.h"
     28 #include "tree-pass.h"
     29 #include "ssa.h"
     30 #include "cgraph.h"
     31 #include "pretty-print.h"
     32 #include "fold-const.h"
     33 #include "gimplify.h"
     34 #include "gimple-iterator.h"
     35 #include "gimple-walk.h"
     36 #include "tree-inline.h"
     37 #include "langhooks.h"
     38 #include "omp-general.h"
     39 #include "omp-low.h"
     40 #include "gimple-pretty-print.h"
     41 #include "cfghooks.h"
     42 #include "insn-config.h"
     43 #include "recog.h"
     44 #include "internal-fn.h"
     45 #include "bitmap.h"
     46 #include "tree-nested.h"
     47 #include "stor-layout.h"
     48 #include "tree-ssa-threadupdate.h"
     49 #include "tree-into-ssa.h"
     50 #include "splay-tree.h"
     51 #include "target.h"
     52 #include "cfgloop.h"
     53 #include "tree-cfg.h"
     54 #include "omp-offload.h"
     55 #include "attribs.h"
     56 #include "targhooks.h"
     57 #include "diagnostic-core.h"
     58 
/* Loop structure of the function.  The entire function is described as
   a NULL loop.  */
/* Adapted from 'gcc/config/nvptx/nvptx.cc:struct parallel'.  */

struct parallel_g
{
  /* Parent parallel.  */
  parallel_g *parent;

  /* Next sibling parallel.  */
  parallel_g *next;

  /* First child parallel.  */
  parallel_g *inner;

  /* Partitioning mask of the parallel.  */
  unsigned mask;

  /* Partitioning used within inner parallels. */
  unsigned inner_mask;

  /* Location of parallel forked and join.  The forked is the first
     block in the parallel and the join is the first block after of
     the partition.  */
  basic_block forked_block;
  basic_block join_block;

  /* Statements marking the start/end of this partition, recorded while
     discovering parallels in omp_sese_find_par (may stay NULL).  */
  gimple *forked_stmt;
  gimple *join_stmt;

  /* The fork placeholder stmt and the stmt joining the partition, if
     any (also recorded by omp_sese_find_par).  */
  gimple *fork_stmt;
  gimple *joining_stmt;

  /* Basic blocks in this parallel, but not in child parallels.  The
     FORKED and JOINING blocks are in the partition.  The FORK and JOIN
     blocks are not.  */
  auto_vec<basic_block> blocks;

  /* Propagation record type and the decls used for transmission and
     reception of broadcast values; NULL_TREE until assigned (see
     build_sender_ref/build_receiver_ref).  */
  tree record_type;
  tree sender_decl;
  tree receiver_decl;

public:
  parallel_g (parallel_g *parent, unsigned mode);
  ~parallel_g ();
};
    105 
    106 /* Constructor links the new parallel into it's parent's chain of
    107    children.  */
    108 
    109 parallel_g::parallel_g (parallel_g *parent_, unsigned mask_)
    110   :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
    111 {
    112   forked_block = join_block = 0;
    113   forked_stmt = join_stmt = NULL;
    114   fork_stmt = joining_stmt = NULL;
    115 
    116   record_type = NULL_TREE;
    117   sender_decl = NULL_TREE;
    118   receiver_decl = NULL_TREE;
    119 
    120   if (parent)
    121     {
    122       next = parent->inner;
    123       parent->inner = this;
    124     }
    125 }
    126 
/* Destructor.  Deleting INNER and NEXT recursively tears down the whole
   tree of child and sibling parallels.  */

parallel_g::~parallel_g ()
{
  delete inner;
  delete next;
}
    132 
    133 static bool
    134 local_var_based_p (tree decl)
    135 {
    136   switch (TREE_CODE (decl))
    137     {
    138     case VAR_DECL:
    139       return !is_global_var (decl);
    140 
    141     case COMPONENT_REF:
    142     case BIT_FIELD_REF:
    143     case ARRAY_REF:
    144       return local_var_based_p (TREE_OPERAND (decl, 0));
    145 
    146     default:
    147       return false;
    148     }
    149 }
    150 
/* Map of basic blocks to gimple stmts.  Populated by
   omp_sese_split_blocks with the head/tail marker stmt that determines
   each split block's partitioning mode.  */
typedef hash_map<basic_block, gimple *> bb_stmt_map_t;
    153 
    154 /* Calls to OpenACC routines are made by all workers/wavefronts/warps, since
    155    the routine likely contains partitioned loops (else will do its own
    156    neutering and variable propagation). Return TRUE if a function call CALL
    157    should be made in (worker) single mode instead, rather than redundant
    158    mode.  */
    159 
    160 static bool
    161 omp_sese_active_worker_call (gcall *call)
    162 {
    163 #define GOMP_DIM_SEQ GOMP_DIM_MAX
    164   tree fndecl = gimple_call_fndecl (call);
    165 
    166   if (!fndecl)
    167     return true;
    168 
    169   tree attrs = oacc_get_fn_attrib (fndecl);
    170 
    171   if (!attrs)
    172     return true;
    173 
    174   int level = oacc_fn_attrib_level (attrs);
    175 
    176   /* Neither regular functions nor "seq" routines should be run by all threads
    177      in worker-single mode.  */
    178   return level == -1 || level == GOMP_DIM_SEQ;
    179 #undef GOMP_DIM_SEQ
    180 }
    181 
/* Split basic blocks such that each forked and join unspecs are at
   the start of their basic blocks.  Thus afterwards each block will
   have a single partitioning mode.  We also do the same for return
   insns, as they are executed by every thread.  Return the
   partitioning mode of the function as a whole.  Populate MAP with
   head and tail blocks.  We also clear the BB visited flag, which is
   used when finding partitions.  */
/* Adapted from 'gcc/config/nvptx/nvptx.cc:nvptx_split_blocks'.  */

static void
omp_sese_split_blocks (bb_stmt_map_t *map)
{
  auto_vec<gimple *> worklist;
  basic_block block;

  /* Locate all the reorg instructions of interest.  */
  FOR_ALL_BB_FN (block, cfun)
    {
      /* Clear visited flag, for use by parallel locator  */
      block->flags &= ~BB_VISITED;

      for (gimple_stmt_iterator gsi = gsi_start_bb (block);
	   !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);

	  if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	    {
	      enum ifn_unique_kind k = ((enum ifn_unique_kind)
		TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));

	      if (k == IFN_UNIQUE_OACC_JOIN)
		worklist.safe_push (stmt);
	      else if (k == IFN_UNIQUE_OACC_FORK)
		{
		  /* The fork marker must be the last stmt of its block;
		     the "forked" region begins in the single successor.  */
		  gcc_assert (gsi_one_before_end_p (gsi));
		  basic_block forked_block = single_succ (block);
		  gimple_stmt_iterator gsi2 = gsi_start_bb (forked_block);

		  /* We push a NOP as a placeholder for the "forked" stmt.
		     This is then recognized in omp_sese_find_par.  */
		  gimple *nop = gimple_build_nop ();
		  gsi_insert_before (&gsi2, nop, GSI_SAME_STMT);

		  worklist.safe_push (nop);
		}
	    }
	  else if (gimple_code (stmt) == GIMPLE_RETURN
		   || gimple_code (stmt) == GIMPLE_COND
		   || gimple_code (stmt) == GIMPLE_SWITCH
		   || (gimple_code (stmt) == GIMPLE_CALL
		       && !gimple_call_internal_p (stmt)
		       && !omp_sese_active_worker_call (as_a <gcall *> (stmt))))
	    /* Control-flow stmts and calls executed by every thread also
	       need to head their own block.  */
	    worklist.safe_push (stmt);
	  else if (is_gimple_assign (stmt))
	    {
	      tree lhs = gimple_assign_lhs (stmt);

	      /* Force assignments to components/fields/elements of local
		 aggregates into fully-partitioned (redundant) mode.  This
		 avoids having to broadcast the whole aggregate.  The RHS of
		 the assignment will be propagated using the normal
		 mechanism.  */

	      switch (TREE_CODE (lhs))
		{
		case COMPONENT_REF:
		case BIT_FIELD_REF:
		case ARRAY_REF:
		  {
		    tree aggr = TREE_OPERAND (lhs, 0);

		    if (local_var_based_p (aggr))
		      worklist.safe_push (stmt);
		  }
		  break;

		default:
		  ;
		}
	    }
	}
    }

  /* Split blocks on the worklist.  */
  unsigned ix;
  gimple *stmt;

  for (ix = 0; worklist.iterate (ix, &stmt); ix++)
    {
      basic_block block = gimple_bb (stmt);

      if (gimple_code (stmt) == GIMPLE_COND)
	{
	  /* Rewrite "if (a OP b)" as "pred = a OP b; if (pred != false)"
	     so the comparison can live in its own (splittable) block and
	     PRED can be broadcast.  */
	  gcond *orig_cond = as_a <gcond *> (stmt);
	  tree_code code = gimple_expr_code (orig_cond);
	  tree pred = make_ssa_name (boolean_type_node);
	  gimple *asgn = gimple_build_assign (pred, code,
			   gimple_cond_lhs (orig_cond),
			   gimple_cond_rhs (orig_cond));
	  gcond *new_cond
	    = gimple_build_cond (NE_EXPR, pred, boolean_false_node,
				 gimple_cond_true_label (orig_cond),
				 gimple_cond_false_label (orig_cond));

	  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
	  gsi_insert_before (&gsi, asgn, GSI_SAME_STMT);
	  gsi_replace (&gsi, new_cond, true);

	  edge e = split_block (block, asgn);
	  block = e->dest;
	  map->get_or_insert (block) = new_cond;
	}
      else if ((gimple_code (stmt) == GIMPLE_CALL
		&& !gimple_call_internal_p (stmt))
	       || is_gimple_assign (stmt))
	{
	  /* Isolate the call/assignment in a block of its own: split
	     both before and after it.  */
	  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
	  gsi_prev (&gsi);

	  edge call = split_block (block, gsi_stmt (gsi));

	  gimple *call_stmt = gsi_stmt (gsi_start_bb (call->dest));

	  edge call_to_ret = split_block (call->dest, call_stmt);

	  map->get_or_insert (call_to_ret->src) = call_stmt;
	}
      else
	{
	  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
	  gsi_prev (&gsi);

	  if (gsi_end_p (gsi))
	    /* Stmt is already first in its block: no split needed.  */
	    map->get_or_insert (block) = stmt;
	  else
	    {
	      /* Split block before insn. The insn is in the new block.  */
	      edge e = split_block (block, gsi_stmt (gsi));

	      block = e->dest;
	      map->get_or_insert (block) = stmt;
	    }
	}
    }
}
    329 
/* Return a human-readable name for partitioning MASK (a bitmask of
   gang/worker/vector bits), for use in dump output.  */

static const char *
mask_name (unsigned mask)
{
  static const char *const names[] = {
    "gang redundant",
    "gang partitioned",
    "worker partitioned",
    "gang+worker partitioned",
    "vector partitioned",
    "gang+vector partitioned",
    "worker+vector partitioned",
    "fully partitioned"
  };

  if (mask >= sizeof (names) / sizeof (names[0]))
    return "<illegal>";
  return names[mask];
}
    346 
    347 /* Dump this parallel and all its inner parallels.  */
    348 /* Adapted from 'gcc/config/nvptx/nvptx.cc:nvptx_dump_pars'.  */
    349 
    350 static void
    351 omp_sese_dump_pars (parallel_g *par, unsigned depth)
    352 {
    353   fprintf (dump_file, "%u: mask %d (%s) head=%d, tail=%d\n",
    354 	   depth, par->mask, mask_name (par->mask),
    355 	   par->forked_block ? par->forked_block->index : -1,
    356 	   par->join_block ? par->join_block->index : -1);
    357 
    358   fprintf (dump_file, "    blocks:");
    359 
    360   basic_block block;
    361   for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
    362     fprintf (dump_file, " %d", block->index);
    363   fprintf (dump_file, "\n");
    364   if (par->inner)
    365     omp_sese_dump_pars (par->inner, depth + 1);
    366 
    367   if (par->next)
    368     omp_sese_dump_pars (par->next, depth);
    369 }
    370 
/* If BLOCK contains a fork/join marker, process it to create or
   terminate a loop structure.  Add this block to the current loop,
   and then walk successor blocks.   */
/* Adapted from 'gcc/config/nvptx/nvptx.cc:nvptx_find_par'.  */

static parallel_g *
omp_sese_find_par (bb_stmt_map_t *map, parallel_g *par, basic_block block)
{
  if (block->flags & BB_VISITED)
    return par;
  block->flags |= BB_VISITED;

  /* MAP holds the head/tail marker stmts recorded by
     omp_sese_split_blocks; dispatch on the kind of marker.  */
  if (gimple **stmtp = map->get (block))
    {
      gimple *stmt = *stmtp;

      if (gimple_code (stmt) == GIMPLE_COND
	  || gimple_code (stmt) == GIMPLE_SWITCH
	  || gimple_code (stmt) == GIMPLE_RETURN
	  || (gimple_code (stmt) == GIMPLE_CALL
	      && !gimple_call_internal_p (stmt))
	  || is_gimple_assign (stmt))
	{
	  /* A single block that is forced to be at the maximum partition
	     level.  Make a singleton par for it.  */
	  par = new parallel_g (par, GOMP_DIM_MASK (GOMP_DIM_GANG)
				   | GOMP_DIM_MASK (GOMP_DIM_WORKER)
				   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
	  par->forked_block = block;
	  par->forked_stmt = stmt;
	  par->blocks.safe_push (block);
	  /* Pop back to the enclosing parallel; the singleton only
	     contains this one block.  */
	  par = par->parent;
	  goto walk_successors;
	}
      else if (gimple_nop_p (stmt))
	{
	  /* A NOP placeholder pushed by omp_sese_split_blocks: the single
	     predecessor must end in an OACC_FORK marker, which opens a new
	     (child) parallel here.  */
	  basic_block pred = single_pred (block);
	  gcc_assert (pred);
	  gimple_stmt_iterator gsi = gsi_last_bb (pred);
	  gimple *final_stmt = gsi_stmt (gsi);

	  if (gimple_call_internal_p (final_stmt, IFN_UNIQUE))
	    {
	      gcall *call = as_a <gcall *> (final_stmt);
	      enum ifn_unique_kind k = ((enum ifn_unique_kind)
		TREE_INT_CST_LOW (gimple_call_arg (call, 0)));

	      if (k == IFN_UNIQUE_OACC_FORK)
		{
		  /* Arg 2 is the partitioned dimension (-1 if none).  */
		  HOST_WIDE_INT dim
		    = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
		  unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0;

		  par = new parallel_g (par, mask);
		  par->forked_block = block;
		  par->forked_stmt = final_stmt;
		  par->fork_stmt = stmt;
		}
	      else
		gcc_unreachable ();
	    }
	  else
	    gcc_unreachable ();
	}
      else if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	{
	  /* An OACC_JOIN marker closes the current parallel; its mask must
	     match the one recorded at the fork.  */
	  gcall *call = as_a <gcall *> (stmt);
	  enum ifn_unique_kind k = ((enum ifn_unique_kind)
	    TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
	  if (k == IFN_UNIQUE_OACC_JOIN)
	    {
	      HOST_WIDE_INT dim = TREE_INT_CST_LOW (gimple_call_arg (stmt, 2));
	      unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0;

	      gcc_assert (par->mask == mask);
	      par->join_block = block;
	      par->join_stmt = stmt;
	      par = par->parent;
	    }
	  else
	    gcc_unreachable ();
	}
      else
	gcc_unreachable ();
    }

  if (par)
    /* Add this block onto the current loop's list of blocks.  */
    par->blocks.safe_push (block);
  else
    /* This must be the entry block.  Create a NULL parallel.  */
    par = new parallel_g (0, 0);

walk_successors:
  /* Walk successor blocks.  */
  edge e;
  edge_iterator ei;

  FOR_EACH_EDGE (e, ei, block->succs)
    omp_sese_find_par (map, par, e->dest);

  return par;
}
    474 
    475 /* DFS walk the CFG looking for fork & join markers.  Construct
    476    loop structures as we go.  MAP is a mapping of basic blocks
    477    to head & tail markers, discovered when splitting blocks.  This
    478    speeds up the discovery.  We rely on the BB visited flag having
    479    been cleared when splitting blocks.  */
    480 /* Adapted from 'gcc/config/nvptx/nvptx.cc:nvptx_discover_pars'.  */
    481 
    482 static parallel_g *
    483 omp_sese_discover_pars (bb_stmt_map_t *map)
    484 {
    485   basic_block block;
    486 
    487   /* Mark exit blocks as visited.  */
    488   block = EXIT_BLOCK_PTR_FOR_FN (cfun);
    489   block->flags |= BB_VISITED;
    490 
    491   /* And entry block as not.  */
    492   block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
    493   block->flags &= ~BB_VISITED;
    494 
    495   parallel_g *par = omp_sese_find_par (map, 0, block);
    496 
    497   if (dump_file)
    498     {
    499       fprintf (dump_file, "\nLoops\n");
    500       omp_sese_dump_pars (par, 0);
    501       fprintf (dump_file, "\n");
    502     }
    503 
    504   return par;
    505 }
    506 
    507 static void
    508 populate_single_mode_bitmaps (parallel_g *par, bitmap worker_single,
    509 			      bitmap vector_single, unsigned outer_mask,
    510 			      int depth)
    511 {
    512   unsigned mask = outer_mask | par->mask;
    513 
    514   basic_block block;
    515 
    516   for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
    517     {
    518       if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
    519 	bitmap_set_bit (worker_single, block->index);
    520 
    521       if ((mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) == 0)
    522 	bitmap_set_bit (vector_single, block->index);
    523     }
    524 
    525   if (par->inner)
    526     populate_single_mode_bitmaps (par->inner, worker_single, vector_single,
    527 				  mask, depth + 1);
    528   if (par->next)
    529     populate_single_mode_bitmaps (par->next, worker_single, vector_single,
    530 				  outer_mask, depth);
    531 }
    532 
/* A map from SSA names or var decls to record fields, built by
   install_var_field.  */

typedef hash_map<tree, tree> field_map_t;

/* For each propagation record type, this is a map from SSA names or var decls
   to propagate, to the field in the record type that should be used for
   transmission and reception.  */

typedef hash_map<tree, field_map_t> record_field_map_t;
    542 
/* Create a FIELD_DECL in the propagation record RECORD_TYPE to carry VAR
   (an SSA name or a VAR_DECL) and record the VAR -> field mapping in
   FIELDS.  VAR must not already have a field.  */

static void
install_var_field (tree var, tree record_type, field_map_t *fields)
{
  tree name;
  char tmp[20];

  /* Derive a field name from VAR; fall back to a name based on the SSA
     version or DECL UID when VAR is anonymous.  */
  if (TREE_CODE (var) == SSA_NAME)
    {
      name = SSA_NAME_IDENTIFIER (var);
      if (!name)
	{
	  sprintf (tmp, "_%u", (unsigned) SSA_NAME_VERSION (var));
	  name = get_identifier (tmp);
	}
    }
  else if (TREE_CODE (var) == VAR_DECL)
    {
      name = DECL_NAME (var);
      if (!name)
	{
	  sprintf (tmp, "D_%u", (unsigned) DECL_UID (var));
	  name = get_identifier (tmp);
	}
    }
  else
    gcc_unreachable ();

  gcc_assert (!fields->get (var));

  tree type = TREE_TYPE (var);

  /* Strip "restrict" from pointer types: the qualification need not
     hold for the copy held in the propagation record.  */
  if (POINTER_TYPE_P (type)
      && TYPE_RESTRICT (type))
    type = build_qualified_type (type, TYPE_QUALS (type) & ~TYPE_QUAL_RESTRICT);

  tree field = build_decl (BUILTINS_LOCATION, FIELD_DECL, name, type);

  /* For a VAR_DECL whose type was not changed above, mirror its
     alignment and volatility on the field; otherwise use the type's
     natural alignment.  */
  if (TREE_CODE (var) == VAR_DECL && type == TREE_TYPE (var))
    {
      SET_DECL_ALIGN (field, DECL_ALIGN (var));
      DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
      TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);
    }
  else
    SET_DECL_ALIGN (field, TYPE_ALIGN (type));

  fields->put (var, field);

  insert_field_into_struct (record_type, field);
}
    593 
/* Sets of SSA_NAMEs or VAR_DECLs to propagate.  */
typedef hash_set<tree> propagation_set;
    596 
    597 static void
    598 find_ssa_names_to_propagate (parallel_g *par, unsigned outer_mask,
    599 			     bitmap worker_single, bitmap vector_single,
    600 			     vec<propagation_set *> *prop_set)
    601 {
    602   unsigned mask = outer_mask | par->mask;
    603 
    604   if (par->inner)
    605     find_ssa_names_to_propagate (par->inner, mask, worker_single,
    606 				 vector_single, prop_set);
    607   if (par->next)
    608     find_ssa_names_to_propagate (par->next, outer_mask, worker_single,
    609 				 vector_single, prop_set);
    610 
    611   if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
    612     {
    613       basic_block block;
    614       int ix;
    615 
    616       for (ix = 0; par->blocks.iterate (ix, &block); ix++)
    617 	{
    618 	  for (gphi_iterator psi = gsi_start_phis (block);
    619 	       !gsi_end_p (psi); gsi_next (&psi))
    620 	    {
    621 	      gphi *phi = psi.phi ();
    622 	      use_operand_p use;
    623 	      ssa_op_iter iter;
    624 
    625 	      FOR_EACH_PHI_ARG (use, phi, iter, SSA_OP_USE)
    626 		{
    627 		  tree var = USE_FROM_PTR (use);
    628 
    629 		  if (TREE_CODE (var) != SSA_NAME)
    630 		    continue;
    631 
    632 		  gimple *def_stmt = SSA_NAME_DEF_STMT (var);
    633 
    634 		  if (gimple_nop_p (def_stmt))
    635 		    continue;
    636 
    637 		  basic_block def_bb = gimple_bb (def_stmt);
    638 
    639 		  if (bitmap_bit_p (worker_single, def_bb->index))
    640 		    {
    641 		      if (!(*prop_set)[def_bb->index])
    642 			(*prop_set)[def_bb->index] = new propagation_set;
    643 
    644 		      propagation_set *ws_prop = (*prop_set)[def_bb->index];
    645 
    646 		      ws_prop->add (var);
    647 		    }
    648 		}
    649 	    }
    650 
    651 	  for (gimple_stmt_iterator gsi = gsi_start_bb (block);
    652 	       !gsi_end_p (gsi); gsi_next (&gsi))
    653 	    {
    654 	      use_operand_p use;
    655 	      ssa_op_iter iter;
    656 	      gimple *stmt = gsi_stmt (gsi);
    657 
    658 	      FOR_EACH_SSA_USE_OPERAND (use, stmt, iter, SSA_OP_USE)
    659 		{
    660 		  tree var = USE_FROM_PTR (use);
    661 
    662 		  gimple *def_stmt = SSA_NAME_DEF_STMT (var);
    663 
    664 		  if (gimple_nop_p (def_stmt))
    665 		    continue;
    666 
    667 		  basic_block def_bb = gimple_bb (def_stmt);
    668 
    669 		  if (bitmap_bit_p (worker_single, def_bb->index))
    670 		    {
    671 		      if (!(*prop_set)[def_bb->index])
    672 			(*prop_set)[def_bb->index] = new propagation_set;
    673 
    674 		      propagation_set *ws_prop = (*prop_set)[def_bb->index];
    675 
    676 		      ws_prop->add (var);
    677 		    }
    678 		}
    679 	    }
    680 	}
    681     }
    682 }
    683 
    684 /* Callback for walk_gimple_stmt to find RHS VAR_DECLs (uses) in a
    685    statement.  */
    686 
    687 static tree
    688 find_partitioned_var_uses_1 (tree *node, int *, void *data)
    689 {
    690   walk_stmt_info *wi = (walk_stmt_info *) data;
    691   hash_set<tree> *partitioned_var_uses = (hash_set<tree> *) wi->info;
    692 
    693   if (!wi->is_lhs && VAR_P (*node))
    694     partitioned_var_uses->add (*node);
    695 
    696   return NULL_TREE;
    697 }
    698 
    699 static void
    700 find_partitioned_var_uses (parallel_g *par, unsigned outer_mask,
    701 			   hash_set<tree> *partitioned_var_uses)
    702 {
    703   unsigned mask = outer_mask | par->mask;
    704 
    705   if (par->inner)
    706     find_partitioned_var_uses (par->inner, mask, partitioned_var_uses);
    707   if (par->next)
    708     find_partitioned_var_uses (par->next, outer_mask, partitioned_var_uses);
    709 
    710   if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
    711     {
    712       basic_block block;
    713       int ix;
    714 
    715       for (ix = 0; par->blocks.iterate (ix, &block); ix++)
    716 	for (gimple_stmt_iterator gsi = gsi_start_bb (block);
    717 	     !gsi_end_p (gsi); gsi_next (&gsi))
    718 	  {
    719 	    walk_stmt_info wi;
    720 	    memset (&wi, 0, sizeof (wi));
    721 	    wi.info = (void *) partitioned_var_uses;
    722 	    walk_gimple_stmt (&gsi, NULL, find_partitioned_var_uses_1, &wi);
    723 	  }
    724     }
    725 }
    726 
    727 /* Gang-private variables (typically placed in a GPU's shared memory) do not
    728    need to be processed by the worker-propagation mechanism.  Populate the
    729    GANG_PRIVATE_VARS set with any such variables found in the current
    730    function.  */
    731 
    732 static void
    733 find_gang_private_vars (hash_set<tree> *gang_private_vars)
    734 {
    735   basic_block block;
    736 
    737   FOR_EACH_BB_FN (block, cfun)
    738     {
    739       for (gimple_stmt_iterator gsi = gsi_start_bb (block);
    740 	   !gsi_end_p (gsi);
    741 	   gsi_next (&gsi))
    742 	{
    743 	  gimple *stmt = gsi_stmt (gsi);
    744 
    745 	  if (gimple_call_internal_p (stmt, IFN_UNIQUE))
    746 	    {
    747 	      enum ifn_unique_kind k = ((enum ifn_unique_kind)
    748 		TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
    749 	      if (k == IFN_UNIQUE_OACC_PRIVATE)
    750 		{
    751 		  HOST_WIDE_INT level
    752 		    = TREE_INT_CST_LOW (gimple_call_arg (stmt, 2));
    753 		  if (level != GOMP_DIM_GANG)
    754 		    continue;
    755 		  for (unsigned i = 3; i < gimple_call_num_args (stmt); i++)
    756 		    {
    757 		      tree arg = gimple_call_arg (stmt, i);
    758 		      gcc_assert (TREE_CODE (arg) == ADDR_EXPR);
    759 		      tree decl = TREE_OPERAND (arg, 0);
    760 		      gang_private_vars->add (decl);
    761 		    }
    762 		}
    763 	    }
    764 	}
    765     }
    766 }
    767 
/* Walk the parallel tree rooted at PAR (OUTER_MASK is the mask
   inherited from enclosing parallels) and, for every block *not* in
   worker-partitioned mode, find local VAR_DECLs (from
   PARTITIONED_VAR_USES) that a statement in the block may clobber:
   those must be propagated to the other workers via PROP_SET.  Writes
   to decls in GANG_PRIVATE_VARS are instead noted in
   WRITES_GANG_PRIVATE (they only need a barrier, not a broadcast).  */

static void
find_local_vars_to_propagate (parallel_g *par, unsigned outer_mask,
			      hash_set<tree> *partitioned_var_uses,
			      hash_set<tree> *gang_private_vars,
			      bitmap writes_gang_private,
			      vec<propagation_set *> *prop_set)
{
  unsigned mask = outer_mask | par->mask;

  if (par->inner)
    find_local_vars_to_propagate (par->inner, mask, partitioned_var_uses,
				  gang_private_vars, writes_gang_private,
				  prop_set);
  if (par->next)
    find_local_vars_to_propagate (par->next, outer_mask, partitioned_var_uses,
				  gang_private_vars, writes_gang_private,
				  prop_set);

  if (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
    {
      basic_block block;
      int ix;

      for (ix = 0; par->blocks.iterate (ix, &block); ix++)
	{
	  for (gimple_stmt_iterator gsi = gsi_start_bb (block);
	       !gsi_end_p (gsi); gsi_next (&gsi))
	    {
	      gimple *stmt = gsi_stmt (gsi);
	      tree var;
	      unsigned i;

	      /* Check each candidate local against this statement.  */
	      FOR_EACH_LOCAL_DECL (cfun, i, var)
		{
		  /* Only non-global scalar locals that are actually used
		     in partitioned mode are of interest.  */
		  if (!VAR_P (var)
		      || is_global_var (var)
		      || AGGREGATE_TYPE_P (TREE_TYPE (var))
		      || !partitioned_var_uses->contains (var))
		    continue;

		  if (stmt_may_clobber_ref_p (stmt, var))
		    {
		      if (dump_file)
			{
			  fprintf (dump_file, "bb %u: local variable may be "
				   "clobbered in %s mode: ", block->index,
				   mask_name (mask));
			  print_generic_expr (dump_file, var, TDF_SLIM);
			  fprintf (dump_file, "\n");
			}

		      if (gang_private_vars->contains (var))
			{
			  /* If we write a gang-private variable, we want a
			     barrier at the end of the block.  */
			  bitmap_set_bit (writes_gang_private, block->index);
			  continue;
			}

		      /* Lazily create the block's propagation set and
			 record VAR in it.  */
		      if (!(*prop_set)[block->index])
			(*prop_set)[block->index] = new propagation_set;

		      propagation_set *ws_prop
			= (*prop_set)[block->index];

		      ws_prop->add (var);
		    }
		}
	    }
	}
    }
}
    840 
/* Transform basic blocks FROM, TO (which may be the same block) into:
   if (GOACC_single_start ())
     BLOCK;
   GOACC_barrier ();
			      \  |  /
			      +----+
			      |    |        (new) predicate block
			      +----+--
   \  |  /   \  |  /	        |t    \
   +----+    +----+	      +----+  |
   |	|    |    |	===>  |    |  | f   (old) from block
   +----+    +----+	      +----+  |
     |       t/  \f	        |    /
			      +----+/
  (split  (split before       |    |        skip block
  at end)   condition)	      +----+
			      t/  \f

   DEF_ESCAPES_BLOCK contains the SSA definitions that are live past the
   region; for those, PHI nodes in the skip block merge the real value
   (active-worker path) with a dummy zero (neutered path).  */

static void
worker_single_simple (basic_block from, basic_block to,
		      hash_set<tree> *def_escapes_block)
{
  gimple *call, *cond;
  tree lhs, decl;
  basic_block skip_block;

  /* Split TO at its end (before any final control stmt) so SKIP_BLOCK
     holds the continuation after the region.  */
  gimple_stmt_iterator gsi = gsi_last_bb (to);
  if (EDGE_COUNT (to->succs) > 1)
    {
      gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_COND);
      gsi_prev (&gsi);
    }
  edge e = split_block (to, gsi_stmt (gsi));
  skip_block = e->dest;

  gimple_stmt_iterator start = gsi_after_labels (from);

  /* Emit "lhs = GOACC_single_start ()" at the head of FROM.  */
  decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_START);
  lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl)));
  call = gimple_build_call (decl, 0);
  gimple_call_set_lhs (call, lhs);
  gsi_insert_before (&start, call, GSI_NEW_STMT);
  update_stmt (call);

  /* Branch on the result: only the single active worker executes the
     guarded region.  */
  cond = gimple_build_cond (EQ_EXPR, lhs,
			    fold_convert_loc (UNKNOWN_LOCATION,
					      TREE_TYPE (lhs),
					      boolean_true_node),
			    NULL_TREE, NULL_TREE);
  gsi_insert_after (&start, cond, GSI_NEW_STMT);
  update_stmt (cond);

  edge et = split_block (from, cond);
  et->flags &= ~EDGE_FALLTHRU;
  et->flags |= EDGE_TRUE_VALUE;
  /* Make the active worker the more probable path so we prefer fallthrough
     (letting the idle workers jump around more).  */
  et->probability = profile_probability::likely ();

  edge ef = make_edge (from, skip_block, EDGE_FALSE_VALUE);
  ef->probability = et->probability.invert ();

  /* NEUTERED is an empty block on the false (idle workers) path where
     the dummy definitions below are placed.  */
  basic_block neutered = split_edge (ef);
  gimple_stmt_iterator neut_gsi = gsi_last_bb (neutered);

  /* For every definition in the guarded block that escapes, merge the
     two paths with a PHI in SKIP_BLOCK.  */
  for (gsi = gsi_start_bb (et->dest); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);
      ssa_op_iter iter;
      tree var;

      FOR_EACH_SSA_TREE_OPERAND (var, stmt, iter, SSA_OP_DEF)
	{
	  if (def_escapes_block->contains (var))
	    {
	      gphi *join_phi = create_phi_node (NULL_TREE, skip_block);
	      create_new_def_for (var, join_phi,
				  gimple_phi_result_ptr (join_phi));
	      add_phi_arg (join_phi, var, e, UNKNOWN_LOCATION);

	      tree neutered_def = copy_ssa_name (var, NULL);
	      /* We really want "don't care" or some value representing
		 undefined here, but optimizers will probably get rid of the
		 zero-assignments anyway.  */
	      gassign *zero = gimple_build_assign (neutered_def,
				build_zero_cst (TREE_TYPE (neutered_def)));

	      gsi_insert_after (&neut_gsi, zero, GSI_CONTINUE_LINKING);
	      update_stmt (zero);

	      add_phi_arg (join_phi, neutered_def, single_succ_edge (neutered),
			   UNKNOWN_LOCATION);
	      update_stmt (join_phi);
	    }
	}
    }
}
    939 
    940 static tree
    941 build_receiver_ref (tree var, tree receiver_decl, field_map_t *fields)
    942 {
    943   tree x = build_simple_mem_ref (receiver_decl);
    944   tree field = *fields->get (var);
    945   TREE_THIS_NOTRAP (x) = 1;
    946   x = omp_build_component_ref (x, field);
    947   return x;
    948 }
    949 
    950 static tree
    951 build_sender_ref (tree var, tree sender_decl, field_map_t *fields)
    952 {
    953   if (POINTER_TYPE_P (TREE_TYPE (sender_decl)))
    954     sender_decl = build_simple_mem_ref (sender_decl);
    955   tree field = *fields->get (var);
    956   return omp_build_component_ref (sender_decl, field);
    957 }
    958 
    959 static int
    960 sort_by_ssa_version_or_uid (const void *p1, const void *p2)
    961 {
    962   const tree t1 = *(const tree *)p1;
    963   const tree t2 = *(const tree *)p2;
    964 
    965   if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) == SSA_NAME)
    966     return SSA_NAME_VERSION (t1) - SSA_NAME_VERSION (t2);
    967   else if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) != SSA_NAME)
    968     return -1;
    969   else if (TREE_CODE (t1) != SSA_NAME && TREE_CODE (t2) == SSA_NAME)
    970     return 1;
    971   else
    972     return DECL_UID (t1) - DECL_UID (t2);
    973 }
    974 
    975 static int
    976 sort_by_size_then_ssa_version_or_uid (const void *p1, const void *p2)
    977 {
    978   const tree t1 = *(const tree *)p1;
    979   const tree t2 = *(const tree *)p2;
    980   unsigned HOST_WIDE_INT s1 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t1)));
    981   unsigned HOST_WIDE_INT s2 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t2)));
    982   if (s1 != s2)
    983     return s2 - s1;
    984   else
    985     return sort_by_ssa_version_or_uid (p1, p2);
    986 }
    987 
/* Neuter the worker-single region FROM..TO using the broadcasting scheme
   (callers in this file pass the same block for FROM and TO).  A call to
   GOACC_single_copy_start selects one active worker: the worker that
   receives a NULL pointer executes the original block body, and the others
   branch straight to a barrier.  The active worker writes each definition
   in DEF_ESCAPES_BLOCK that is also in WORKER_PARTITIONED_USES into a
   target-specific shared-memory record (of RECORD_TYPE, located at offset
   PLACEMENT; field layout comes from RECORD_FIELD_MAP), terminated by
   GOACC_single_copy_end; after the barrier the idle workers copy the
   values back out of the record.  If ISOLATE_BROADCASTS, the block's
   shared-memory range overflowed, so extra barriers are emitted before and
   after the broadcast to protect the reused memory.
   HAS_GANG_PRIVATE_WRITE forces the barrier even when there is nothing to
   broadcast (RECORD_TYPE degenerated to char_type_node).  */

static void
worker_single_copy (basic_block from, basic_block to,
		    hash_set<tree> *def_escapes_block,
		    hash_set<tree> *worker_partitioned_uses,
		    tree record_type, record_field_map_t *record_field_map,
		    unsigned HOST_WIDE_INT placement,
		    bool isolate_broadcasts, bool has_gang_private_write)
{
  /* If we only have virtual defs, we'll have no record type, but we still want
     to emit single_copy_start and (particularly) single_copy_end to act as
     a vdef source on the neutered edge representing memory writes on the
     non-neutered edge.  */
  if (!record_type)
    record_type = char_type_node;

  /* Target-specific sender (written by the active worker) and receiver
     (read by the idle workers) views of the shared-memory record.  */
  tree sender_decl
    = targetm.goacc.create_worker_broadcast_record (record_type, true,
						    ".oacc_worker_o",
						    placement);
  tree receiver_decl
    = targetm.goacc.create_worker_broadcast_record (record_type, false,
						    ".oacc_worker_i",
						    placement);

  /* Split after TO's last statement (or before a trailing GIMPLE_COND) so
     the barrier gets its own block.  */
  gimple_stmt_iterator gsi = gsi_last_bb (to);
  if (EDGE_COUNT (to->succs) > 1)
    gsi_prev (&gsi);
  edge e = split_block (to, gsi_stmt (gsi));
  basic_block barrier_block = e->dest;

  gimple_stmt_iterator start = gsi_after_labels (from);

  tree decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_START);

  tree lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl)));

  gimple *call
    = gimple_build_call (decl, 1,
			 POINTER_TYPE_P (TREE_TYPE (sender_decl))
			 ? sender_decl : build_fold_addr_expr (sender_decl));
  gimple_call_set_lhs (call, lhs);
  gsi_insert_before (&start, call, GSI_NEW_STMT);
  update_stmt (call);

  /* The shared-memory range for this block overflowed.  Add a barrier before
     the GOACC_single_copy_start call.  */
  if (isolate_broadcasts)
    {
      decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
      gimple *acc_bar = gimple_build_call (decl, 0);
      gsi_insert_before (&start, acc_bar, GSI_SAME_STMT);
    }

  /* Convert the call's result to the receiver pointer type and store it in
     RECEIVER_DECL, then reload it into an SSA name for the test below.  */
  tree conv_tmp = make_ssa_name (TREE_TYPE (receiver_decl));

  gimple *conv = gimple_build_assign (conv_tmp,
				      fold_convert (TREE_TYPE (receiver_decl),
						    lhs));
  update_stmt (conv);
  gsi_insert_after (&start, conv, GSI_NEW_STMT);
  gimple *asgn = gimple_build_assign (receiver_decl, conv_tmp);
  gsi_insert_after (&start, asgn, GSI_NEW_STMT);
  update_stmt (asgn);

  tree zero_ptr = build_int_cst (TREE_TYPE (receiver_decl), 0);

  tree recv_tmp = make_ssa_name (TREE_TYPE (receiver_decl));
  asgn = gimple_build_assign (recv_tmp, receiver_decl);
  gsi_insert_after (&start, asgn, GSI_NEW_STMT);
  update_stmt (asgn);

  /* The worker for which GOACC_single_copy_start returned NULL takes the
     true edge into the original body; the rest skip to the barrier.  */
  gimple *cond = gimple_build_cond (EQ_EXPR, recv_tmp, zero_ptr, NULL_TREE,
				    NULL_TREE);
  update_stmt (cond);

  gsi_insert_after (&start, cond, GSI_NEW_STMT);

  edge et = split_block (from, cond);
  et->flags &= ~EDGE_FALLTHRU;
  et->flags |= EDGE_TRUE_VALUE;
  /* Make the active worker the more probable path so we prefer fallthrough
     (letting the idle workers jump around more).  */
  et->probability = profile_probability::likely ();

  basic_block body = et->dest;

  edge ef = make_edge (from, barrier_block, EDGE_FALSE_VALUE);
  ef->probability = et->probability.invert ();

  gimple_stmt_iterator bar_gsi = gsi_start_bb (barrier_block);
  cond = gimple_build_cond (NE_EXPR, recv_tmp, zero_ptr, NULL_TREE, NULL_TREE);

  /* Emit the barrier unless there is neither anything to broadcast nor any
     gang-private write to order; after it, workers with a non-NULL
     receiver pointer (the idle ones) branch to the copy-out block.  */
  if (record_type != char_type_node || has_gang_private_write)
    {
      decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
      gimple *acc_bar = gimple_build_call (decl, 0);

      gsi_insert_before (&bar_gsi, acc_bar, GSI_NEW_STMT);
      gsi_insert_after (&bar_gsi, cond, GSI_NEW_STMT);
    }
  else
    gsi_insert_before (&bar_gsi, cond, GSI_NEW_STMT);

  edge et2 = split_block (barrier_block, cond);
  et2->flags &= ~EDGE_FALLTHRU;
  et2->flags |= EDGE_TRUE_VALUE;
  et2->probability = profile_probability::unlikely ();

  basic_block exit_block = et2->dest;

  basic_block copyout_block = split_edge (et2);
  edge ef2 = make_edge (barrier_block, exit_block, EDGE_FALSE_VALUE);
  ef2->probability = et2->probability.invert ();

  gimple_stmt_iterator copyout_gsi = gsi_start_bb (copyout_block);

  edge copyout_to_exit = single_succ_edge (copyout_block);

  /* Statements the active worker uses to populate the record; emitted at
     the end of the body block, see below.  */
  gimple_seq sender_seq = NULL;

  /* Make sure we iterate over definitions in a stable order.  */
  auto_vec<tree> escape_vec (def_escapes_block->elements ());
  for (hash_set<tree>::iterator it = def_escapes_block->begin ();
       it != def_escapes_block->end (); ++it)
    escape_vec.quick_push (*it);
  escape_vec.qsort (sort_by_ssa_version_or_uid);

  for (unsigned i = 0; i < escape_vec.length (); i++)
    {
      tree var = escape_vec[i];

      if (TREE_CODE (var) == SSA_NAME && SSA_NAME_IS_VIRTUAL_OPERAND (var))
	continue;

      tree barrier_def = 0;

      if (TREE_CODE (var) == SSA_NAME)
	{
	  gimple *def_stmt = SSA_NAME_DEF_STMT (var);

	  if (gimple_nop_p (def_stmt))
	    continue;

	  /* The barrier phi takes one result from the actual work of the
	     block we're neutering, and the other result is constant zero of
	     the same type.  */

	  gphi *barrier_phi = create_phi_node (NULL_TREE, barrier_block);
	  barrier_def = create_new_def_for (var, barrier_phi,
			  gimple_phi_result_ptr (barrier_phi));

	  add_phi_arg (barrier_phi, var, e, UNKNOWN_LOCATION);
	  add_phi_arg (barrier_phi, build_zero_cst (TREE_TYPE (var)), ef,
		       UNKNOWN_LOCATION);

	  update_stmt (barrier_phi);
	}
      else
	gcc_assert (TREE_CODE (var) == VAR_DECL);

      /* If we had no record type, we will have no fields map.  */
      field_map_t *fields = record_field_map->get (record_type);

      if (worker_partitioned_uses->contains (var)
	  && fields
	  && fields->get (var))
	{
	  tree neutered_def = make_ssa_name (TREE_TYPE (var));

	  /* Receive definition from shared memory block.  */

	  tree receiver_ref = build_receiver_ref (var, receiver_decl, fields);
	  gassign *recv = gimple_build_assign (neutered_def,
					       receiver_ref);
	  gsi_insert_after (&copyout_gsi, recv, GSI_CONTINUE_LINKING);
	  update_stmt (recv);

	  if (TREE_CODE (var) == VAR_DECL)
	    {
	      /* If it's a VAR_DECL, we only copied to an SSA temporary.  Copy
		 to the final location now.  */
	      gassign *asgn = gimple_build_assign (var, neutered_def);
	      gsi_insert_after (&copyout_gsi, asgn, GSI_CONTINUE_LINKING);
	      update_stmt (asgn);
	    }
	  else
	    {
	      /* If it's an SSA name, create a new phi at the join node to
		 represent either the output from the active worker (the
		 barrier) or the inactive workers (the copyout block).  */
	      gphi *join_phi = create_phi_node (NULL_TREE, exit_block);
	      create_new_def_for (barrier_def, join_phi,
				  gimple_phi_result_ptr (join_phi));
	      add_phi_arg (join_phi, barrier_def, ef2, UNKNOWN_LOCATION);
	      add_phi_arg (join_phi, neutered_def, copyout_to_exit,
			   UNKNOWN_LOCATION);
	      update_stmt (join_phi);
	    }

	  /* Send definition to shared memory block.  */

	  tree sender_ref = build_sender_ref (var, sender_decl, fields);

	  if (TREE_CODE (var) == SSA_NAME)
	    {
	      gassign *send = gimple_build_assign (sender_ref, var);
	      gimple_seq_add_stmt (&sender_seq, send);
	      update_stmt (send);
	    }
	  else if (TREE_CODE (var) == VAR_DECL)
	    {
	      /* VAR_DECLs are stored via an SSA temporary.  */
	      tree tmp = make_ssa_name (TREE_TYPE (var));
	      gassign *send = gimple_build_assign (tmp, var);
	      gimple_seq_add_stmt (&sender_seq, send);
	      update_stmt (send);
	      send = gimple_build_assign (sender_ref, tmp);
	      gimple_seq_add_stmt (&sender_seq, send);
	      update_stmt (send);
	    }
	  else
	    gcc_unreachable ();
	}
    }

  /* The shared-memory range for this block overflowed.  Add a barrier at the
     end.  */
  if (isolate_broadcasts)
    {
      gsi = gsi_start_bb (exit_block);
      decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
      gimple *acc_bar = gimple_build_call (decl, 0);
      gsi_insert_before (&gsi, acc_bar, GSI_SAME_STMT);
    }

  /* It's possible for the ET->DEST block (the work done by the active thread)
     to finish with a control-flow insn, e.g. a UNIQUE function call.  Split
     the block and add SENDER_SEQ in the latter part to avoid having control
     flow in the middle of a BB.  */

  decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_END);
  call = gimple_build_call (decl, 1,
			    POINTER_TYPE_P (TREE_TYPE (sender_decl))
			    ? sender_decl
			    : build_fold_addr_expr (sender_decl));
  gimple_seq_add_stmt (&sender_seq, call);

  gsi = gsi_last_bb (body);
  gimple *last = gsi_stmt (gsi);
  basic_block sender_block = split_block (body, last)->dest;
  gsi = gsi_last_bb (sender_block);
  gsi_insert_seq_after (&gsi, sender_seq, GSI_CONTINUE_LINKING);
}
   1240 
/* Map from basic block to its assigned offset within the shared-memory
   broadcast buffer, plus a flag saying whether a dedicated range was
   successfully allocated (FALSE means a fallback offset is used and the
   broadcast must be isolated with barriers).  */
typedef hash_map<basic_block, std::pair<unsigned HOST_WIDE_INT, bool> >
  blk_offset_map_t;
   1243 
/* Neuter the blocks of parallel region PAR that run in worker-single mode
   (PAR's mask, combined with OUTER_MASK, lacks the worker dimension), then
   recurse into PAR's inner and sibling regions.  For each block, collect
   the definitions that escape the block and those used in
   worker-partitioned context, then apply either the broadcasting scheme
   (worker_single_copy, when anything must be propagated) or simple
   predication (worker_single_simple).  WORKER_SINGLE marks blocks running
   in worker-single mode; VECTOR_SINGLE is only passed through to the
   recursive calls here.  PROP_SET holds per-block sets of local variables
   to propagate (consumed and freed as blocks are processed);
   RECORD_FIELD_MAP and BLK_OFFSET_MAP describe the broadcast records and
   their shared-memory placement; WRITES_GANG_PRIVATE marks blocks that
   write gang-private variables.  */

static void
neuter_worker_single (parallel_g *par, unsigned outer_mask,
		      bitmap worker_single, bitmap vector_single,
		      vec<propagation_set *> *prop_set,
		      hash_set<tree> *partitioned_var_uses,
		      record_field_map_t *record_field_map,
		      blk_offset_map_t *blk_offset_map,
		      bitmap writes_gang_private)
{
  unsigned mask = outer_mask | par->mask;

  if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
    {
      basic_block block;

      for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
	{
	  bool has_defs = false;
	  hash_set<tree> def_escapes_block;
	  hash_set<tree> worker_partitioned_uses;
	  unsigned j;
	  tree var;

	  /* Find SSA names defined in BLOCK whose value is needed outside
	     it, or used where execution is not worker-single.  */
	  FOR_EACH_SSA_NAME (j, var, cfun)
	    {
	      if (SSA_NAME_IS_VIRTUAL_OPERAND (var))
		{
		  /* Virtual defs still require the copy-start/end pair to
		     represent memory effects.  */
		  has_defs = true;
		  continue;
		}

	      gimple *def_stmt = SSA_NAME_DEF_STMT (var);

	      if (gimple_nop_p (def_stmt))
		continue;

	      if (gimple_bb (def_stmt)->index != block->index)
		continue;

	      gimple *use_stmt;
	      imm_use_iterator use_iter;
	      bool uses_outside_block = false;
	      bool worker_partitioned_use = false;

	      FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, var)
		{
		  int blocknum = gimple_bb (use_stmt)->index;

		  /* Don't propagate SSA names that are only used in the
		     current block, unless the usage is in a phi node: that
		     means the name left the block, then came back in at the
		     top.  */
		  if (blocknum != block->index
		      || gimple_code (use_stmt) == GIMPLE_PHI)
		    uses_outside_block = true;
		  if (!bitmap_bit_p (worker_single, blocknum))
		    worker_partitioned_use = true;
		}

	      if (uses_outside_block)
		def_escapes_block.add (var);

	      if (worker_partitioned_use)
		{
		  worker_partitioned_uses.add (var);
		  has_defs = true;
		}
	    }

	  /* Add the pre-computed set of local VAR_DECLs to propagate from
	     this block, then release it.  */
	  propagation_set *ws_prop = (*prop_set)[block->index];

	  if (ws_prop)
	    {
	      for (propagation_set::iterator it = ws_prop->begin ();
		   it != ws_prop->end ();
		   ++it)
		{
		  tree var = *it;
		  if (TREE_CODE (var) == VAR_DECL)
		    {
		      def_escapes_block.add (var);
		      if (partitioned_var_uses->contains (var))
			{
			  worker_partitioned_uses.add (var);
			  has_defs = true;
			}
		    }
		}

	      delete ws_prop;
	      (*prop_set)[block->index] = 0;
	    }

	  bool only_marker_fns = true;
	  bool join_block = false;

	  /* Classify the block's statements: marker-only blocks need no
	     predication at all.  */
	  for (gimple_stmt_iterator gsi = gsi_start_bb (block);
	       !gsi_end_p (gsi);
	       gsi_next (&gsi))
	    {
	      gimple *stmt = gsi_stmt (gsi);
	      if (gimple_code (stmt) == GIMPLE_CALL
		  && gimple_call_internal_p (stmt, IFN_UNIQUE))
		{
		  enum ifn_unique_kind k = ((enum ifn_unique_kind)
		    TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
		  if (k != IFN_UNIQUE_OACC_PRIVATE
		      && k != IFN_UNIQUE_OACC_JOIN
		      && k != IFN_UNIQUE_OACC_FORK
		      && k != IFN_UNIQUE_OACC_HEAD_MARK
		      && k != IFN_UNIQUE_OACC_TAIL_MARK)
		    only_marker_fns = false;
		  else if (k == IFN_UNIQUE_OACC_JOIN)
		    /* The JOIN marker is special in that it *cannot* be
		       predicated for worker zero, because it may be lowered
		       to a barrier instruction and all workers must typically
		       execute that barrier.  We shouldn't be doing any
		       broadcasts from the join block anyway.  */
		    join_block = true;
		}
	      else if (gimple_code (stmt) == GIMPLE_CALL
		       && gimple_call_internal_p (stmt, IFN_GOACC_LOOP))
		/* Empty.  */;
	      else if (gimple_nop_p (stmt))
		/* Empty.  */;
	      else
		only_marker_fns = false;
	    }

	  /* We can skip predicating this block for worker zero if the only
	     thing it contains is marker functions that will be removed in the
	     oaccdevlow pass anyway.
	     Don't do this if the block has (any) phi nodes, because those
	     might define SSA names that need broadcasting.
	     TODO: We might be able to skip transforming blocks that only
	     contain some other trivial statements too.  */
	  if (only_marker_fns && !phi_nodes (block))
	    continue;

	  gcc_assert (!join_block);

	  if (has_defs)
	    {
	      /* Something must be propagated: use the broadcasting scheme,
		 looking up the block's shared-memory placement.  */
	      tree record_type = (tree) block->aux;
	      std::pair<unsigned HOST_WIDE_INT, bool> *off_rngalloc
		= blk_offset_map->get (block);
	      gcc_assert (!record_type || off_rngalloc);
	      unsigned HOST_WIDE_INT offset
		= off_rngalloc ? off_rngalloc->first : 0;
	      bool range_allocated
		= off_rngalloc ? off_rngalloc->second : true;
	      bool has_gang_private_write
		= bitmap_bit_p (writes_gang_private, block->index);
	      worker_single_copy (block, block, &def_escapes_block,
				  &worker_partitioned_uses, record_type,
				  record_field_map,
				  offset, !range_allocated,
				  has_gang_private_write);
	    }
	  else
	    worker_single_simple (block, block, &def_escapes_block);
	}
    }

  if ((outer_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
    {
      basic_block block;

      for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
	for (gimple_stmt_iterator gsi = gsi_start_bb (block);
	     !gsi_end_p (gsi);
	     gsi_next (&gsi))
	  {
	    gimple *stmt = gsi_stmt (gsi);

	    if (gimple_code (stmt) == GIMPLE_CALL
		&& !gimple_call_internal_p (stmt)
		&& !omp_sese_active_worker_call (as_a <gcall *> (stmt)))
	      {
		/* If we have an OpenACC routine call in worker-single mode,
		   place barriers before and afterwards to prevent
		   clobbering re-used shared memory regions (as are used
		   for AMDGCN at present, for example).  */
		tree decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
		gsi_insert_before (&gsi, gimple_build_call (decl, 0),
				   GSI_SAME_STMT);
		gsi_insert_after (&gsi, gimple_build_call (decl, 0),
				  GSI_NEW_STMT);
	      }
	  }
    }

  /* Recurse into nested regions (with the combined mask) and on to the
     next region at this level (with the unchanged outer mask).  */
  if (par->inner)
    neuter_worker_single (par->inner, mask, worker_single, vector_single,
			  prop_set, partitioned_var_uses, record_field_map,
			  blk_offset_map, writes_gang_private);
  if (par->next)
    neuter_worker_single (par->next, outer_mask, worker_single, vector_single,
			  prop_set, partitioned_var_uses, record_field_map,
			  blk_offset_map, writes_gang_private);
}
   1445 
   1446 static void
   1447 dfs_broadcast_reachable_1 (basic_block bb, sbitmap reachable)
   1448 {
   1449   if (bb->flags & BB_VISITED)
   1450     return;
   1451 
   1452   bb->flags |= BB_VISITED;
   1453 
   1454   if (bb->succs)
   1455     {
   1456       edge e;
   1457       edge_iterator ei;
   1458       FOR_EACH_EDGE (e, ei, bb->succs)
   1459 	{
   1460 	  basic_block dest = e->dest;
   1461 	  if (dest->aux)
   1462 	    bitmap_set_bit (reachable, dest->index);
   1463 	  else
   1464 	    dfs_broadcast_reachable_1 (dest, reachable);
   1465 	}
   1466     }
   1467 }
   1468 
/* Pair of (basic block index, broadcast record type).  */
typedef std::pair<int, tree> idx_decl_pair_t;

/* Per-block-index splay trees of shared-memory address ranges in use.  */
typedef auto_vec<splay_tree> used_range_vec_t;
   1472 
   1473 static int
   1474 sort_size_descending (const void *a, const void *b)
   1475 {
   1476   const idx_decl_pair_t *pa = (const idx_decl_pair_t *) a;
   1477   const idx_decl_pair_t *pb = (const idx_decl_pair_t *) b;
   1478   unsigned HOST_WIDE_INT asize = tree_to_uhwi (TYPE_SIZE_UNIT (pa->second));
   1479   unsigned HOST_WIDE_INT bsize = tree_to_uhwi (TYPE_SIZE_UNIT (pb->second));
   1480   return bsize - asize;
   1481 }
   1482 
   1483 class addr_range
   1484 {
   1485 public:
   1486   addr_range (unsigned HOST_WIDE_INT addr_lo, unsigned HOST_WIDE_INT addr_hi)
   1487     : lo (addr_lo), hi (addr_hi)
   1488     { }
   1489   addr_range (const addr_range &ar) : lo (ar.lo), hi (ar.hi)
   1490     { }
   1491   addr_range () : lo (0), hi (0)
   1492     { }
   1493 
   1494   bool invalid () { return lo == 0 && hi == 0; }
   1495 
   1496   unsigned HOST_WIDE_INT lo;
   1497   unsigned HOST_WIDE_INT hi;
   1498 };
   1499 
   1500 static int
   1501 splay_tree_compare_addr_range (splay_tree_key a, splay_tree_key b)
   1502 {
   1503   addr_range *ar = (addr_range *) a;
   1504   addr_range *br = (addr_range *) b;
   1505   if (ar->lo == br->lo && ar->hi == br->hi)
   1506     return 0;
   1507   if (ar->hi <= br->lo)
   1508     return -1;
   1509   else if (ar->lo >= br->hi)
   1510     return 1;
   1511   return 0;
   1512 }
   1513 
   1514 static void
   1515 splay_tree_free_key (splay_tree_key k)
   1516 {
   1517   addr_range *ar = (addr_range *) k;
   1518   delete ar;
   1519 }
   1520 
   1521 static addr_range
   1522 first_fit_range (splay_tree s, unsigned HOST_WIDE_INT size,
   1523 		 unsigned HOST_WIDE_INT align, addr_range *bounds)
   1524 {
   1525   splay_tree_node min = splay_tree_min (s);
   1526   if (min)
   1527     {
   1528       splay_tree_node next;
   1529       while ((next = splay_tree_successor (s, min->key)))
   1530 	{
   1531 	  unsigned HOST_WIDE_INT lo = ((addr_range *) min->key)->hi;
   1532 	  unsigned HOST_WIDE_INT hi = ((addr_range *) next->key)->lo;
   1533 	  unsigned HOST_WIDE_INT base = (lo + align - 1) & ~(align - 1);
   1534 	  if (base + size <= hi)
   1535 	    return addr_range (base, base + size);
   1536 	  min = next;
   1537 	}
   1538 
   1539       unsigned HOST_WIDE_INT base = ((addr_range *)min->key)->hi;
   1540       base = (base + align - 1) & ~(align - 1);
   1541       if (base + size <= bounds->hi)
   1542 	return addr_range (base, base + size);
   1543       else
   1544 	return addr_range ();
   1545     }
   1546   else
   1547     {
   1548       unsigned HOST_WIDE_INT lo = bounds->lo;
   1549       lo = (lo + align - 1) & ~(align - 1);
   1550       if (lo + size <= bounds->hi)
   1551 	return addr_range (lo, lo + size);
   1552       else
   1553 	return addr_range ();
   1554     }
   1555 }
   1556 
   1557 static int
   1558 merge_ranges_1 (splay_tree_node n, void *ptr)
   1559 {
   1560   splay_tree accum = (splay_tree) ptr;
   1561   addr_range ar = *(addr_range *) n->key;
   1562 
   1563   splay_tree_node old = splay_tree_lookup (accum, n->key);
   1564 
   1565   /* We might have an overlap.  Create a new range covering the
   1566      overlapping parts.  */
   1567   if (old)
   1568     {
   1569       addr_range *old_ar = (addr_range *) old->key;
   1570       ar.lo = MIN (old_ar->lo, ar.lo);
   1571       ar.hi = MAX (old_ar->hi, ar.hi);
   1572       splay_tree_remove (accum, old->key);
   1573     }
   1574 
   1575   addr_range *new_ar = new addr_range (ar);
   1576 
   1577   splay_tree_insert (accum, (splay_tree_key) new_ar, n->value);
   1578 
   1579   return 0;
   1580 }
   1581 
/* Merge every address range in SP into ACCUM, coalescing overlaps (see
   merge_ranges_1).  */

static void
merge_ranges (splay_tree accum, splay_tree sp)
{
  splay_tree_foreach (sp, merge_ranges_1, (void *) accum);
}
   1587 
   1588 static void
   1589 oacc_do_neutering (unsigned HOST_WIDE_INT bounds_lo,
   1590 		   unsigned HOST_WIDE_INT bounds_hi)
   1591 {
   1592   bb_stmt_map_t bb_stmt_map;
   1593   auto_bitmap worker_single, vector_single;
   1594 
   1595   omp_sese_split_blocks (&bb_stmt_map);
   1596 
   1597   if (dump_file)
   1598     {
   1599       fprintf (dump_file, "\n\nAfter splitting:\n\n");
   1600       dump_function_to_file (current_function_decl, dump_file, dump_flags);
   1601     }
   1602 
   1603   unsigned mask = 0;
   1604 
   1605   /* If this is a routine, calculate MASK as if the outer levels are already
   1606      partitioned.  */
   1607   {
   1608     tree attr = oacc_get_fn_attrib (current_function_decl);
   1609     tree dims = TREE_VALUE (attr);
   1610     unsigned ix;
   1611     for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
   1612       {
   1613 	tree allowed = TREE_PURPOSE (dims);
   1614 	if (allowed && integer_zerop (allowed))
   1615 	  mask |= GOMP_DIM_MASK (ix);
   1616       }
   1617   }
   1618 
   1619   parallel_g *par = omp_sese_discover_pars (&bb_stmt_map);
   1620   populate_single_mode_bitmaps (par, worker_single, vector_single, mask, 0);
   1621 
   1622   basic_block bb;
   1623   FOR_ALL_BB_FN (bb, cfun)
   1624     bb->aux = NULL;
   1625 
   1626   vec<propagation_set *> prop_set (vNULL);
   1627   prop_set.safe_grow_cleared (last_basic_block_for_fn (cfun), true);
   1628 
   1629   find_ssa_names_to_propagate (par, mask, worker_single, vector_single,
   1630 			       &prop_set);
   1631 
   1632   hash_set<tree> partitioned_var_uses;
   1633   hash_set<tree> gang_private_vars;
   1634   auto_bitmap writes_gang_private;
   1635 
   1636   find_gang_private_vars (&gang_private_vars);
   1637   find_partitioned_var_uses (par, mask, &partitioned_var_uses);
   1638   find_local_vars_to_propagate (par, mask, &partitioned_var_uses,
   1639 				&gang_private_vars, writes_gang_private,
   1640 				&prop_set);
   1641 
   1642   record_field_map_t record_field_map;
   1643 
   1644   FOR_ALL_BB_FN (bb, cfun)
   1645     {
   1646       propagation_set *ws_prop = prop_set[bb->index];
   1647       if (ws_prop)
   1648 	{
   1649 	  tree record_type = lang_hooks.types.make_type (RECORD_TYPE);
   1650 	  tree name = create_tmp_var_name (".oacc_ws_data_s");
   1651 	  name = build_decl (UNKNOWN_LOCATION, TYPE_DECL, name, record_type);
   1652 	  DECL_ARTIFICIAL (name) = 1;
   1653 	  DECL_NAMELESS (name) = 1;
   1654 	  TYPE_NAME (record_type) = name;
   1655 	  TYPE_ARTIFICIAL (record_type) = 1;
   1656 
   1657 	  auto_vec<tree> field_vec (ws_prop->elements ());
   1658 	  for (hash_set<tree>::iterator it = ws_prop->begin ();
   1659 	       it != ws_prop->end (); ++it)
   1660 	    field_vec.quick_push (*it);
   1661 
   1662 	  field_vec.qsort (sort_by_size_then_ssa_version_or_uid);
   1663 
   1664 	  bool existed;
   1665 	  field_map_t *fields
   1666 	    = &record_field_map.get_or_insert (record_type, &existed);
   1667 	  gcc_checking_assert (!existed);
   1668 
   1669 	  /* Insert var fields in reverse order, so the last inserted element
   1670 	     is the first in the structure.  */
   1671 	  for (int i = field_vec.length () - 1; i >= 0; i--)
   1672 	    install_var_field (field_vec[i], record_type, fields);
   1673 
   1674 	  layout_type (record_type);
   1675 
   1676 	  bb->aux = (tree) record_type;
   1677 	}
   1678     }
   1679 
   1680   sbitmap *reachable
   1681     = sbitmap_vector_alloc (last_basic_block_for_fn (cfun),
   1682 			    last_basic_block_for_fn (cfun));
   1683 
   1684   bitmap_vector_clear (reachable, last_basic_block_for_fn (cfun));
   1685 
   1686   auto_vec<std::pair<int, tree> > priority;
   1687 
   1688   FOR_ALL_BB_FN (bb, cfun)
   1689     {
   1690       if (bb->aux)
   1691 	{
   1692 	  tree record_type = (tree) bb->aux;
   1693 
   1694 	  basic_block bb2;
   1695 	  FOR_ALL_BB_FN (bb2, cfun)
   1696 	    bb2->flags &= ~BB_VISITED;
   1697 
   1698 	  priority.safe_push (std::make_pair (bb->index, record_type));
   1699 	  dfs_broadcast_reachable_1 (bb, reachable[bb->index]);
   1700 	}
   1701     }
   1702 
   1703   sbitmap *inverted
   1704     = sbitmap_vector_alloc (last_basic_block_for_fn (cfun),
   1705 			    last_basic_block_for_fn (cfun));
   1706 
   1707   bitmap_vector_clear (inverted, last_basic_block_for_fn (cfun));
   1708 
   1709   for (int i = 0; i < last_basic_block_for_fn (cfun); i++)
   1710     {
   1711       sbitmap_iterator bi;
   1712       unsigned int j;
   1713       EXECUTE_IF_SET_IN_BITMAP (reachable[i], 0, j, bi)
   1714 	bitmap_set_bit (inverted[j], i);
   1715     }
   1716 
   1717   for (int i = 0; i < last_basic_block_for_fn (cfun); i++)
   1718     bitmap_ior (reachable[i], reachable[i], inverted[i]);
   1719 
   1720   sbitmap_vector_free (inverted);
   1721 
   1722   used_range_vec_t used_ranges;
   1723 
   1724   used_ranges.safe_grow_cleared (last_basic_block_for_fn (cfun));
   1725 
   1726   blk_offset_map_t blk_offset_map;
   1727 
   1728   addr_range worker_shm_bounds (bounds_lo, bounds_hi);
   1729 
   1730   priority.qsort (sort_size_descending);
   1731   for (unsigned int i = 0; i < priority.length (); i++)
   1732     {
   1733       idx_decl_pair_t p = priority[i];
   1734       int blkno = p.first;
   1735       tree record_type = p.second;
   1736       HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (record_type));
   1737       HOST_WIDE_INT align = TYPE_ALIGN_UNIT (record_type);
   1738 
   1739       splay_tree conflicts = splay_tree_new (splay_tree_compare_addr_range,
   1740 					     splay_tree_free_key, NULL);
   1741 
   1742       if (!used_ranges[blkno])
   1743 	used_ranges[blkno] = splay_tree_new (splay_tree_compare_addr_range,
   1744 					     splay_tree_free_key, NULL);
   1745       else
   1746 	merge_ranges (conflicts, used_ranges[blkno]);
   1747 
   1748       sbitmap_iterator bi;
   1749       unsigned int j;
   1750       EXECUTE_IF_SET_IN_BITMAP (reachable[blkno], 0, j, bi)
   1751 	if (used_ranges[j])
   1752 	  merge_ranges (conflicts, used_ranges[j]);
   1753 
   1754       addr_range ar
   1755 	= first_fit_range (conflicts, size, align, &worker_shm_bounds);
   1756 
   1757       splay_tree_delete (conflicts);
   1758 
   1759       if (ar.invalid ())
   1760 	{
   1761 	  unsigned HOST_WIDE_INT base
   1762 	    = (bounds_lo + align - 1) & ~(align - 1);
   1763 	  if (base + size > bounds_hi)
   1764 	    error_at (UNKNOWN_LOCATION, "shared-memory region overflow");
   1765 	  std::pair<unsigned HOST_WIDE_INT, bool> base_inrng
   1766 	    = std::make_pair (base, false);
   1767 	  blk_offset_map.put (BASIC_BLOCK_FOR_FN (cfun, blkno), base_inrng);
   1768 	}
   1769       else
   1770 	{
   1771 	  splay_tree_node old = splay_tree_lookup (used_ranges[blkno],
   1772 						   (splay_tree_key) &ar);
   1773 	  if (old)
   1774 	    {
   1775 	      fprintf (stderr, "trying to map [%d..%d] but [%d..%d] is "
   1776 		       "already mapped in block %d\n", (int) ar.lo,
   1777 		       (int) ar.hi, (int) ((addr_range *) old->key)->lo,
   1778 		       (int) ((addr_range *) old->key)->hi, blkno);
   1779 	      abort ();
   1780 	    }
   1781 
   1782 	  addr_range *arp = new addr_range (ar);
   1783 	  splay_tree_insert (used_ranges[blkno], (splay_tree_key) arp,
   1784 			     (splay_tree_value) blkno);
   1785 	  std::pair<unsigned HOST_WIDE_INT, bool> base_inrng
   1786 	    = std::make_pair (ar.lo, true);
   1787 	  blk_offset_map.put (BASIC_BLOCK_FOR_FN (cfun, blkno), base_inrng);
   1788 	}
   1789     }
   1790 
   1791   sbitmap_vector_free (reachable);
   1792 
   1793   neuter_worker_single (par, mask, worker_single, vector_single, &prop_set,
   1794 			&partitioned_var_uses, &record_field_map,
   1795 			&blk_offset_map, writes_gang_private);
   1796 
   1797   record_field_map.empty ();
   1798 
   1799   /* These are supposed to have been 'delete'd by 'neuter_worker_single'.  */
   1800   for (auto it : prop_set)
   1801     gcc_checking_assert (!it);
   1802   prop_set.release ();
   1803 
   1804   delete par;
   1805 
   1806   /* This doesn't seem to make a difference.  */
   1807   loops_state_clear (LOOP_CLOSED_SSA);
   1808 
   1809   /* Neutering worker-single neutered blocks will invalidate dominance info.
   1810      It may be possible to incrementally update just the affected blocks, but
   1811      obliterate everything for now.  */
   1812   free_dominance_info (CDI_DOMINATORS);
   1813   free_dominance_info (CDI_POST_DOMINATORS);
   1814 
   1815   if (dump_file)
   1816     {
   1817       fprintf (dump_file, "\n\nAfter neutering:\n\n");
   1818       dump_function_to_file (current_function_decl, dump_file, dump_flags);
   1819     }
   1820 }
   1821 
   1822 static int
   1823 execute_omp_oacc_neuter_broadcast ()
   1824 {
   1825   unsigned HOST_WIDE_INT reduction_size[GOMP_DIM_MAX];
   1826   unsigned HOST_WIDE_INT private_size[GOMP_DIM_MAX];
   1827 
   1828   for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
   1829     {
   1830       reduction_size[i] = 0;
   1831       private_size[i] = 0;
   1832     }
   1833 
   1834   /* Calculate shared memory size required for reduction variables and
   1835      gang-private memory for this offloaded function.  */
   1836   basic_block bb;
   1837   FOR_ALL_BB_FN (bb, cfun)
   1838     {
   1839       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
   1840 	   !gsi_end_p (gsi);
   1841 	   gsi_next (&gsi))
   1842 	{
   1843 	  gimple *stmt = gsi_stmt (gsi);
   1844 	  if (!is_gimple_call (stmt))
   1845 	    continue;
   1846 	  gcall *call = as_a <gcall *> (stmt);
   1847 	  if (!gimple_call_internal_p (call))
   1848 	    continue;
   1849 	  enum internal_fn ifn_code = gimple_call_internal_fn (call);
   1850 	  switch (ifn_code)
   1851 	    {
   1852 	    default: break;
   1853 	    case IFN_GOACC_REDUCTION:
   1854 	      if (integer_minus_onep (gimple_call_arg (call, 3)))
   1855 		continue;
   1856 	      else
   1857 		{
   1858 		  unsigned code = TREE_INT_CST_LOW (gimple_call_arg (call, 0));
   1859 		  /* Only count reduction variables once: the choice to pick
   1860 		     the setup call is fairly arbitrary.  */
   1861 		  if (code == IFN_GOACC_REDUCTION_SETUP)
   1862 		    {
   1863 		      int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
   1864 		      tree var = gimple_call_arg (call, 2);
   1865 		      tree offset = gimple_call_arg (call, 5);
   1866 		      tree var_type = TREE_TYPE (var);
   1867 		      unsigned HOST_WIDE_INT limit
   1868 			= (tree_to_uhwi (offset)
   1869 			   + tree_to_uhwi (TYPE_SIZE_UNIT (var_type)));
   1870 		      reduction_size[level]
   1871 			= MAX (reduction_size[level], limit);
   1872 		    }
   1873 		}
   1874 	      break;
   1875 	    case IFN_UNIQUE:
   1876 	      {
   1877 		enum ifn_unique_kind kind
   1878 		  = ((enum ifn_unique_kind)
   1879 		     TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
   1880 
   1881 		if (kind == IFN_UNIQUE_OACC_PRIVATE)
   1882 		  {
   1883 		    HOST_WIDE_INT level
   1884 		      = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
   1885 		    if (level == -1)
   1886 		      break;
   1887 		    for (unsigned i = 3;
   1888 			 i < gimple_call_num_args (call);
   1889 			 i++)
   1890 		      {
   1891 			tree arg = gimple_call_arg (call, i);
   1892 			gcc_assert (TREE_CODE (arg) == ADDR_EXPR);
   1893 			tree decl = TREE_OPERAND (arg, 0);
   1894 			unsigned HOST_WIDE_INT align = DECL_ALIGN_UNIT (decl);
   1895 			private_size[level] = ((private_size[level] + align - 1)
   1896 					       & ~(align - 1));
   1897 			unsigned HOST_WIDE_INT decl_size
   1898 			  = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (decl)));
   1899 			private_size[level] += decl_size;
   1900 		      }
   1901 		  }
   1902 	      }
   1903 	      break;
   1904 	    }
   1905 	}
   1906     }
   1907 
   1908   int dims[GOMP_DIM_MAX];
   1909   for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
   1910     dims[i] = oacc_get_fn_dim_size (current_function_decl, i);
   1911 
   1912   /* Find bounds of shared-memory buffer space we can use.  */
   1913   unsigned HOST_WIDE_INT bounds_lo = 0, bounds_hi = 0;
   1914   if (targetm.goacc.shared_mem_layout)
   1915     targetm.goacc.shared_mem_layout (&bounds_lo, &bounds_hi, dims,
   1916 				     private_size, reduction_size);
   1917 
   1918   /* Perform worker partitioning unless we know 'num_workers(1)'.  */
   1919   if (dims[GOMP_DIM_WORKER] != 1)
   1920     oacc_do_neutering (bounds_lo, bounds_hi);
   1921 
   1922   return 0;
   1923 }
   1924 
   1925 namespace {
   1926 
   1927 const pass_data pass_data_omp_oacc_neuter_broadcast =
   1928 {
   1929   GIMPLE_PASS, /* type */
   1930   "omp_oacc_neuter_broadcast", /* name */
   1931   OPTGROUP_OMP, /* optinfo_flags */
   1932   TV_NONE, /* tv_id */
   1933   PROP_cfg, /* properties_required */
   1934   0, /* properties_provided */
   1935   0, /* properties_destroyed */
   1936   0, /* todo_flags_start */
   1937   TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
   1938 };
   1939 
   1940 class pass_omp_oacc_neuter_broadcast : public gimple_opt_pass
   1941 {
   1942 public:
   1943   pass_omp_oacc_neuter_broadcast (gcc::context *ctxt)
   1944     : gimple_opt_pass (pass_data_omp_oacc_neuter_broadcast, ctxt)
   1945   {}
   1946 
   1947   /* opt_pass methods: */
   1948   virtual bool gate (function *fun)
   1949   {
   1950     if (!flag_openacc)
   1951       return false;
   1952 
   1953     if (!targetm.goacc.create_worker_broadcast_record)
   1954       return false;
   1955 
   1956     /* Only relevant for OpenACC offloaded functions.  */
   1957     tree attr = oacc_get_fn_attrib (fun->decl);
   1958     if (!attr)
   1959       return false;
   1960 
   1961     return true;
   1962   }
   1963 
   1964   virtual unsigned int execute (function *)
   1965     {
   1966       return execute_omp_oacc_neuter_broadcast ();
   1967     }
   1968 
   1969 }; // class pass_omp_oacc_neuter_broadcast
   1970 
   1971 } // anon namespace
   1972 
/* Factory used by the pass manager (see passes.def) to instantiate the
   OpenACC worker neutering/broadcasting pass.  */

gimple_opt_pass *
make_pass_omp_oacc_neuter_broadcast (gcc::context *ctxt)
{
  return new pass_omp_oacc_neuter_broadcast (ctxt);
}
   1978