/* (Code-browser navigation header removed; this appears to be gcc/omp-offload.cc.)  */
      1 /* Bits of OpenMP and OpenACC handling that is specific to device offloading
      2    and a lowering pass for OpenACC device directives.
      3 
      4    Copyright (C) 2005-2022 Free Software Foundation, Inc.
      5 
      6 This file is part of GCC.
      7 
      8 GCC is free software; you can redistribute it and/or modify it under
      9 the terms of the GNU General Public License as published by the Free
     10 Software Foundation; either version 3, or (at your option) any later
     11 version.
     12 
     13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
     14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
     15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     16 for more details.
     17 
     18 You should have received a copy of the GNU General Public License
     19 along with GCC; see the file COPYING3.  If not see
     20 <http://www.gnu.org/licenses/>.  */
     21 
     22 #include "config.h"
     23 #include "system.h"
     24 #include "coretypes.h"
     25 #include "backend.h"
     26 #include "target.h"
     27 #include "tree.h"
     28 #include "gimple.h"
     29 #include "tree-pass.h"
     30 #include "ssa.h"
     31 #include "cgraph.h"
     32 #include "pretty-print.h"
     33 #include "diagnostic-core.h"
     34 #include "fold-const.h"
     35 #include "internal-fn.h"
     36 #include "langhooks.h"
     37 #include "gimplify.h"
     38 #include "gimple-iterator.h"
     39 #include "gimplify-me.h"
     40 #include "gimple-walk.h"
     41 #include "tree-cfg.h"
     42 #include "tree-into-ssa.h"
     43 #include "tree-nested.h"
     44 #include "stor-layout.h"
     45 #include "common/common-target.h"
     46 #include "omp-general.h"
     47 #include "omp-offload.h"
     48 #include "lto-section-names.h"
     49 #include "gomp-constants.h"
     50 #include "gimple-pretty-print.h"
     51 #include "intl.h"
     52 #include "stringpool.h"
     53 #include "attribs.h"
     54 #include "cfgloop.h"
     55 #include "context.h"
     56 #include "convert.h"
     57 #include "opts.h"
     58 
/* Describe the OpenACC looping structure of a function.  The entire
   function is held in a 'NULL' loop.  Loops form a tree via the
   parent/child/sibling links below.  */

struct oacc_loop
{
  oacc_loop *parent; /* Containing loop.  */

  oacc_loop *child; /* First inner loop.  */

  oacc_loop *sibling; /* Next loop within same parent.  */

  location_t loc; /* Location of the loop start.  */

  gcall *marker; /* Initial head marker.  */

  /* Head/tail marker calls, one slot per partitioning axis.  */
  gcall *heads[GOMP_DIM_MAX];  /* Head marker functions.  */
  gcall *tails[GOMP_DIM_MAX];  /* Tail marker functions.  */

  tree routine;  /* Pseudo-loop enclosing a routine.  */

  unsigned mask;   /* Partitioning mask.  */
  unsigned e_mask; /* Partitioning of element loops (when tiling).  */
  unsigned inner;  /* Partitioning of inner loops.  */
  unsigned flags;  /* Partitioning flags.  */
  vec<gcall *> ifns;  /* Contained loop abstraction functions.  */
  tree chunk_size; /* Chunk size.  */
  gcall *head_end; /* Final marker of head sequence.  */
};
     87 
/* Holds offload tables with decls.  offload_funcs collects outlined
   offload region functions, offload_vars collects "omp declare target"
   variables; both are consumed by omp_finish_file below.  */
vec<tree, va_gc> *offload_funcs, *offload_vars;
     90 
     91 /* Return level at which oacc routine may spawn a partitioned loop, or
     92    -1 if it is not a routine (i.e. is an offload fn).  */
     93 
     94 int
     95 oacc_fn_attrib_level (tree attr)
     96 {
     97   tree pos = TREE_VALUE (attr);
     98 
     99   if (!TREE_PURPOSE (pos))
    100     return -1;
    101 
    102   int ix = 0;
    103   for (ix = 0; ix != GOMP_DIM_MAX;
    104        ix++, pos = TREE_CHAIN (pos))
    105     if (!integer_zerop (TREE_PURPOSE (pos)))
    106       break;
    107 
    108   return ix;
    109 }
    110 
/* Helper function for omp_finish_file routine.  Takes decls from V_DECLS and
   adds their addresses and sizes to constructor-vector V_CTOR.
   Functions contribute one element (the address); variables contribute
   an (address, size) pair.  */

static void
add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls,
					 vec<constructor_elt, va_gc> *v_ctor)
{
  unsigned len = vec_safe_length (v_decls);
  for (unsigned i = 0; i < len; i++)
    {
      tree it = (*v_decls)[i];
      bool is_var = VAR_P (it);
      /* On the accelerator, a "declare target link" var must also have
	 a DECL_VALUE_EXPR (dereference of its link pointer) to count.  */
      bool is_link_var
	= is_var
#ifdef ACCEL_COMPILER
	  && DECL_HAS_VALUE_EXPR_P (it)
#endif
	  && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (it));

      /* See also omp_finish_file and output_offload_tables in lto-cgraph.cc.  */
      if (!in_lto_p && !symtab_node::get (it))
	continue;

      tree size = NULL_TREE;
      if (is_var)
	size = fold_convert (const_ptr_type_node, DECL_SIZE_UNIT (it));

      tree addr;
      if (!is_link_var)
	addr = build_fold_addr_expr (it);
      else
	{
#ifdef ACCEL_COMPILER
	  /* For "omp declare target link" vars add address of the pointer to
	     the target table, instead of address of the var.  */
	  tree value_expr = DECL_VALUE_EXPR (it);
	  tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
	  varpool_node::finalize_decl (link_ptr_decl);
	  addr = build_fold_addr_expr (link_ptr_decl);
#else
	  addr = build_fold_addr_expr (it);
#endif

	  /* Most significant bit of the size marks "omp declare target link"
	     vars in host and target tables.  */
	  unsigned HOST_WIDE_INT isize = tree_to_uhwi (size);
	  isize |= 1ULL << (int_size_in_bytes (const_ptr_type_node)
			    * BITS_PER_UNIT - 1);
	  size = wide_int_to_tree (const_ptr_type_node, isize);
	}

      CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, addr);
      if (is_var)
	CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, size);
    }
}
    167 
    168 /* Return true if DECL is a function for which its references should be
    169    analyzed.  */
    170 
    171 static bool
    172 omp_declare_target_fn_p (tree decl)
    173 {
    174   return (TREE_CODE (decl) == FUNCTION_DECL
    175 	  && lookup_attribute ("omp declare target", DECL_ATTRIBUTES (decl))
    176 	  && !lookup_attribute ("omp declare target host",
    177 				DECL_ATTRIBUTES (decl))
    178 	  && (!flag_openacc
    179 	      || oacc_get_fn_attrib (decl) == NULL_TREE));
    180 }
    181 
    182 /* Return true if DECL Is a variable for which its initializer references
    183    should be analyzed.  */
    184 
    185 static bool
    186 omp_declare_target_var_p (tree decl)
    187 {
    188   return (VAR_P (decl)
    189 	  && lookup_attribute ("omp declare target", DECL_ATTRIBUTES (decl))
    190 	  && !lookup_attribute ("omp declare target link",
    191 				DECL_ATTRIBUTES (decl)));
    192 }
    193 
/* Helper function for omp_discover_implicit_declare_target, called through
   walk_tree.  Mark referenced FUNCTION_DECLs implicitly as
   declare target to.  DATA is a vec<tree> * worklist; decls whose
   bodies still need scanning are pushed onto it.  */

static tree
omp_discover_declare_target_tgt_fn_r (tree *tp, int *walk_subtrees, void *data)
{
  if (TREE_CODE (*tp) == CALL_EXPR
      && CALL_EXPR_FN (*tp)
      && TREE_CODE (CALL_EXPR_FN (*tp)) == ADDR_EXPR
      && TREE_CODE (TREE_OPERAND (CALL_EXPR_FN (*tp), 0)) == FUNCTION_DECL
      && lookup_attribute ("omp declare variant base",
			   DECL_ATTRIBUTES (TREE_OPERAND (CALL_EXPR_FN (*tp),
							  0))))
    {
      /* Call to a function with "omp declare variant base": recurse on
	 every variant FUNCTION_DECL recorded in the attribute chain so
	 the variants get marked as well.  */
      tree fn = TREE_OPERAND (CALL_EXPR_FN (*tp), 0);
      for (tree attr = DECL_ATTRIBUTES (fn); attr; attr = TREE_CHAIN (attr))
	{
	  attr = lookup_attribute ("omp declare variant base", attr);
	  if (attr == NULL_TREE)
	    break;
	  tree purpose = TREE_PURPOSE (TREE_VALUE (attr));
	  if (TREE_CODE (purpose) == FUNCTION_DECL)
	    omp_discover_declare_target_tgt_fn_r (&purpose, walk_subtrees, data);
	}
    }
  else if (TREE_CODE (*tp) == FUNCTION_DECL)
    {
      tree decl = *tp;
      tree id = get_identifier ("omp declare target");
      symtab_node *node = symtab_node::get (*tp);
      if (node != NULL)
	{
	  /* Walk any alias chain, marking each alias on the way as
	     declare target too (unless it already is, or is explicitly
	     "declare target host").  */
	  while (node->alias_target
		 && TREE_CODE (node->alias_target) == FUNCTION_DECL)
	    {
	      if (!omp_declare_target_fn_p (node->decl)
		  && !lookup_attribute ("omp declare target host",
					DECL_ATTRIBUTES (node->decl)))
		{
		  node->offloadable = 1;
		  DECL_ATTRIBUTES (node->decl)
		    = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (node->decl));
		}
	      node = symtab_node::get (node->alias_target);
	    }
	  /* Continue until the ultimate alias target, marking every
	     intermediate node.  */
	  symtab_node *new_node = node->ultimate_alias_target ();
	  decl = new_node->decl;
	  while (node != new_node)
	    {
	      if (!omp_declare_target_fn_p (node->decl)
		  && !lookup_attribute ("omp declare target host",
					DECL_ATTRIBUTES (node->decl)))
		{
		  node->offloadable = 1;
		  DECL_ATTRIBUTES (node->decl)
		    = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (node->decl));
		}
	      gcc_assert (node->alias && node->analyzed);
	      node = node->get_alias_target ();
	    }
	  node->offloadable = 1;
	  if (ENABLE_OFFLOADING)
	    g->have_offload = true;
	}
      /* Already declare target (or explicitly host-only): nothing
	 further to discover here.  */
      if (omp_declare_target_fn_p (decl)
	  || lookup_attribute ("omp declare target host",
			       DECL_ATTRIBUTES (decl)))
	return NULL_TREE;

      /* Queue the body for scanning and mark the decl itself.  */
      if (!DECL_EXTERNAL (decl) && DECL_SAVED_TREE (decl))
	((vec<tree> *) data)->safe_push (decl);
      DECL_ATTRIBUTES (decl) = tree_cons (id, NULL_TREE,
					  DECL_ATTRIBUTES (decl));
    }
  else if (TYPE_P (*tp))
    *walk_subtrees = 0;
  /* else if (TREE_CODE (*tp) == OMP_TARGET)
       {
	 if (tree dev = omp_find_clause (OMP_TARGET_CLAUSES (*tp)))
	   if (OMP_DEVICE_ANCESTOR (dev))
	     *walk_subtrees = 0;
       } */
  return NULL_TREE;
}
    279 
    280 /* Similarly, but ignore references outside of OMP_TARGET regions.  */
    281 
    282 static tree
    283 omp_discover_declare_target_fn_r (tree *tp, int *walk_subtrees, void *data)
    284 {
    285   if (TREE_CODE (*tp) == OMP_TARGET)
    286     {
    287       /* And not OMP_DEVICE_ANCESTOR.  */
    288       walk_tree_without_duplicates (&OMP_TARGET_BODY (*tp),
    289 				    omp_discover_declare_target_tgt_fn_r,
    290 				    data);
    291       *walk_subtrees = 0;
    292     }
    293   else if (TYPE_P (*tp))
    294     *walk_subtrees = 0;
    295   return NULL_TREE;
    296 }
    297 
/* Helper function for omp_discover_implicit_declare_target, called through
   walk_tree.  Mark referenced FUNCTION_DECLs implicitly as
   declare target to.  Global variables referenced from declare target
   initializers get the same treatment.  DATA is the vec<tree> *
   worklist.  */

static tree
omp_discover_declare_target_var_r (tree *tp, int *walk_subtrees, void *data)
{
  if (TREE_CODE (*tp) == FUNCTION_DECL)
    return omp_discover_declare_target_tgt_fn_r (tp, walk_subtrees, data);
  else if (VAR_P (*tp)
	   && is_global_var (*tp)
	   && !omp_declare_target_var_p (*tp))
    {
      tree id = get_identifier ("omp declare target");
      /* An implicit "to" conflicts with an explicit declare target
	 "link"; diagnose and drop the link attribute.  */
      if (lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (*tp)))
	{
	  error_at (DECL_SOURCE_LOCATION (*tp),
		    "%qD specified both in declare target %<link%> and "
		    "implicitly in %<to%> clauses", *tp);
	  DECL_ATTRIBUTES (*tp)
	    = remove_attribute ("omp declare target link", DECL_ATTRIBUTES (*tp));
	}
      /* Queue the initializer for scanning too.  */
      if (TREE_STATIC (*tp) && lang_hooks.decls.omp_get_decl_init (*tp))
	((vec<tree> *) data)->safe_push (*tp);
      DECL_ATTRIBUTES (*tp) = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (*tp));
      symtab_node *node = symtab_node::get (*tp);
      if (node != NULL && !node->offloadable)
	{
	  node->offloadable = 1;
	  if (ENABLE_OFFLOADING)
	    {
	      g->have_offload = true;
	      if (is_a <varpool_node *> (node))
		vec_safe_push (offload_vars, node->decl);
	    }
	}
    }
  else if (TYPE_P (*tp))
    *walk_subtrees = 0;
  return NULL_TREE;
}
    339 
    340 /* Perform the OpenMP implicit declare target to discovery.  */
    341 
    342 void
    343 omp_discover_implicit_declare_target (void)
    344 {
    345   cgraph_node *node;
    346   varpool_node *vnode;
    347   auto_vec<tree> worklist;
    348 
    349   FOR_EACH_DEFINED_FUNCTION (node)
    350     if (DECL_SAVED_TREE (node->decl))
    351       {
    352 	struct cgraph_node *cgn;
    353         if (omp_declare_target_fn_p (node->decl))
    354 	  worklist.safe_push (node->decl);
    355 	else if (DECL_STRUCT_FUNCTION (node->decl)
    356 		 && DECL_STRUCT_FUNCTION (node->decl)->has_omp_target)
    357 	  worklist.safe_push (node->decl);
    358 	for (cgn = first_nested_function (node);
    359 	     cgn; cgn = next_nested_function (cgn))
    360 	  if (omp_declare_target_fn_p (cgn->decl))
    361 	    worklist.safe_push (cgn->decl);
    362 	  else if (DECL_STRUCT_FUNCTION (cgn->decl)
    363 		   && DECL_STRUCT_FUNCTION (cgn->decl)->has_omp_target)
    364 	    worklist.safe_push (cgn->decl);
    365       }
    366   FOR_EACH_VARIABLE (vnode)
    367     if (lang_hooks.decls.omp_get_decl_init (vnode->decl)
    368 	&& omp_declare_target_var_p (vnode->decl))
    369       worklist.safe_push (vnode->decl);
    370   while (!worklist.is_empty ())
    371     {
    372       tree decl = worklist.pop ();
    373       if (VAR_P (decl))
    374 	walk_tree_without_duplicates (lang_hooks.decls.omp_get_decl_init (decl),
    375 				      omp_discover_declare_target_var_r,
    376 				      &worklist);
    377       else if (omp_declare_target_fn_p (decl))
    378 	walk_tree_without_duplicates (&DECL_SAVED_TREE (decl),
    379 				      omp_discover_declare_target_tgt_fn_r,
    380 				      &worklist);
    381       else
    382 	walk_tree_without_duplicates (&DECL_SAVED_TREE (decl),
    383 				      omp_discover_declare_target_fn_r,
    384 				      &worklist);
    385     }
    386 
    387   lang_hooks.decls.omp_finish_decl_inits ();
    388 }
    389 
    390 
/* Create new symbols containing (address, size) pairs for global variables,
   marked with "omp declare target" attribute, as well as addresses for the
   functions, which are outlined offloading regions.  */
void
omp_finish_file (void)
{
  unsigned num_funcs = vec_safe_length (offload_funcs);
  unsigned num_vars = vec_safe_length (offload_vars);

  if (num_funcs == 0 && num_vars == 0)
    return;

  if (targetm_common.have_named_sections)
    {
      /* Emit .offload_func_table / .offload_var_table array symbols
	 into their dedicated sections.  */
      vec<constructor_elt, va_gc> *v_f, *v_v;
      vec_alloc (v_f, num_funcs);
      /* Each variable contributes an (address, size) pair.  */
      vec_alloc (v_v, num_vars * 2);

      add_decls_addresses_to_decl_constructor (offload_funcs, v_f);
      add_decls_addresses_to_decl_constructor (offload_vars, v_v);

      tree vars_decl_type = build_array_type_nelts (pointer_sized_int_node,
						    vec_safe_length (v_v));
      tree funcs_decl_type = build_array_type_nelts (pointer_sized_int_node,
						     num_funcs);
      SET_TYPE_ALIGN (vars_decl_type, TYPE_ALIGN (pointer_sized_int_node));
      SET_TYPE_ALIGN (funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node));
      tree ctor_v = build_constructor (vars_decl_type, v_v);
      tree ctor_f = build_constructor (funcs_decl_type, v_f);
      TREE_CONSTANT (ctor_v) = TREE_CONSTANT (ctor_f) = 1;
      TREE_STATIC (ctor_v) = TREE_STATIC (ctor_f) = 1;
      tree funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
				    get_identifier (".offload_func_table"),
				    funcs_decl_type);
      tree vars_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
				   get_identifier (".offload_var_table"),
				   vars_decl_type);
      TREE_STATIC (funcs_decl) = TREE_STATIC (vars_decl) = 1;
      /* Do not align tables more than TYPE_ALIGN (pointer_sized_int_node),
	 otherwise a joint table in a binary will contain padding between
	 tables from multiple object files.  */
      DECL_USER_ALIGN (funcs_decl) = DECL_USER_ALIGN (vars_decl) = 1;
      SET_DECL_ALIGN (funcs_decl, TYPE_ALIGN (funcs_decl_type));
      SET_DECL_ALIGN (vars_decl, TYPE_ALIGN (vars_decl_type));
      DECL_INITIAL (funcs_decl) = ctor_f;
      DECL_INITIAL (vars_decl) = ctor_v;
      set_decl_section_name (funcs_decl, OFFLOAD_FUNC_TABLE_SECTION_NAME);
      set_decl_section_name (vars_decl, OFFLOAD_VAR_TABLE_SECTION_NAME);

      varpool_node::finalize_decl (vars_decl);
      varpool_node::finalize_decl (funcs_decl);
    }
  else
    {
      /* Targets without named-section support record each offload
	 symbol individually via the target hook.  */
      for (unsigned i = 0; i < num_funcs; i++)
	{
	  tree it = (*offload_funcs)[i];
	  /* See also add_decls_addresses_to_decl_constructor
	     and output_offload_tables in lto-cgraph.cc.  */
	  if (!in_lto_p && !symtab_node::get (it))
	    continue;
	  targetm.record_offload_symbol (it);
	}
      for (unsigned i = 0; i < num_vars; i++)
	{
	  tree it = (*offload_vars)[i];
	  if (!in_lto_p && !symtab_node::get (it))
	    continue;
#ifdef ACCEL_COMPILER
	  /* For "declare target link" vars record the link pointer
	     instead of the var itself (cf.
	     add_decls_addresses_to_decl_constructor).  */
	  if (DECL_HAS_VALUE_EXPR_P (it)
	      && lookup_attribute ("omp declare target link",
				   DECL_ATTRIBUTES (it)))
	    {
	      tree value_expr = DECL_VALUE_EXPR (it);
	      tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
	      targetm.record_offload_symbol (link_ptr_decl);
	      varpool_node::finalize_decl (link_ptr_decl);
	    }
	  else
#endif
	    targetm.record_offload_symbol (it);
	}
    }
}
    475 
    476 /* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
    477    axis DIM.  Return a tmp var holding the result.  */
    478 
    479 static tree
    480 oacc_dim_call (bool pos, int dim, gimple_seq *seq)
    481 {
    482   tree arg = build_int_cst (unsigned_type_node, dim);
    483   tree size = create_tmp_var (integer_type_node);
    484   enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
    485   gimple *call = gimple_build_call_internal (fn, 1, arg);
    486 
    487   gimple_call_set_lhs (call, size);
    488   gimple_seq_add_stmt (seq, call);
    489 
    490   return size;
    491 }
    492 
    493 /* Find the number of threads (POS = false), or thread number (POS =
    494    true) for an OpenACC region partitioned as MASK.  Setup code
    495    required for the calculation is added to SEQ.  */
    496 
    497 static tree
    498 oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
    499 {
    500   tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1);
    501   unsigned ix;
    502 
    503   /* Start at gang level, and examine relevant dimension indices.  */
    504   for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
    505     if (GOMP_DIM_MASK (ix) & mask)
    506       {
    507 	if (res)
    508 	  {
    509 	    /* We had an outer index, so scale that by the size of
    510 	       this dimension.  */
    511 	    tree n = oacc_dim_call (false, ix, seq);
    512 	    res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
    513 	  }
    514 	if (pos)
    515 	  {
    516 	    /* Determine index in this dimension.  */
    517 	    tree id = oacc_dim_call (true, ix, seq);
    518 	    if (res)
    519 	      res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
    520 	    else
    521 	      res = id;
    522 	  }
    523       }
    524 
    525   if (res == NULL_TREE)
    526     res = integer_zero_node;
    527 
    528   return res;
    529 }
    530 
/* Transform IFN_GOACC_LOOP calls to actual code.  See
   expand_oacc_for for where these are generated.  At the vector
   level, we stride loops, such that each member of a warp will
   operate on adjacent iterations.  At the worker and gang level,
   each gang/warp executes a set of contiguous iterations.  Chunking
   can override this such that each iteration engine executes a
   contiguous chunk, and then moves on to stride to the next chunk.  */

static void
oacc_xform_loop (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  /* Call arguments: 0 = loop kind, 1 = direction, 2 = range,
     3 = step, 4 = chunk size, 5 = partitioning mask, 6 (OFFSET/BOUND
     only) = chunk number / offset.  */
  enum ifn_goacc_loop_kind code
    = (enum ifn_goacc_loop_kind) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
  tree dir = gimple_call_arg (call, 1);
  tree range = gimple_call_arg (call, 2);
  tree step = gimple_call_arg (call, 3);
  tree chunk_size = NULL_TREE;
  unsigned mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 5));
  tree lhs = gimple_call_lhs (call);
  tree type = NULL_TREE;
  tree diff_type = TREE_TYPE (range);
  tree r = NULL_TREE;
  gimple_seq seq = NULL;
  bool chunking = false, striding = true;
  unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
  unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)

  /* Skip lowering if return value of IFN_GOACC_LOOP call is not used.  */
  if (!lhs)
    {
      gsi_replace_with_seq (&gsi, seq, true);
      return;
    }

  type = TREE_TYPE (lhs);

#ifdef ACCEL_COMPILER
  /* Only the accelerator compiler honors the chunk-size argument;
     the host compiler keeps the striding/chunking defaults above.  */
  chunk_size = gimple_call_arg (call, 4);
  if (integer_minus_onep (chunk_size)  /* Force static allocation.  */
      || integer_zerop (chunk_size))   /* Default (also static).  */
    {
      /* If we're at the gang level, we want each to execute a
	 contiguous run of iterations.  Otherwise we want each element
	 to stride.  */
      striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG));
      chunking = false;
    }
  else
    {
      /* Chunk of size 1 is striding.  */
      striding = integer_onep (chunk_size);
      chunking = !striding;
    }
#endif

  /* striding=true, chunking=true
       -> invalid.
     striding=true, chunking=false
       -> chunks=1
     striding=false,chunking=true
       -> chunks=ceil (range/(chunksize*threads*step))
     striding=false,chunking=false
       -> chunk_size=ceil(range/(threads*step)),chunks=1  */
  push_gimplify_context (true);

  switch (code)
    {
    default: gcc_unreachable ();

    case IFN_GOACC_LOOP_CHUNKS:
      if (!chunking)
	r = build_int_cst (type, 1);
      else
	{
	  /* chunk_max
	     = (range - dir) / (chunks * step * num_threads) + dir  */
	  tree per = oacc_thread_numbers (false, mask, &seq);
	  per = fold_convert (type, per);
	  chunk_size = fold_convert (type, chunk_size);
	  per = fold_build2 (MULT_EXPR, type, per, chunk_size);
	  per = fold_build2 (MULT_EXPR, type, per, step);
	  r = build2 (MINUS_EXPR, type, range, dir);
	  r = build2 (PLUS_EXPR, type, r, per);
	  r = build2 (TRUNC_DIV_EXPR, type, r, per);
	}
      break;

    case IFN_GOACC_LOOP_STEP:
      {
	/* If striding, step by the entire compute volume, otherwise
	   step by the inner volume.  */
	unsigned volume = striding ? mask : inner_mask;

	r = oacc_thread_numbers (false, volume, &seq);
	r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
      }
      break;

    case IFN_GOACC_LOOP_OFFSET:
      /* Enable vectorization on non-SIMT targets.  */
      if (!targetm.simt.vf
	  && outer_mask == GOMP_DIM_MASK (GOMP_DIM_VECTOR)
	  /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
	     the loop.  */
	  && (flag_tree_loop_vectorize
	      || !OPTION_SET_P (flag_tree_loop_vectorize)))
	{
	  basic_block bb = gsi_bb (gsi);
	  class loop *parent = bb->loop_father;
	  class loop *body = parent->inner;

	  parent->force_vectorize = true;
	  parent->safelen = INT_MAX;

	  /* "Chunking loops" may have inner loops.  */
	  if (parent->inner)
	    {
	      body->force_vectorize = true;
	      body->safelen = INT_MAX;
	    }

	  cfun->has_force_vectorize_loops = true;
	}
      if (striding)
	{
	  /* Striding: offset is simply this thread's number.  */
	  r = oacc_thread_numbers (true, mask, &seq);
	  r = fold_convert (diff_type, r);
	}
      else
	{
	  /* Blocked (chunked or contiguous-run) schedule: compute
	     outer/inner offsets within a chunk span.  */
	  tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
	  tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
				     inner_size, outer_size);

	  volume = fold_convert (diff_type, volume);
	  if (chunking)
	    chunk_size = fold_convert (diff_type, chunk_size);
	  else
	    {
	      /* chunk_size = ceil (range / (volume * step)).  */
	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);

	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
	    }

	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
			      fold_convert (diff_type, inner_size));
	  r = oacc_thread_numbers (true, outer_mask, &seq);
	  r = fold_convert (diff_type, r);
	  r = build2 (MULT_EXPR, diff_type, r, span);

	  tree inner = oacc_thread_numbers (true, inner_mask, &seq);
	  inner = fold_convert (diff_type, inner);
	  r = fold_build2 (PLUS_EXPR, diff_type, r, inner);

	  if (chunking)
	    {
	      /* Advance by whole chunks: arg 6 is the chunk number.  */
	      tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
	      tree per
		= fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
	      per = build2 (MULT_EXPR, diff_type, per, chunk);

	      r = build2 (PLUS_EXPR, diff_type, r, per);
	    }
	}
      r = fold_build2 (MULT_EXPR, diff_type, r, step);
      if (type != diff_type)
	r = fold_convert (type, r);
      break;

    case IFN_GOACC_LOOP_BOUND:
      if (striding)
	r = range;
      else
	{
	  /* Mirror the OFFSET computation of span, then clamp
	     offset + span to the loop range.  */
	  tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
	  tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
				     inner_size, outer_size);

	  volume = fold_convert (diff_type, volume);
	  if (chunking)
	    chunk_size = fold_convert (diff_type, chunk_size);
	  else
	    {
	      /* chunk_size = ceil (range / (volume * step)).  */
	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);

	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
	    }

	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
			      fold_convert (diff_type, inner_size));

	  r = fold_build2 (MULT_EXPR, diff_type, span, step);

	  /* Arg 6 is the offset computed by IFN_GOACC_LOOP_OFFSET.  */
	  tree offset = gimple_call_arg (call, 6);
	  r = build2 (PLUS_EXPR, diff_type, r,
		      fold_convert (diff_type, offset));
	  /* MIN for an ascending loop, MAX for a descending one.  */
	  r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
		      diff_type, r, range);
	}
      if (diff_type != type)
	r = fold_convert (type, r);
      break;
    }

  gimplify_assign (lhs, r, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}
    748 
/* Transform a GOACC_TILE call.  Determines the element loop span for
   the specified loop of the nest.  This is 1 if we're not tiling.

   GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element);  */

static void
oacc_xform_tile (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  unsigned collapse = tree_to_uhwi (gimple_call_arg (call, 0));
  /* Inner loops have higher loop_nos.  */
  unsigned loop_no = tree_to_uhwi (gimple_call_arg (call, 1));
  tree tile_size = gimple_call_arg (call, 2);
  unsigned e_mask = tree_to_uhwi (gimple_call_arg (call, 4));
  tree lhs = gimple_call_lhs (call);
  tree type = TREE_TYPE (lhs);
  gimple_seq seq = NULL;
  tree span = build_int_cst (type, 1);

  /* Element loops may only be vector and/or worker partitioned.  */
  gcc_assert (!(e_mask
		& ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
		    | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
  push_gimplify_context (!seen_error ());

#ifndef ACCEL_COMPILER
  /* Partitioning disabled on host compilers.  */
  e_mask = 0;
#endif
  if (!e_mask)
    /* Not partitioning.  */
    span = integer_one_node;
  else if (!integer_zerop (tile_size))
    /* User explicitly specified size.  */
    span = tile_size;
  else
    {
      /* Pick a size based on the partitioning of the element loop and
	 the number of loop nests.  */
      tree first_size = NULL_TREE;
      tree second_size = NULL_TREE;

      if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
	first_size = oacc_dim_call (false, GOMP_DIM_VECTOR, &seq);
      if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
	second_size = oacc_dim_call (false, GOMP_DIM_WORKER, &seq);

      if (!first_size)
	{
	  first_size = second_size;
	  second_size = NULL_TREE;
	}

      if (loop_no + 1 == collapse)
	{
	  /* Innermost loop of the nest.  */
	  span = first_size;
	  if (!loop_no && second_size)
	    span = fold_build2 (MULT_EXPR, TREE_TYPE (span),
				span, second_size);
	}
      else if (loop_no + 2 == collapse)
	span = second_size;
      else
	span = NULL_TREE;

      if (!span)
	/* There's no obvious element size for this loop.  Options
	   are 1, first_size or some non-unity constant (32 is my
	   favourite).   We should gather some statistics.  */
	span = first_size;
    }

  span = fold_convert (type, span);
  gimplify_assign (lhs, span, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}
    827 
/* Default partitioned and minimum partitioned dimensions.  */

/* Per-axis default partition sizes, indexed by GOMP_DIM_*; -1 means
   "not specified", leaving the choice to the target.  */
static int oacc_default_dims[GOMP_DIM_MAX];
/* Per-axis minimum permissible partition sizes, indexed likewise.  */
static int oacc_min_dims[GOMP_DIM_MAX];
    832 
    833 int
    834 oacc_get_default_dim (int dim)
    835 {
    836   gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
    837   return oacc_default_dims[dim];
    838 }
    839 
    840 int
    841 oacc_get_min_dim (int dim)
    842 {
    843   gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
    844   return oacc_min_dims[dim];
    845 }
    846 
/* Parse the default dimension parameter.  This is a set of
   :-separated optional compute dimensions.  Each specified dimension
   is a positive integer.  When device type support is added, it is
   planned to be a comma separated list of such compute dimensions,
   with all but the first prefixed by the colon-terminated device
   type.  */

static void
oacc_parse_default_dims (const char *dims)
{
  int ix;

  /* Start every axis at "unspecified" default and unity minimum.  */
  for (ix = GOMP_DIM_MAX; ix--;)
    {
      oacc_default_dims[ix] = -1;
      oacc_min_dims[ix] = 1;
    }

#ifndef ACCEL_COMPILER
  /* Cannot be overridden on the host.  */
  dims = NULL;
#endif
  if (dims)
    {
      const char *pos = dims;

      for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++)
	{
	  /* Every axis after the first must be preceded by a ':'
	     separator.  */
	  if (ix)
	    {
	      if (*pos != ':')
		goto malformed;
	      pos++;
	    }

	  /* An empty element (next char is the ':' separator) leaves
	     this axis at its default.  */
	  if (*pos != ':')
	    {
	      long val;
	      const char *eptr;

	      errno = 0;
	      val = strtol (pos, CONST_CAST (char **, &eptr), 10);
	      /* Reject parse failures, non-positive values and values
		 that do not fit in an int.  */
	      if (errno || val <= 0 || (int) val != val)
		goto malformed;
	      pos = eptr;
	      oacc_default_dims[ix] = (int) val;
	    }
	}
      /* Any text left over after the last axis is an error too.  */
      if (*pos)
	{
	malformed:
	  error_at (UNKNOWN_LOCATION,
		    "%<-fopenacc-dim%> operand is malformed at %qs", pos);
	}
    }

  /* Allow the backend to validate the dimensions.  */
  targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1, 0);
  targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2, 0);
}
    907 
/* Validate and update the dimensions for offloaded FN.  ATTRS is the
   raw attribute.  DIMS is an array of dimensions, which is filled in.
   LEVEL is the partitioning level of a routine, or -1 for an offload
   region itself.  USED is the mask of partitioned execution in the
   function.  */

static void
oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
{
  tree purpose[GOMP_DIM_MAX];
  unsigned ix;
  tree pos = TREE_VALUE (attrs);

  /* Make sure the attribute creator attached the dimension
     information.  */
  gcc_assert (pos);

  /* Unpack the attribute's TREE_LIST into DIMS, remembering each
     node's TREE_PURPOSE so the list can be rebuilt below.  A NULL
     TREE_VALUE maps to -1, meaning "still to be determined".  */
  for (ix = 0; ix != GOMP_DIM_MAX; ix++)
    {
      purpose[ix] = TREE_PURPOSE (pos);
      tree val = TREE_VALUE (pos);
      dims[ix] = val ? TREE_INT_CST_LOW (val) : -1;
      pos = TREE_CHAIN (pos);
    }

  /* The mismatch diagnostics below run on the host compiler only
     (disabled under ACCEL_COMPILER).  */
  bool check = true;
#ifdef ACCEL_COMPILER
  check = false;
#endif
  if (check
      && warn_openacc_parallelism
      && !lookup_attribute ("oacc kernels", DECL_ATTRIBUTES (fn)))
    {
      static char const *const axes[] =
      /* Must be kept in sync with GOMP_DIM enumeration.  */
	{ "gang", "worker", "vector" };
      /* For a routine, only check axes at its partitioning level and
	 deeper.  */
      for (ix = level >= 0 ? level : 0; ix != GOMP_DIM_MAX; ix++)
	if (dims[ix] < 0)
	  ; /* Defaulting axis.  */
	else if ((used & GOMP_DIM_MASK (ix)) && dims[ix] == 1)
	  /* There is partitioned execution, but the user requested a
	     dimension size of 1.  They're probably confused.  */
	  warning_at (DECL_SOURCE_LOCATION (fn), OPT_Wopenacc_parallelism,
		      "region contains %s partitioned code but"
		      " is not %s partitioned", axes[ix], axes[ix]);
	else if (!(used & GOMP_DIM_MASK (ix)) && dims[ix] != 1)
	  /* The dimension is explicitly partitioned to non-unity, but
	     no use is made within the region.  */
	  warning_at (DECL_SOURCE_LOCATION (fn), OPT_Wopenacc_parallelism,
		      "region is %s partitioned but"
		      " does not contain %s partitioned code",
		      axes[ix], axes[ix]);
    }

  /* Let the target adjust/validate the dimensions; it reports whether
     it changed anything.  */
  bool changed = targetm.goacc.validate_dims (fn, dims, level, used);

  /* Default anything left to 1 or a partitioned default.  */
  for (ix = 0; ix != GOMP_DIM_MAX; ix++)
    if (dims[ix] < 0)
      {
	/* The OpenACC spec says 'If the [num_gangs] clause is not
	   specified, an implementation-defined default will be used;
	   the default may depend on the code within the construct.'
	   (2.5.6).  Thus an implementation is free to choose
	   non-unity default for a parallel region that doesn't have
	   any gang-partitioned loops.  However, it appears that there
	   is a sufficient body of user code that expects non-gang
	   partitioned regions to not execute in gang-redundant mode.
	   So we (a) don't warn about the non-portability and (b) pick
	   the minimum permissible dimension size when there is no
	   partitioned execution.  Otherwise we pick the global
	   default for the dimension, which the user can control.  The
	   same wording and logic applies to num_workers and
	   vector_length, however the worker- or vector- single
	   execution doesn't have the same impact as gang-redundant
	   execution.  (If the minimum gang-level partitioning is not 1,
	   the target is probably too confusing.)  */
	dims[ix] = (used & GOMP_DIM_MASK (ix)
		    ? oacc_default_dims[ix] : oacc_min_dims[ix]);
	changed = true;
      }

  if (changed)
    {
      /* Replace the attribute with new values.  */
      pos = NULL_TREE;
      for (ix = GOMP_DIM_MAX; ix--;)
	pos = tree_cons (purpose[ix],
			 build_int_cst (integer_type_node, dims[ix]), pos);
      oacc_replace_fn_attrib (fn, pos);
    }
}
   1000 
   1001 /* Create an empty OpenACC loop structure at LOC.  */
   1002 
   1003 static oacc_loop *
   1004 new_oacc_loop_raw (oacc_loop *parent, location_t loc)
   1005 {
   1006   oacc_loop *loop = XCNEW (oacc_loop);
   1007 
   1008   loop->parent = parent;
   1009 
   1010   if (parent)
   1011     {
   1012       loop->sibling = parent->child;
   1013       parent->child = loop;
   1014     }
   1015 
   1016   loop->loc = loc;
   1017   return loop;
   1018 }
   1019 
   1020 /* Create an outermost, dummy OpenACC loop for offloaded function
   1021    DECL.  */
   1022 
   1023 static oacc_loop *
   1024 new_oacc_loop_outer (tree decl)
   1025 {
   1026   return new_oacc_loop_raw (NULL, DECL_SOURCE_LOCATION (decl));
   1027 }
   1028 
   1029 /* Start a new OpenACC loop  structure beginning at head marker HEAD.
   1030    Link into PARENT loop.  Return the new loop.  */
   1031 
   1032 static oacc_loop *
   1033 new_oacc_loop (oacc_loop *parent, gcall *marker)
   1034 {
   1035   oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (marker));
   1036 
   1037   loop->marker = marker;
   1038 
   1039   /* TODO: This is where device_type flattening would occur for the loop
   1040      flags.  */
   1041 
   1042   loop->flags = TREE_INT_CST_LOW (gimple_call_arg (marker, 3));
   1043 
   1044   tree chunk_size = integer_zero_node;
   1045   if (loop->flags & OLF_GANG_STATIC)
   1046     chunk_size = gimple_call_arg (marker, 4);
   1047   loop->chunk_size = chunk_size;
   1048 
   1049   return loop;
   1050 }
   1051 
   1052 /* Create a dummy loop encompassing a call to a openACC routine.
   1053    Extract the routine's partitioning requirements.  */
   1054 
   1055 static void
   1056 new_oacc_loop_routine (oacc_loop *parent, gcall *call, tree decl, tree attrs)
   1057 {
   1058   oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (call));
   1059   int level = oacc_fn_attrib_level (attrs);
   1060 
   1061   gcc_assert (level >= 0);
   1062 
   1063   loop->marker = call;
   1064   loop->routine = decl;
   1065   loop->mask = ((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
   1066 		^ (GOMP_DIM_MASK (level) - 1));
   1067 }
   1068 
   1069 /* Finish off the current OpenACC loop ending at tail marker TAIL.
   1070    Return the parent loop.  */
   1071 
   1072 static oacc_loop *
   1073 finish_oacc_loop (oacc_loop *loop)
   1074 {
   1075   /* If the loop has been collapsed, don't partition it.  */
   1076   if (loop->ifns.is_empty ())
   1077     loop->mask = loop->flags = 0;
   1078   return loop->parent;
   1079 }
   1080 
   1081 /* Free all OpenACC loop structures within LOOP (inclusive).  */
   1082 
   1083 static void
   1084 free_oacc_loop (oacc_loop *loop)
   1085 {
   1086   if (loop->sibling)
   1087     free_oacc_loop (loop->sibling);
   1088   if (loop->child)
   1089     free_oacc_loop (loop->child);
   1090 
   1091   loop->ifns.release ();
   1092   free (loop);
   1093 }
   1094 
/* Dump out the OpenACC loop head or tail beginning at FROM.  TITLE
   and LEVEL label the output.  Statements are printed from FROM up to
   (but not including) the next IFN_UNIQUE marker of the same kind,
   following single-successor blocks as needed.  */

static void
dump_oacc_loop_part (FILE *file, gcall *from, int depth,
		     const char *title, int level)
{
  /* Kind of the starting marker; the next marker of this kind
     terminates the dump.  */
  enum ifn_unique_kind kind
    = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));

  fprintf (file, "%*s%s-%d:\n", depth * 2, "", title, level);
  for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
    {
      gimple *stmt = gsi_stmt (gsi);

      if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	{
	  enum ifn_unique_kind k
	    = ((enum ifn_unique_kind) TREE_INT_CST_LOW
	       (gimple_call_arg (stmt, 0)));

	  /* Stop at the next same-kind marker (but not FROM
	     itself).  */
	  if (k == kind && stmt != from)
	    break;
	}
      print_gimple_stmt (file, stmt, depth * 2 + 2);

      /* Advance; when the block is exhausted step into its single
	 successor.  */
      gsi_next (&gsi);
      while (gsi_end_p (gsi))
	gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
    }
}
   1125 
   1126 /* Dump OpenACC loop LOOP, its children, and its siblings.  */
   1127 
   1128 static void
   1129 dump_oacc_loop (FILE *file, oacc_loop *loop, int depth)
   1130 {
   1131   int ix;
   1132 
   1133   fprintf (file, "%*sLoop %x(%x) %s:%u\n", depth * 2, "",
   1134 	   loop->flags, loop->mask,
   1135 	   LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc));
   1136 
   1137   if (loop->marker)
   1138     print_gimple_stmt (file, loop->marker, depth * 2);
   1139 
   1140   if (loop->routine)
   1141     fprintf (file, "%*sRoutine %s:%u:%s\n",
   1142 	     depth * 2, "", DECL_SOURCE_FILE (loop->routine),
   1143 	     DECL_SOURCE_LINE (loop->routine),
   1144 	     IDENTIFIER_POINTER (DECL_NAME (loop->routine)));
   1145 
   1146   for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
   1147     if (loop->heads[ix])
   1148       dump_oacc_loop_part (file, loop->heads[ix], depth, "Head", ix);
   1149   for (ix = GOMP_DIM_MAX; ix--;)
   1150     if (loop->tails[ix])
   1151       dump_oacc_loop_part (file, loop->tails[ix], depth, "Tail", ix);
   1152 
   1153   if (loop->child)
   1154     dump_oacc_loop (file, loop->child, depth + 1);
   1155   if (loop->sibling)
   1156     dump_oacc_loop (file, loop->sibling, depth);
   1157 }
   1158 
   1159 void debug_oacc_loop (oacc_loop *);
   1160 
   1161 /* Dump loops to stderr.  */
   1162 
   1163 DEBUG_FUNCTION void
   1164 debug_oacc_loop (oacc_loop *loop)
   1165 {
   1166   dump_oacc_loop (stderr, loop, 0);
   1167 }
   1168 
   1169 /* Provide diagnostics on OpenACC loop LOOP, its children, and its
   1170    siblings.  */
   1171 
   1172 static void
   1173 inform_oacc_loop (const oacc_loop *loop)
   1174 {
   1175   const char *gang
   1176     = loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG) ? " gang" : "";
   1177   const char *worker
   1178     = loop->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER) ? " worker" : "";
   1179   const char *vector
   1180     = loop->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR) ? " vector" : "";
   1181   const char *seq = loop->mask == 0 ? " seq" : "";
   1182   const dump_user_location_t loc
   1183     = dump_user_location_t::from_location_t (loop->loc);
   1184   dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
   1185 		   "assigned OpenACC%s%s%s%s loop parallelism\n", gang, worker,
   1186 		   vector, seq);
   1187 
   1188   if (loop->child)
   1189     inform_oacc_loop (loop->child);
   1190   if (loop->sibling)
   1191     inform_oacc_loop (loop->sibling);
   1192 }
   1193 
/* DFS walk of basic blocks BB onwards, creating OpenACC loop
   structures as we go.  By construction these loops are properly
   nested.  */

static void
oacc_loop_discover_walk (oacc_loop *loop, basic_block bb)
{
  /* Count of head/tail marker calls seen in the current sequence.  */
  int marker = 0;
  /* Markers still expected before the current sequence completes.  */
  int remaining = 0;

  if (bb->flags & BB_VISITED)
    return;

 follow:
  bb->flags |= BB_VISITED;

  /* Scan for loop markers.  */
  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);

      if (!is_gimple_call (stmt))
	continue;

      gcall *call = as_a <gcall *> (stmt);

      /* If this is a routine, make a dummy loop for it.  */
      if (tree decl = gimple_call_fndecl (call))
	if (tree attrs = oacc_get_fn_attrib (decl))
	  {
	    /* Routine calls must not occur inside a marker
	       sequence.  */
	    gcc_assert (!marker);
	    new_oacc_loop_routine (loop, call, decl, attrs);
	  }

      if (!gimple_call_internal_p (call))
	continue;

      switch (gimple_call_internal_fn (call))
	{
	default:
	  break;

	case IFN_GOACC_LOOP:
	case IFN_GOACC_TILE:
	  /* Record the abstraction function, so we can manipulate it
	     later.  */
	  loop->ifns.safe_push (call);
	  break;

	case IFN_UNIQUE:
	  enum ifn_unique_kind kind
	    = (enum ifn_unique_kind) (TREE_INT_CST_LOW
				      (gimple_call_arg (call, 0)));
	  if (kind == IFN_UNIQUE_OACC_HEAD_MARK
	      || kind == IFN_UNIQUE_OACC_TAIL_MARK)
	    {
	      /* A two-argument marker terminates the head or tail
		 sequence.  */
	      if (gimple_call_num_args (call) == 2)
		{
		  gcc_assert (marker && !remaining);
		  marker = 0;
		  if (kind == IFN_UNIQUE_OACC_TAIL_MARK)
		    loop = finish_oacc_loop (loop);
		  else
		    loop->head_end = call;
		}
	      else
		{
		  /* Argument 2 carries the total marker count for
		     this sequence.  */
		  int count = TREE_INT_CST_LOW (gimple_call_arg (call, 2));

		  if (!marker)
		    {
		      /* First marker of a head sequence opens a new
			 loop.  */
		      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
			loop = new_oacc_loop (loop, call);
		      remaining = count;
		    }
		  gcc_assert (count == remaining);
		  if (remaining)
		    {
		      remaining--;
		      /* Heads fill slots in increasing index order,
			 tails in decreasing order.  */
		      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
			loop->heads[marker] = call;
		      else
			loop->tails[remaining] = call;
		    }
		  marker++;
		}
	    }
	}
    }
  if (remaining || marker)
    {
      /* An unfinished marker sequence continues into the single
	 successor block; keep scanning there.  */
      bb = single_succ (bb);
      gcc_assert (single_pred_p (bb) && !(bb->flags & BB_VISITED));
      goto follow;
    }

  /* Walk successor blocks.  */
  edge e;
  edge_iterator ei;

  FOR_EACH_EDGE (e, ei, bb->succs)
    oacc_loop_discover_walk (loop, e->dest);
}
   1298 
   1299 /* LOOP is the first sibling.  Reverse the order in place and return
   1300    the new first sibling.  Recurse to child loops.  */
   1301 
   1302 static oacc_loop *
   1303 oacc_loop_sibling_nreverse (oacc_loop *loop)
   1304 {
   1305   oacc_loop *last = NULL;
   1306   do
   1307     {
   1308       if (loop->child)
   1309 	loop->child = oacc_loop_sibling_nreverse (loop->child);
   1310 
   1311       oacc_loop *next = loop->sibling;
   1312       loop->sibling = last;
   1313       last = loop;
   1314       loop = next;
   1315     }
   1316   while (loop);
   1317 
   1318   return last;
   1319 }
   1320 
   1321 /* Discover the OpenACC loops marked up by HEAD and TAIL markers for
   1322    the current function.  */
   1323 
   1324 static oacc_loop *
   1325 oacc_loop_discovery ()
   1326 {
   1327   /* Clear basic block flags, in particular BB_VISITED which we're going to use
   1328      in the following.  */
   1329   clear_bb_flags ();
   1330 
   1331   oacc_loop *top = new_oacc_loop_outer (current_function_decl);
   1332   oacc_loop_discover_walk (top, ENTRY_BLOCK_PTR_FOR_FN (cfun));
   1333 
   1334   /* The siblings were constructed in reverse order, reverse them so
   1335      that diagnostics come out in an unsurprising order.  */
   1336   top = oacc_loop_sibling_nreverse (top);
   1337 
   1338   return top;
   1339 }
   1340 
/* Transform the abstract internal function markers starting at FROM
   to be for partitioning level LEVEL.  Stop when we meet another HEAD
   or TAIL  marker.  */

static void
oacc_loop_xform_head_tail (gcall *from, int level)
{
  /* Kind of the starting marker; the next marker of this kind ends
     the walk.  */
  enum ifn_unique_kind kind
    = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
  tree replacement = build_int_cst (unsigned_type_node, level);

  for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
    {
      gimple *stmt = gsi_stmt (gsi);

      if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	{
	  enum ifn_unique_kind k
	    = ((enum ifn_unique_kind)
	       TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));

	  /* Fork/join/private markers carry the level in argument
	     2.  */
	  if (k == IFN_UNIQUE_OACC_FORK
	      || k == IFN_UNIQUE_OACC_JOIN
	      || k == IFN_UNIQUE_OACC_PRIVATE)
	    *gimple_call_arg_ptr (stmt, 2) = replacement;
	  else if (k == kind && stmt != from)
	    break;
	}
      else if (gimple_call_internal_p (stmt, IFN_GOACC_REDUCTION))
	/* Reductions carry the level in argument 3.  */
	*gimple_call_arg_ptr (stmt, 3) = replacement;
      update_stmt (stmt);

      /* Advance; when the block is exhausted step into its single
	 successor.  */
      gsi_next (&gsi);
      while (gsi_end_p (gsi))
	gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
    }
}
   1378 
/* Process the discovered OpenACC loops, setting the correct
   partitioning level etc.  FN_LEVEL is the partitioning level of the
   containing routine (GOMP_DIM_GANG for a 'routine gang').  */

static void
oacc_loop_process (oacc_loop *loop, int fn_level)
{
  if (loop->child)
    oacc_loop_process (loop->child, fn_level);

  if (loop->mask && !loop->routine)
    {
      int ix;
      tree mask_arg = build_int_cst (unsigned_type_node, loop->mask);
      tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask);
      tree chunk_arg = loop->chunk_size;
      gcall *call;

      /* Rewrite the recorded IFN_GOACC_LOOP/IFN_GOACC_TILE calls with
	 the now-determined partitioning masks.  */
      for (ix = 0; loop->ifns.iterate (ix, &call); ix++)
	{
	  switch (gimple_call_internal_fn (call))
	    {
	    case IFN_GOACC_LOOP:
	      {
		/* A -1 in argument 5 flags an element-loop instance,
		   which receives the element mask and no chunk
		   size.  */
		bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
		gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg);
		if (!is_e)
		  gimple_call_set_arg (call, 4, chunk_arg);
	      }
	      break;

	    case IFN_GOACC_TILE:
	      gimple_call_set_arg (call, 3, mask_arg);
	      gimple_call_set_arg (call, 4, e_mask_arg);
	      break;

	    default:
	      gcc_unreachable ();
	    }
	  update_stmt (call);
	}

      /* Assign each head/tail pair the dimension of the next set bit
	 in the combined mask, working from gang inwards.  */
      unsigned dim = GOMP_DIM_GANG;
      unsigned mask = loop->mask | loop->e_mask;
      for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
	{
	  while (!(GOMP_DIM_MASK (dim) & mask))
	    dim++;

	  oacc_loop_xform_head_tail (loop->heads[ix], dim);
	  oacc_loop_xform_head_tail (loop->tails[ix], dim);

	  mask ^= GOMP_DIM_MASK (dim);
	}
    }

  if (loop->sibling)
    oacc_loop_process (loop->sibling, fn_level);


  /* OpenACC 2.6, 2.9.11. "reduction clause" places a restriction such that
     "The 'reduction' clause may not be specified on an orphaned 'loop'
     construct with the 'gang' clause, or on an orphaned 'loop' construct that
     will generate gang parallelism in a procedure that is compiled with the
     'routine gang' clause."  */
  if (fn_level == GOMP_DIM_GANG
      && (loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
      && (loop->flags & OLF_REDUCTION))
    error_at (loop->loc,
	      "gang reduction on an orphan loop");
}
   1449 
/* Walk the OpenACC loop hierarchy checking and assigning the
   programmer-specified partitionings.  OUTER_MASK is the partitioning
   this loop is contained within.  Return mask of partitioning
   encountered.  If any auto loops are discovered, set GOMP_DIM_MAX
   bit.  */

static unsigned
oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
{
  unsigned this_mask = loop->mask;
  unsigned mask_all = 0;
  bool noisy = true;

#ifdef ACCEL_COMPILER
  /* When device_type is supported, we want the device compiler to be
     noisy, if the loop parameters are device_type-specific.  */
  noisy = false;
#endif

  if (!loop->routine)
    {
      bool auto_par = (loop->flags & OLF_AUTO) != 0;
      bool seq_par = (loop->flags & OLF_SEQ) != 0;
      bool tiling = (loop->flags & OLF_TILE) != 0;

      /* The explicitly requested axes are encoded in the flags.  */
      this_mask = ((loop->flags >> OLF_DIM_BASE)
		   & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));

      /* Apply auto partitioning if this is a non-partitioned regular
	 loop, or (no more than) single axis tiled loop.  */
      bool maybe_auto
	= !seq_par && this_mask == (tiling ? this_mask & -this_mask : 0);

      /* Diagnose conflicting combinations of explicit axes, 'auto'
	 and 'seq'.  */
      if ((this_mask != 0) + auto_par + seq_par > 1)
	{
	  if (noisy)
	    error_at (loop->loc,
		      seq_par
		      ? G_("%<seq%> overrides other OpenACC loop specifiers")
		      : G_("%<auto%> conflicts with other OpenACC loop "
			   "specifiers"));
	  maybe_auto = false;
	  loop->flags &= ~OLF_AUTO;
	  if (seq_par)
	    {
	      /* 'seq' wins: strip all explicit axes.  */
	      loop->flags
		&= ~((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE);
	      this_mask = 0;
	    }
	}

      /* An unpartitioned 'independent' loop becomes an auto
	 candidate; flag that via the GOMP_DIM_MAX bit.  */
      if (maybe_auto && (loop->flags & OLF_INDEPENDENT))
	{
	  loop->flags |= OLF_AUTO;
	  mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
	}
    }

  if (this_mask & outer_mask)
    {
      /* This loop requests an axis already in use outside it.  Find
	 the outer loop (if any) holding that axis, for the
	 diagnostic.  */
      const oacc_loop *outer;
      for (outer = loop->parent; outer; outer = outer->parent)
	if ((outer->mask | outer->e_mask) & this_mask)
	  break;

      if (noisy)
	{
	  if (outer)
	    {
	      error_at (loop->loc,
			loop->routine
			? G_("routine call uses same OpenACC parallelism"
			     " as containing loop")
			: G_("inner loop uses same OpenACC parallelism"
			     " as containing loop"));
	      inform (outer->loc, "containing loop here");
	    }
	  else
	    error_at (loop->loc,
		      loop->routine
		      ? G_("routine call uses OpenACC parallelism disallowed"
			   " by containing routine")
		      : G_("loop uses OpenACC parallelism disallowed"
			   " by containing routine"));

	  if (loop->routine)
	    inform (DECL_SOURCE_LOCATION (loop->routine),
		    "routine %qD declared here", loop->routine);
	}
      /* Drop the conflicting axes.  */
      this_mask &= ~outer_mask;
    }
  else
    {
      unsigned outermost = least_bit_hwi (this_mask);

      /* Axes must nest coarse-to-fine; an axis no finer than one
	 already used outside is incorrectly nested.  */
      if (outermost && outermost <= outer_mask)
	{
	  if (noisy)
	    {
	      error_at (loop->loc,
			"incorrectly nested OpenACC loop parallelism");

	      /* NOTE(review): this walk compares OUTER->flags against
		 an axis mask; presumably it means to locate the loop
		 responsible for the conflict -- confirm the intended
		 field before touching.  */
	      const oacc_loop *outer;
	      for (outer = loop->parent;
		   outer->flags && outer->flags < outermost;
		   outer = outer->parent)
		continue;
	      inform (outer->loc, "containing loop here");
	    }

	  this_mask &= ~outermost;
	}
    }

  mask_all |= this_mask;

  if (loop->flags & OLF_TILE)
    {
      /* When tiling, vector goes to the element loop, and failing
	 that we put worker there.  The std doesn't contemplate
	 specifying all three.  We choose to put worker and vector on
	 the element loops in that case.  */
      unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR);
      if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
	this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);

      loop->e_mask = this_e_mask;
      this_mask ^= this_e_mask;
    }

  loop->mask = this_mask;

  if (dump_file)
    fprintf (dump_file, "Loop %s:%d user specified %d & %d\n",
	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
	     loop->mask, loop->e_mask);

  /* Children nest inside our axes; siblings share our outer mask.  */
  if (loop->child)
    {
      unsigned tmp_mask = outer_mask | this_mask | loop->e_mask;
      loop->inner = oacc_loop_fixed_partitions (loop->child, tmp_mask);
      mask_all |= loop->inner;
    }

  if (loop->sibling)
    mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask);

  return mask_all;
}
   1599 
/* Walk the OpenACC loop hierarchy to assign auto-partitioned loops.
   OUTER_MASK is the partitioning this loop is contained within.
   OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
   Return the cumulative partitioning used by this loop, siblings and
   children.  */

static unsigned
oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
			   bool outer_assign)
{
  /* Only loops flagged both 'auto' and 'independent' get an
     assignment here.  */
  bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
  bool noisy = true;
  bool tiling = loop->flags & OLF_TILE;

#ifdef ACCEL_COMPILER
  /* When device_type is supported, we want the device compiler to be
     noisy, if the loop parameters are device_type-specific.  */
  noisy = false;
#endif

  if (assign && (!outer_assign || loop->inner))
    {
      /* Allocate outermost and non-innermost loops at the outermost
	 non-innermost available level.  */
      unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);

      /* Find the first outermost available partition. */
      while (this_mask <= outer_mask)
	this_mask <<= 1;

      /* Grab two axes if tiling, and we've not assigned anything  */
      if (tiling && !(loop->mask | loop->e_mask))
	this_mask |= this_mask << 1;

      /* Prohibit the innermost partitioning at the moment.  */
      this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1;

      /* Don't use any dimension explicitly claimed by an inner loop. */
      this_mask &= ~loop->inner;

      if (tiling && !loop->e_mask)
	{
	  /* If we got two axes, allocate the inner one to the element
	     loop.  */
	  loop->e_mask = this_mask & (this_mask << 1);
	  this_mask ^= loop->e_mask;
	}

      loop->mask |= this_mask;
    }

  /* Process children before the innermost-assignment pass below, so
     LOOP->INNER reflects their choices.  */
  if (loop->child)
    {
      unsigned tmp_mask = outer_mask | loop->mask | loop->e_mask;
      loop->inner = oacc_loop_auto_partitions (loop->child, tmp_mask,
					       outer_assign | assign);
    }

  if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
    {
      /* Allocate the loop at the innermost available level.  Note
	 that we do this even if we already assigned this loop the
	 outermost available level above.  That way we'll partition
	 this along 2 axes, if they are available.  */
      unsigned this_mask = 0;

      /* Determine the outermost partitioning used within this loop.  */
      this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
      this_mask = least_bit_hwi (this_mask);

      /* Pick the partitioning just inside that one.  */
      this_mask >>= 1;

      /* And avoid picking one use by an outer loop.  */
      this_mask &= ~outer_mask;

      /* If tiling and we failed completely above, grab the next one
	 too.  Making sure it doesn't hit an outer loop.  */
      if (tiling)
	{
	  this_mask &= ~(loop->e_mask | loop->mask);
	  unsigned tile_mask = ((this_mask >> 1)
				& ~(outer_mask | loop->e_mask | loop->mask));

	  if (tile_mask || loop->mask)
	    {
	      loop->e_mask |= this_mask;
	      this_mask = tile_mask;
	    }
	  if (!loop->e_mask && noisy)
	    warning_at (loop->loc, 0,
			"insufficient partitioning available"
			" to parallelize element loop");
	}

      loop->mask |= this_mask;
      if (!loop->mask && noisy)
	warning_at (loop->loc, 0,
		    tiling
		    ? G_("insufficient partitioning available"
			 " to parallelize tile loop")
		    : G_("insufficient partitioning available"
			 " to parallelize loop"));
    }

  if (assign && dump_file)
    fprintf (dump_file, "Auto loop %s:%d assigned %d & %d\n",
	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
	     loop->mask, loop->e_mask);

  unsigned inner_mask = 0;

  if (loop->sibling)
    inner_mask |= oacc_loop_auto_partitions (loop->sibling,
					     outer_mask, outer_assign);

  /* Report everything used at or below this loop.  */
  inner_mask |= loop->inner | loop->mask | loop->e_mask;

  return inner_mask;
}
   1720 
   1721 /* Walk the OpenACC loop heirarchy to check and assign partitioning
   1722    axes.  Return mask of partitioning.  */
   1723 
   1724 static unsigned
   1725 oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
   1726 {
   1727   unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);
   1728 
   1729   if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
   1730     {
   1731       mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
   1732       mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false);
   1733     }
   1734   return mask_all;
   1735 }
   1736 
   1737 /* Default fork/join early expander.  Delete the function calls if
   1738    there is no RTL expander.  */
   1739 
   1740 bool
   1741 default_goacc_fork_join (gcall *ARG_UNUSED (call),
   1742 			 const int *ARG_UNUSED (dims), bool is_fork)
   1743 {
   1744   if (is_fork)
   1745     return targetm.have_oacc_fork ();
   1746   else
   1747     return targetm.have_oacc_join ();
   1748 }
   1749 
/* Default goacc.reduction early expander.

   LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET)
   If RES_PTR is not integer-zerop:
       SETUP - emit 'LHS = *RES_PTR', LHS = NULL
       TEARDOWN - emit '*RES_PTR = VAR'
   If LHS is not NULL
       emit 'LHS = VAR'   */

void
default_goacc_reduction (gcall *call)
{
  /* Reduction kind code; compared against IFN_GOACC_REDUCTION_* below.  */
  unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  /* Replacement statements are accumulated here, then swapped in for
     the IFN call at the end.  */
  gimple_seq seq = NULL;

  if (code == IFN_GOACC_REDUCTION_SETUP
      || code == IFN_GOACC_REDUCTION_TEARDOWN)
    {
      /* Setup and Teardown need to copy from/to the receiver object,
	 if there is one.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
	{
	  tree dst = build_simple_mem_ref (ref_to_res);
	  tree src = var;

	  if (code == IFN_GOACC_REDUCTION_SETUP)
	    {
	      /* SETUP copies the other way: LHS = *RES_PTR.  Clearing
		 LHS also suppresses the trailing 'LHS = VAR' copy.  */
	      src = dst;
	      dst = lhs;
	      lhs = NULL;
	    }
	  gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src));
	}
    }

  /* Copy VAR to LHS, if there is an LHS.  */
  if (lhs)
    gimple_seq_add_stmt (&seq, gimple_build_assign (lhs, var));

  /* Replace the IFN call with the (possibly empty) sequence.  */
  gsi_replace_with_seq (&gsi, seq, true);
}
   1796 
/* State bundle passed (via walk_stmt_info::info) to the
   oacc_rewrite_var_decl callback below.  */

struct var_decl_rewrite_info
{
  /* The statement being walked; used as the insertion point for any
     newly created pointer-conversion statements.  */
  gimple *stmt;
  /* Map from original decls to their target-adjusted replacements.  */
  hash_map<tree, tree> *adjusted_vars;
  /* True for statements (sync builtins, asms) whose ADDR_EXPRs must be
     rebuilt directly rather than converted via new statements.  */
  bool avoid_pointer_conversion;
  /* Set when STMT was changed and therefore needs update_stmt.  */
  bool modified;
};
   1804 
/* Helper function for execute_oacc_device_lower.  Rewrite VAR_DECLs (by
   themselves or wrapped in various other nodes) according to ADJUSTED_VARS in
   the var_decl_rewrite_info pointed to via DATA.  Used as part of coercing
   gang-private variables in OpenACC offload regions to reside in GPU shared
   memory.  Always returns NULL_TREE, so the walk visits every node.  */

static tree
oacc_rewrite_var_decl (tree *tp, int *walk_subtrees, void *data)
{
  walk_stmt_info *wi = (walk_stmt_info *) data;
  var_decl_rewrite_info *info = (var_decl_rewrite_info *) wi->info;

  if (TREE_CODE (*tp) == ADDR_EXPR)
    {
      tree arg = TREE_OPERAND (*tp, 0);
      tree *new_arg = info->adjusted_vars->get (arg);

      if (new_arg)
	{
	  if (info->avoid_pointer_conversion)
	    {
	      /* Method (1), see execute_oacc_device_lower commentary:
		 simply retake the address of the replacement decl.  */
	      *tp = build_fold_addr_expr (*new_arg);
	      info->modified = true;
	      *walk_subtrees = 0;
	    }
	  else
	    {
	      /* Method (2): take the new decl's address, then convert
		 that pointer back to the original pointer type via two
		 new statements inserted before the current one.  */
	      gimple_stmt_iterator gsi = gsi_for_stmt (info->stmt);
	      tree repl = build_fold_addr_expr (*new_arg);
	      gimple *stmt1
		= gimple_build_assign (make_ssa_name (TREE_TYPE (repl)), repl);
	      tree conv = convert_to_pointer (TREE_TYPE (*tp),
					      gimple_assign_lhs (stmt1));
	      gimple *stmt2
		= gimple_build_assign (make_ssa_name (TREE_TYPE (*tp)), conv);
	      gsi_insert_before (&gsi, stmt1, GSI_SAME_STMT);
	      gsi_insert_before (&gsi, stmt2, GSI_SAME_STMT);
	      *tp = gimple_assign_lhs (stmt2);
	      info->modified = true;
	      *walk_subtrees = 0;
	    }
	}
    }
  else if (TREE_CODE (*tp) == COMPONENT_REF || TREE_CODE (*tp) == ARRAY_REF)
    {
      /* Walk down nested component/array refs to the innermost base.  */
      tree *base = &TREE_OPERAND (*tp, 0);

      while (TREE_CODE (*base) == COMPONENT_REF
	     || TREE_CODE (*base) == ARRAY_REF)
	base = &TREE_OPERAND (*base, 0);

      if (TREE_CODE (*base) != VAR_DECL)
	return NULL;

      tree *new_decl = info->adjusted_vars->get (*base);
      if (!new_decl)
	return NULL;

      /* Propagate the replacement decl's type qualifiers (e.g. an
	 address-space qualifier) into the reference chain.  */
      int base_quals = TYPE_QUALS (TREE_TYPE (*new_decl));
      tree field = TREE_OPERAND (*tp, 1);

      /* Adjust the type of the field.  */
      int field_quals = TYPE_QUALS (TREE_TYPE (field));
      if (TREE_CODE (field) == FIELD_DECL && field_quals != base_quals)
	{
	  tree *field_type = &TREE_TYPE (field);
	  while (TREE_CODE (*field_type) == ARRAY_TYPE)
	    field_type = &TREE_TYPE (*field_type);
	  field_quals |= base_quals;
	  *field_type = build_qualified_type (*field_type, field_quals);
	}

      /* Adjust the type of the component ref itself.  */
      tree comp_type = TREE_TYPE (*tp);
      int comp_quals = TYPE_QUALS (comp_type);
      if (TREE_CODE (*tp) == COMPONENT_REF && comp_quals != base_quals)
	{
	  comp_quals |= base_quals;
	  TREE_TYPE (*tp)
	    = build_qualified_type (comp_type, comp_quals);
	}

      /* Finally, substitute the replacement decl as the base.  */
      *base = *new_decl;
      info->modified = true;
    }
  else if (TREE_CODE (*tp) == VAR_DECL)
    {
      /* A plain decl use: replace it directly.  */
      tree *new_decl = info->adjusted_vars->get (*tp);
      if (new_decl)
	{
	  *tp = *new_decl;
	  info->modified = true;
	}
    }

  return NULL_TREE;
}
   1902 
/* Return TRUE if CALL is a call to a builtin atomic/sync operation.  */

static bool
is_sync_builtin_call (gcall *call)
{
  tree callee = gimple_call_fndecl (call);

  if (callee != NULL_TREE
      && gimple_call_builtin_p (call, BUILT_IN_NORMAL))
    /* Match any function code listed in sync-builtins.def; the macro
       expansion generates one 'case' label per sync builtin.  */
    switch (DECL_FUNCTION_CODE (callee))
      {
#undef DEF_SYNC_BUILTIN
#define DEF_SYNC_BUILTIN(ENUM, NAME, TYPE, ATTRS) case ENUM:
#include "sync-builtins.def"
#undef DEF_SYNC_BUILTIN
	return true;

      default:
	;
      }

  return false;
}
   1926 
/* Main entry point for oacc transformations which run on the device
   compiler after LTO, so we know what the target device is at this
   point (including the host fallback).  Classifies the current offload
   function, handles 'routine' discarding, and partitions its loops.  */

static unsigned int
execute_oacc_loop_designation ()
{
  tree attrs = oacc_get_fn_attrib (current_function_decl);

  if (!attrs)
    /* Not an offloaded function.  */
    return 0;

  /* Parse the default dim argument exactly once.  */
  if ((const void *)flag_openacc_dims != &flag_openacc_dims)
    {
      oacc_parse_default_dims (flag_openacc_dims);
      /* Overwrite the option string with a self-pointer, as a sentinel
	 marking it already parsed.  */
      flag_openacc_dims = (char *)&flag_openacc_dims;
    }

  /* Determine which kind of OpenACC region this offload function was
     generated from; exactly one must hold (asserted below).  */
  bool is_oacc_parallel
    = (lookup_attribute ("oacc parallel",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
  bool is_oacc_kernels
    = (lookup_attribute ("oacc kernels",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
  bool is_oacc_serial
    = (lookup_attribute ("oacc serial",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
  bool is_oacc_parallel_kernels_parallelized
    = (lookup_attribute ("oacc parallel_kernels_parallelized",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
  bool is_oacc_parallel_kernels_gang_single
    = (lookup_attribute ("oacc parallel_kernels_gang_single",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
  int fn_level = oacc_fn_attrib_level (attrs);
  bool is_oacc_routine = (fn_level >= 0);
  gcc_checking_assert (is_oacc_parallel
		       + is_oacc_kernels
		       + is_oacc_serial
		       + is_oacc_parallel_kernels_parallelized
		       + is_oacc_parallel_kernels_gang_single
		       + is_oacc_routine
		       == 1);

  bool is_oacc_kernels_parallelized
    = (lookup_attribute ("oacc kernels parallelized",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
  if (is_oacc_kernels_parallelized)
    gcc_checking_assert (is_oacc_kernels);

  if (dump_file)
    {
      if (is_oacc_parallel)
	fprintf (dump_file, "Function is OpenACC parallel offload\n");
      else if (is_oacc_kernels)
	fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
		 (is_oacc_kernels_parallelized
		  ? "parallelized" : "unparallelized"));
      else if (is_oacc_serial)
	fprintf (dump_file, "Function is OpenACC serial offload\n");
      else if (is_oacc_parallel_kernels_parallelized)
	fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
		 "parallel_kernels_parallelized");
      else if (is_oacc_parallel_kernels_gang_single)
	fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
		 "parallel_kernels_gang_single");
      else if (is_oacc_routine)
	fprintf (dump_file, "Function is OpenACC routine level %d\n",
		 fn_level);
      else
	gcc_unreachable ();
    }

  /* This doesn't belong into 'pass_oacc_loop_designation' conceptually, but
     it's a convenient place, so...  */
  if (is_oacc_routine)
    {
      tree attr = lookup_attribute ("omp declare target",
				    DECL_ATTRIBUTES (current_function_decl));
      gcc_checking_assert (attr);
      tree clauses = TREE_VALUE (attr);
      gcc_checking_assert (clauses);

      /* Should this OpenACC routine be discarded?  */
      bool discard = false;

      tree clause_nohost = omp_find_clause (clauses, OMP_CLAUSE_NOHOST);
      if (dump_file)
	fprintf (dump_file,
		 "OpenACC routine '%s' %s '%s' clause.\n",
		 lang_hooks.decl_printable_name (current_function_decl, 2),
		 clause_nohost ? "has" : "doesn't have",
		 omp_clause_code_name[OMP_CLAUSE_NOHOST]);
      /* Host compiler, 'nohost' clause?  */
#ifndef ACCEL_COMPILER
      if (clause_nohost)
	discard = true;
#endif

      if (dump_file)
	fprintf (dump_file,
		 "OpenACC routine '%s' %sdiscarded.\n",
		 lang_hooks.decl_printable_name (current_function_decl, 2),
		 discard ? "" : "not ");
      if (discard)
	{
	  /* Pretend it's already been emitted, so it isn't output.  */
	  TREE_ASM_WRITTEN (current_function_decl) = 1;
	  return TODO_discard_function;
	}
    }

  /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
     kernels, so remove the parallelism dimensions function attributes
     potentially set earlier on.  */
  if (is_oacc_kernels && !is_oacc_kernels_parallelized)
    {
      oacc_set_fn_attrib (current_function_decl, NULL, NULL);
      attrs = oacc_get_fn_attrib (current_function_decl);
    }

  /* Discover, partition and process the loops.  */
  oacc_loop *loops = oacc_loop_discovery ();

  unsigned outer_mask = 0;
  if (is_oacc_routine)
    /* A level-N routine may only use partitioning inside level N, so
       all outer axes count as already taken.  */
    outer_mask = GOMP_DIM_MASK (fn_level) - 1;
  unsigned used_mask = oacc_loop_partition (loops, outer_mask);
  /* OpenACC kernels constructs are special: they currently don't use the
     generic oacc_loop infrastructure and attribute/dimension processing.  */
  if (is_oacc_kernels && is_oacc_kernels_parallelized)
    {
      /* Parallelized OpenACC kernels constructs use gang parallelism.  See
	 also tree-parloops.cc:create_parallel_loop.  */
      used_mask |= GOMP_DIM_MASK (GOMP_DIM_GANG);
    }

  int dims[GOMP_DIM_MAX];
  oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask);

  if (dump_file)
    {
      const char *comma = "Compute dimensions [";
      for (int ix = 0; ix != GOMP_DIM_MAX; ix++, comma = ", ")
	fprintf (dump_file, "%s%d", comma, dims[ix]);
      fprintf (dump_file, "]\n");
    }

  /* Verify that for OpenACC 'kernels' decomposed "gang-single" parts we launch
     a single gang only.  */
  if (is_oacc_parallel_kernels_gang_single)
    gcc_checking_assert (dims[GOMP_DIM_GANG] == 1);

  oacc_loop_process (loops, fn_level);
  if (dump_file)
    {
      fprintf (dump_file, "OpenACC loops\n");
      dump_oacc_loop (dump_file, loops, 0);
      fprintf (dump_file, "\n");
    }
  if (dump_enabled_p ())
    {
      oacc_loop *l = loops;
      /* OpenACC kernels constructs are special: they currently don't use the
	 generic oacc_loop infrastructure.  */
      if (is_oacc_kernels)
	{
	  /* Create a fake oacc_loop for diagnostic purposes.  */
	  l = new_oacc_loop_raw (NULL,
				 DECL_SOURCE_LOCATION (current_function_decl));
	  l->mask = used_mask;
	}
      else
	{
	  /* Skip the outermost, dummy OpenACC loop  */
	  l = l->child;
	}
      if (l)
	inform_oacc_loop (l);
      if (is_oacc_kernels)
	free_oacc_loop (l);
    }

  free_oacc_loop (loops);

  return 0;
}
   2114 
/* Lower the remaining OpenACC internal functions (IFN_GOACC_TILE,
   IFN_GOACC_LOOP, IFN_GOACC_REDUCTION, IFN_UNIQUE) to target code, and
   rewrite any gang-private variables the target chose to relocate.  */

static unsigned int
execute_oacc_device_lower ()
{
  tree attrs = oacc_get_fn_attrib (current_function_decl);

  if (!attrs)
    /* Not an offloaded function.  */
    return 0;

  /* The launch dimensions assigned by the earlier 'oaccloops' pass.  */
  int dims[GOMP_DIM_MAX];
  for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
    dims[i] = oacc_get_fn_dim_size (current_function_decl, i);

  /* Maps original gang-private decls to their target-adjusted
     replacements; filled by the IFN_UNIQUE_OACC_PRIVATE handling and
     consumed by the rewriting loop at the end.  */
  hash_map<tree, tree> adjusted_vars;

  /* Now lower internal loop functions to target-specific code
     sequences.  */
  basic_block bb;
  FOR_ALL_BB_FN (bb, cfun)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
      {
	gimple *stmt = gsi_stmt (gsi);
	if (!is_gimple_call (stmt))
	  {
	    gsi_next (&gsi);
	    continue;
	  }

	gcall *call = as_a <gcall *> (stmt);
	if (!gimple_call_internal_p (call))
	  {
	    gsi_next (&gsi);
	    continue;
	  }

	/* Rewind to allow rescan.  */
	gsi_prev (&gsi);
	bool rescan = false, remove = false;
	enum  internal_fn ifn_code = gimple_call_internal_fn (call);

	switch (ifn_code)
	  {
	  default: break;

	  case IFN_GOACC_TILE:
	    oacc_xform_tile (call);
	    rescan = true;
	    break;

	  case IFN_GOACC_LOOP:
	    oacc_xform_loop (call);
	    rescan = true;
	    break;

	  case IFN_GOACC_REDUCTION:
	    /* Mark the function for SSA renaming.  */
	    mark_virtual_operands_for_renaming (cfun);

	    /* If the level is -1, this ended up being an unused
	       axis.  Handle as a default.  */
	    if (integer_minus_onep (gimple_call_arg (call, 3)))
	      default_goacc_reduction (call);
	    else
	      targetm.goacc.reduction (call);
	    rescan = true;
	    break;

	  case IFN_UNIQUE:
	    {
	      enum ifn_unique_kind kind
		= ((enum ifn_unique_kind)
		   TREE_INT_CST_LOW (gimple_call_arg (call, 0)));

	      switch (kind)
		{
		default:
		  break;

		case IFN_UNIQUE_OACC_FORK:
		case IFN_UNIQUE_OACC_JOIN:
		  /* Delete if the axis is unused (-1) or the target
		     declines to expand the fork/join.  */
		  if (integer_minus_onep (gimple_call_arg (call, 2)))
		    remove = true;
		  else if (!targetm.goacc.fork_join
			   (call, dims, kind == IFN_UNIQUE_OACC_FORK))
		    remove = true;
		  break;

		case IFN_UNIQUE_OACC_HEAD_MARK:
		case IFN_UNIQUE_OACC_TAIL_MARK:
		  remove = true;
		  break;

		case IFN_UNIQUE_OACC_PRIVATE:
		  {
		    dump_flags_t l_dump_flags
		      = get_openacc_privatization_dump_flags ();

		    location_t loc = gimple_location (stmt);
		    if (LOCATION_LOCUS (loc) == UNKNOWN_LOCATION)
		      loc = DECL_SOURCE_LOCATION (current_function_decl);
		    const dump_user_location_t d_u_loc
		      = dump_user_location_t::from_location_t (loc);

		    HOST_WIDE_INT level
		      = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
		    gcc_checking_assert (level == -1
					 || (level >= 0
					     && level < GOMP_DIM_MAX));
		    /* Arguments 3 onwards are the addresses of the
		       privatized variables.  */
		    for (unsigned i = 3;
			 i < gimple_call_num_args (call);
			 i++)
		      {
			static char const *const axes[] =
			/* Must be kept in sync with GOMP_DIM enumeration.  */
			  { "gang", "worker", "vector" };

			tree arg = gimple_call_arg (call, i);
			gcc_checking_assert (TREE_CODE (arg) == ADDR_EXPR);
			tree decl = TREE_OPERAND (arg, 0);
			if (dump_enabled_p ())
/* PR100695 "Format decoder, quoting in 'dump_printf' etc." */
#if __GNUC__ >= 10
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wformat"
#endif
			  dump_printf_loc (l_dump_flags, d_u_loc,
					   "variable %<%T%> ought to be"
					   " adjusted for OpenACC"
					   " privatization level: %qs\n",
					   decl,
					   (level == -1
					    ? "UNKNOWN" : axes[level]));
#if __GNUC__ >= 10
# pragma GCC diagnostic pop
#endif
			bool adjusted;
			if (level == -1)
			  adjusted = false;
			else if (!targetm.goacc.adjust_private_decl)
			  adjusted = false;
			else if (level == GOMP_DIM_VECTOR)
			  {
			    /* That's the default behavior.  */
			    adjusted = true;
			  }
			else
			  {
			    tree oldtype = TREE_TYPE (decl);
			    tree newdecl
			      = targetm.goacc.adjust_private_decl (loc, decl,
								   level);
			    adjusted = (TREE_TYPE (newdecl) != oldtype
					|| newdecl != decl);
			    if (adjusted)
			      adjusted_vars.put (decl, newdecl);
			  }
			if (adjusted
			    && dump_enabled_p ())
/* PR100695 "Format decoder, quoting in 'dump_printf' etc." */
#if __GNUC__ >= 10
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wformat"
#endif
			  dump_printf_loc (l_dump_flags, d_u_loc,
					   "variable %<%T%> adjusted for"
					   " OpenACC privatization level:"
					   " %qs\n",
					   decl, axes[level]);
#if __GNUC__ >= 10
# pragma GCC diagnostic pop
#endif
		      }
		    remove = true;
		  }
		  break;
		}
	      break;
	    }
	  }

	if (gsi_end_p (gsi))
	  /* We rewound past the beginning of the BB.  */
	  gsi = gsi_start_bb (bb);
	else
	  /* Undo the rewind.  */
	  gsi_next (&gsi);

	if (remove)
	  {
	    if (gimple_vdef (call))
	      replace_uses_by (gimple_vdef (call), gimple_vuse (call));
	    if (gimple_call_lhs (call))
	      {
		/* Propagate the data dependency var.  */
		gimple *ass = gimple_build_assign (gimple_call_lhs (call),
						   gimple_call_arg (call, 1));
		gsi_replace (&gsi, ass,  false);
	      }
	    else
	      gsi_remove (&gsi, true);
	  }
	else if (!rescan)
	  /* If not rescanning, advance over the call.  */
	  gsi_next (&gsi);
      }

  /* Regarding the OpenACC privatization level, we're currently only looking at
     making the gang-private level work.  Regarding that, we have the following
     configurations:

       - GCN offloading: 'targetm.goacc.adjust_private_decl' does the work (in
	 particular, change 'TREE_TYPE', etc.) and there is no
	 'targetm.goacc.expand_var_decl'.

       - nvptx offloading: 'targetm.goacc.adjust_private_decl' only sets a
	 marker and then 'targetm.goacc.expand_var_decl' does the work.

     Eventually (in particular, for worker-private level?), both
     'targetm.goacc.adjust_private_decl' and 'targetm.goacc.expand_var_decl'
     may need to do things, but that's currently not meant to be addressed, and
     thus not fully worked out and implemented, and thus untested.  Hence,
     'assert' what currently is implemented/tested, only.  */

  if (targetm.goacc.expand_var_decl)
    gcc_assert (adjusted_vars.is_empty ());

  /* Make adjustments to gang-private local variables if required by the
     target, e.g. forcing them into a particular address space.  Afterwards,
     ADDR_EXPR nodes which have adjusted variables as their argument need to
     be modified in one of two ways:

       1. They can be recreated, making a pointer to the variable in the new
	  address space, or

       2. The address of the variable in the new address space can be taken,
	  converted to the default (original) address space, and the result of
	  that conversion substituted in place of the original ADDR_EXPR node.

     Which of these is done depends on the gimple statement being processed.
     At present atomic operations and inline asms use (1), and everything else
     uses (2).  At least on AMD GCN, there are atomic operations that work
     directly in the LDS address space.

     COMPONENT_REFS, ARRAY_REFS and plain VAR_DECLs are also rewritten to use
     the new decl, adjusting types of appropriate tree nodes as necessary.  */

  if (targetm.goacc.adjust_private_decl
      && !adjusted_vars.is_empty ())
    {
      FOR_ALL_BB_FN (bb, cfun)
	for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
	     !gsi_end_p (gsi);
	     gsi_next (&gsi))
	  {
	    gimple *stmt = gsi_stmt (gsi);
	    walk_stmt_info wi;
	    var_decl_rewrite_info info;

	    info.avoid_pointer_conversion
	      = (is_gimple_call (stmt)
		 && is_sync_builtin_call (as_a <gcall *> (stmt)))
		|| gimple_code (stmt) == GIMPLE_ASM;
	    info.stmt = stmt;
	    info.modified = false;
	    info.adjusted_vars = &adjusted_vars;

	    memset (&wi, 0, sizeof (wi));
	    wi.info = &info;

	    walk_gimple_op (stmt, oacc_rewrite_var_decl, &wi);

	    if (info.modified)
	      update_stmt (stmt);
	  }
    }

  return 0;
}
   2393 
   2394 /* Default launch dimension validator.  Force everything to 1.  A
   2395    backend that wants to provide larger dimensions must override this
   2396    hook.  */
   2397 
   2398 bool
   2399 default_goacc_validate_dims (tree ARG_UNUSED (decl), int *dims,
   2400 			     int ARG_UNUSED (fn_level),
   2401 			     unsigned ARG_UNUSED (used))
   2402 {
   2403   bool changed = false;
   2404 
   2405   for (unsigned ix = 0; ix != GOMP_DIM_MAX; ix++)
   2406     {
   2407       if (dims[ix] != 1)
   2408 	{
   2409 	  dims[ix] = 1;
   2410 	  changed = true;
   2411 	}
   2412     }
   2413 
   2414   return changed;
   2415 }
   2416 
/* Default dimension bound is unknown on accelerator and 1 on host.  */

int
default_goacc_dim_limit (int ARG_UNUSED (axis))
{
#ifdef ACCEL_COMPILER
  /* Zero means "no compile-time bound" on the device side.  */
  return 0;
#else
  return 1;
#endif
}
   2428 
namespace {

/* Pass descriptor for the OpenACC loop designation pass.  */
const pass_data pass_data_oacc_loop_designation =
{
  GIMPLE_PASS, /* type */
  "oaccloops", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  0 /* Possibly PROP_gimple_eomp.  */, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
};

class pass_oacc_loop_designation : public gimple_opt_pass
{
public:
  pass_oacc_loop_designation (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_oacc_loop_designation, ctxt)
  {}

  /* opt_pass methods: */
  /* Only run for OpenACC compilations.  */
  virtual bool gate (function *) { return flag_openacc; };

  virtual unsigned int execute (function *)
    {
      return execute_oacc_loop_designation ();
    }

}; // class pass_oacc_loop_designation

/* Pass descriptor for the OpenACC device lowering pass.  */
const pass_data pass_data_oacc_device_lower =
{
  GIMPLE_PASS, /* type */
  "oaccdevlow", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  0 /* Possibly PROP_gimple_eomp.  */, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
};

class pass_oacc_device_lower : public gimple_opt_pass
{
public:
  pass_oacc_device_lower (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_oacc_device_lower, ctxt)
  {}

  /* opt_pass methods: */
  /* Only run for OpenACC compilations.  */
  virtual bool gate (function *) { return flag_openacc; };

  virtual unsigned int execute (function *)
    {
      return execute_oacc_device_lower ();
    }

}; // class pass_oacc_device_lower

} // anon namespace
   2492 
/* Instantiate the OpenACC loop designation pass.  */

gimple_opt_pass *
make_pass_oacc_loop_designation (gcc::context *ctxt)
{
  return new pass_oacc_loop_designation (ctxt);
}
   2498 
/* Instantiate the OpenACC device lowering pass.  */

gimple_opt_pass *
make_pass_oacc_device_lower (gcc::context *ctxt)
{
  return new pass_oacc_device_lower (ctxt);
}
   2504 
   2505 
/* Rewrite GOMP_SIMT_ENTER_ALLOC call given by GSI and remove the preceding
   GOMP_SIMT_ENTER call identifying the privatized variables, which are
   turned to structure fields and receive a DECL_VALUE_EXPR accordingly.
   Set *REGIMPLIFY to true, except if no privatized variables were seen.  */

static void
ompdevlow_adjust_simt_enter (gimple_stmt_iterator *gsi, bool *regimplify)
{
  gimple *alloc_stmt = gsi_stmt (*gsi);
  tree simtrec = gimple_call_lhs (alloc_stmt);
  tree simduid = gimple_call_arg (alloc_stmt, 0);
  gimple *enter_stmt = SSA_NAME_DEF_STMT (simduid);
  gcc_assert (gimple_call_internal_p (enter_stmt, IFN_GOMP_SIMT_ENTER));
  /* Build an artificial record type that will hold all the privatized
     variables as fields.  */
  tree rectype = lang_hooks.types.make_type (RECORD_TYPE);
  TYPE_ARTIFICIAL (rectype) = TYPE_NAMELESS (rectype) = 1;
  TREE_ADDRESSABLE (rectype) = 1;
  TREE_TYPE (simtrec) = build_pointer_type (rectype);
  for (unsigned i = 1; i < gimple_call_num_args (enter_stmt); i++)
    {
      tree *argp = gimple_call_arg_ptr (enter_stmt, i);
      if (*argp == null_pointer_node)
	continue;
      gcc_assert (TREE_CODE (*argp) == ADDR_EXPR
		  && VAR_P (TREE_OPERAND (*argp, 0)));
      tree var = TREE_OPERAND (*argp, 0);

      /* Mirror the variable as a field, preserving its alignment and
	 volatility.  */
      tree field = build_decl (DECL_SOURCE_LOCATION (var), FIELD_DECL,
			       DECL_NAME (var), TREE_TYPE (var));
      SET_DECL_ALIGN (field, DECL_ALIGN (var));
      DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
      TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);

      insert_field_into_struct (rectype, field);

      /* Redirect all uses of VAR to 'simtrec->field' via DECL_VALUE_EXPR;
	 the caller regimplifies to make this take effect.  */
      tree t = build_simple_mem_ref (simtrec);
      t = build3 (COMPONENT_REF, TREE_TYPE (var), t, field, NULL);
      TREE_THIS_VOLATILE (t) = TREE_THIS_VOLATILE (var);
      SET_DECL_VALUE_EXPR (var, t);
      DECL_HAS_VALUE_EXPR_P (var) = 1;
      *regimplify = true;
    }
  layout_type (rectype);
  tree size = TYPE_SIZE_UNIT (rectype);
  tree align = build_int_cst (TREE_TYPE (size), TYPE_ALIGN_UNIT (rectype));

  /* Replace the placeholder alloc call with one carrying the now-known
     record size and alignment.  */
  alloc_stmt
    = gimple_build_call_internal (IFN_GOMP_SIMT_ENTER_ALLOC, 2, size, align);
  gimple_call_set_lhs (alloc_stmt, simtrec);
  gsi_replace (gsi, alloc_stmt, false);
  gimple_stmt_iterator enter_gsi = gsi_for_stmt (enter_stmt);
  /* The ENTER call itself degenerates to a plain copy of the simduid.  */
  enter_stmt = gimple_build_assign (simduid, gimple_call_arg (enter_stmt, 0));
  gsi_replace (&enter_gsi, enter_stmt, false);

  use_operand_p use;
  gimple *exit_stmt;
  if (single_imm_use (simtrec, &use, &exit_stmt))
    {
      /* Clobber the record before the matching EXIT call, marking the
	 end of its lifetime.  */
      gcc_assert (gimple_call_internal_p (exit_stmt, IFN_GOMP_SIMT_EXIT));
      gimple_stmt_iterator exit_gsi = gsi_for_stmt (exit_stmt);
      tree clobber = build_clobber (rectype);
      exit_stmt = gimple_build_assign (build_simple_mem_ref (simtrec), clobber);
      gsi_insert_before (&exit_gsi, exit_stmt, GSI_SAME_STMT);
    }
  else
    gcc_checking_assert (has_zero_uses (simtrec));
}
   2573 
   2574 /* Callback for walk_gimple_stmt used to scan for SIMT-privatized variables.  */
   2575 
   2576 static tree
   2577 find_simtpriv_var_op (tree *tp, int *walk_subtrees, void *)
   2578 {
   2579   tree t = *tp;
   2580 
   2581   if (VAR_P (t)
   2582       && DECL_HAS_VALUE_EXPR_P (t)
   2583       && lookup_attribute ("omp simt private", DECL_ATTRIBUTES (t)))
   2584     {
   2585       *walk_subtrees = 0;
   2586       return t;
   2587     }
   2588   return NULL_TREE;
   2589 }
   2590 
   2591 /* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
   2592    VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
   2593    LANE is kept to be expanded to RTL later on.  Also cleanup all other SIMT
   2594    internal functions on non-SIMT targets, and likewise some SIMD internal
   2595    functions on SIMT targets.  */
   2596 
static unsigned int
execute_omp_device_lower ()
{
  /* SIMT vectorization factor: the target's value where a SIMT hook is
     provided, otherwise 1 (non-SIMT target).  Most cases below fold the
     SIMT placeholder internal functions differently depending on whether
     VF is 1.  */
  int vf = targetm.simt.vf ? targetm.simt.vf () : 1;
  bool regimplify = false;
  basic_block bb;
  gimple_stmt_iterator gsi;
  bool calls_declare_variant_alt
    = cgraph_node::get (cfun->decl)->calls_declare_variant_alt;
  FOR_EACH_BB_FN (bb, cfun)
    for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
      {
	gimple *stmt = gsi_stmt (gsi);
	if (!is_gimple_call (stmt))
	  continue;
	if (!gimple_call_internal_p (stmt))
	  {
	    /* Not an internal function: for functions known to call
	       "declare variant" alternatives, re-resolve the callee now
	       and redirect the call if a different variant applies.  */
	    if (calls_declare_variant_alt)
	      if (tree fndecl = gimple_call_fndecl (stmt))
		{
		  tree new_fndecl = omp_resolve_declare_variant (fndecl);
		  if (new_fndecl != fndecl)
		    {
		      gimple_call_set_fndecl (stmt, new_fndecl);
		      update_stmt (stmt);
		    }
		}
	    continue;
	  }
	/* RHS stays NULL_TREE when the call should be kept as-is (to be
	   expanded later); otherwise the call is replaced by LHS = RHS
	   (or a no-op when it has no LHS).  */
	tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE;
	tree type = lhs ? TREE_TYPE (lhs) : integer_type_node;
	switch (gimple_call_internal_fn (stmt))
	  {
	  case IFN_GOMP_USE_SIMT:
	    /* 0 on non-SIMT targets, 1 on SIMT targets.  */
	    rhs = vf == 1 ? integer_zero_node : integer_one_node;
	    break;
	  case IFN_GOMP_SIMT_ENTER:
	    /* On non-SIMT targets the enter call degenerates to passing
	       through its first argument.  */
	    rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
	    goto simtreg_enter_exit;
	  case IFN_GOMP_SIMT_ENTER_ALLOC:
	    /* On SIMT targets, materialize the per-lane record first
	       (may set REGIMPLIFY for the privatized variables).  */
	    if (vf != 1)
	      ompdevlow_adjust_simt_enter (&gsi, &regimplify);
	    rhs = vf == 1 ? null_pointer_node : NULL_TREE;
	    goto simtreg_enter_exit;
	  case IFN_GOMP_SIMT_EXIT:
	  simtreg_enter_exit:
	    /* Keep enter/exit calls on SIMT targets; on non-SIMT targets
	       drop them, detaching their virtual definitions first.  */
	    if (vf != 1)
	      continue;
	    unlink_stmt_vdef (stmt);
	    break;
	  case IFN_GOMP_SIMT_LANE:
	  case IFN_GOMP_SIMT_LAST_LANE:
	    /* Lane number is 0 when there is a single lane; otherwise
	       keep the call for later RTL expansion.  */
	    rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMT_VF:
	    /* The SIMT VF is a compile-time constant either way.  */
	    rhs = build_int_cst (type, vf);
	    break;
	  case IFN_GOMP_SIMT_ORDERED_PRED:
	    rhs = vf == 1 ? integer_zero_node : NULL_TREE;
	    if (rhs || !lhs)
	      unlink_stmt_vdef (stmt);
	    break;
	  case IFN_GOMP_SIMT_VOTE_ANY:
	  case IFN_GOMP_SIMT_XCHG_BFLY:
	  case IFN_GOMP_SIMT_XCHG_IDX:
	    /* With one lane these reduce to their own input.  */
	    rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMD_LANE:
	  case IFN_GOMP_SIMD_LAST_LANE:
	    /* SIMD (not SIMT) placeholders: folded to constants on SIMT
	       targets, kept on non-SIMT targets.  Note the inverted VF
	       test relative to the SIMT cases above.  */
	    rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMD_VF:
	    rhs = vf != 1 ? build_one_cst (type) : NULL_TREE;
	    break;
	  default:
	    continue;
	  }
	if (lhs && !rhs)
	  continue;
	stmt = lhs ? gimple_build_assign (lhs, rhs) : gimple_build_nop ();
	gsi_replace (&gsi, stmt, false);
      }
  /* If ompdevlow_adjust_simt_enter installed DECL_VALUE_EXPRs, rescan the
     function (in reverse) for statements mentioning the privatized
     variables: clobbers of them are removed, other statements get their
     operands regimplified to expose the value expressions.  */
  if (regimplify)
    FOR_EACH_BB_REVERSE_FN (bb, cfun)
      for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
	if (walk_gimple_stmt (&gsi, NULL, find_simtpriv_var_op, NULL))
	  {
	    if (gimple_clobber_p (gsi_stmt (gsi)))
	      gsi_remove (&gsi, true);
	    else
	      gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
	  }
  /* On SIMT targets, drop the force-vectorize-loops flag; presumably SIMT
     handling supersedes loop vectorization here — confirm against the
     vectorizer's use of this flag.  */
  if (vf != 1)
    cfun->has_force_vectorize_loops = false;
  return 0;
}
   2693 
   2694 namespace {
   2695 
/* Pass descriptor for the OpenMP device lowering pass "ompdevlow": a
   GIMPLE pass requiring a CFG, providing PROP_gimple_lomp_dev, and
   requesting an SSA update when it finishes.  */

const pass_data pass_data_omp_device_lower =
{
  GIMPLE_PASS, /* type */
  "ompdevlow", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  PROP_gimple_lomp_dev, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};
   2708 
/* Pass wrapper around execute_omp_device_lower.  */

class pass_omp_device_lower : public gimple_opt_pass
{
public:
  pass_omp_device_lower (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_omp_device_lower, ctxt)
  {}

  /* opt_pass methods: */
  /* Run when the function has not been device-lowered yet, or when, under
     -fopenmp, it calls "declare variant" alternatives that still need
     resolving.  */
  virtual bool gate (function *fun)
    {
      return (!(fun->curr_properties & PROP_gimple_lomp_dev)
	      || (flag_openmp
		  && cgraph_node::get (fun->decl)->calls_declare_variant_alt));
    }
  virtual unsigned int execute (function *)
    {
      return execute_omp_device_lower ();
    }

}; // class pass_omp_device_lower
   2729 
   2730 } // anon namespace
   2731 
   2732 gimple_opt_pass *
   2733 make_pass_omp_device_lower (gcc::context *ctxt)
   2734 {
   2735   return new pass_omp_device_lower (ctxt);
   2736 }
   2737 
   2738 /* "omp declare target link" handling pass.  */
   2739 
   2740 namespace {
   2741 
/* Pass descriptor for the "omptargetlink" pass: a GIMPLE pass requiring
   SSA form and requesting an SSA update when it finishes.  */

const pass_data pass_data_omp_target_link =
{
  GIMPLE_PASS,			/* type */
  "omptargetlink",		/* name */
  OPTGROUP_OMP,			/* optinfo_flags */
  TV_NONE,			/* tv_id */
  PROP_ssa,			/* properties_required */
  0,				/* properties_provided */
  0,				/* properties_destroyed */
  0,				/* todo_flags_start */
  TODO_update_ssa,		/* todo_flags_finish */
};
   2754 
/* Pass handling "omp declare target link" variables; see the execute
   member defined below.  */

class pass_omp_target_link : public gimple_opt_pass
{
public:
  pass_omp_target_link (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_omp_target_link, ctxt)
  {}

  /* opt_pass methods: */
  /* Only active in the offload (accelerator) compiler, and there only for
     functions that are actually offloaded.  */
  virtual bool gate (function *fun)
    {
#ifdef ACCEL_COMPILER
      return offloading_function_p (fun->decl);
#else
      (void) fun;
      return false;
#endif
    }

  virtual unsigned execute (function *);
};
   2775 
   2776 /* Callback for walk_gimple_stmt used to scan for link var operands.  */
   2777 
   2778 static tree
   2779 find_link_var_op (tree *tp, int *walk_subtrees, void *)
   2780 {
   2781   tree t = *tp;
   2782 
   2783   if (VAR_P (t)
   2784       && DECL_HAS_VALUE_EXPR_P (t)
   2785       && is_global_var (t)
   2786       && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (t)))
   2787     {
   2788       *walk_subtrees = 0;
   2789       return t;
   2790     }
   2791 
   2792   return NULL_TREE;
   2793 }
   2794 
   2795 unsigned
   2796 pass_omp_target_link::execute (function *fun)
   2797 {
   2798   basic_block bb;
   2799   FOR_EACH_BB_FN (bb, fun)
   2800     {
   2801       gimple_stmt_iterator gsi;
   2802       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
   2803 	{
   2804 	  if (gimple_call_builtin_p (gsi_stmt (gsi), BUILT_IN_GOMP_TARGET))
   2805 	    {
   2806 	      /* Nullify the second argument of __builtin_GOMP_target_ext.  */
   2807 	      gimple_call_set_arg (gsi_stmt (gsi), 1, null_pointer_node);
   2808 	      update_stmt (gsi_stmt (gsi));
   2809 	    }
   2810 	  if (walk_gimple_stmt (&gsi, NULL, find_link_var_op, NULL))
   2811 	    gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
   2812 	}
   2813     }
   2814 
   2815   return 0;
   2816 }
   2817 
   2818 } // anon namespace
   2819 
   2820 gimple_opt_pass *
   2821 make_pass_omp_target_link (gcc::context *ctxt)
   2822 {
   2823   return new pass_omp_target_link (ctxt);
   2824 }
   2825