Home | History | Annotate | Line # | Download | only in nvptx
      1  1.1  mrg /* Target code for NVPTX.
      2  1.1  mrg    Copyright (C) 2014-2022 Free Software Foundation, Inc.
      3  1.1  mrg    Contributed by Bernd Schmidt <bernds (at) codesourcery.com>
      4  1.1  mrg 
      5  1.1  mrg    This file is part of GCC.
      6  1.1  mrg 
      7  1.1  mrg    GCC is free software; you can redistribute it and/or modify it
      8  1.1  mrg    under the terms of the GNU General Public License as published
      9  1.1  mrg    by the Free Software Foundation; either version 3, or (at your
     10  1.1  mrg    option) any later version.
     11  1.1  mrg 
     12  1.1  mrg    GCC is distributed in the hope that it will be useful, but WITHOUT
     13  1.1  mrg    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     14  1.1  mrg    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
     15  1.1  mrg    License for more details.
     16  1.1  mrg 
     17  1.1  mrg    You should have received a copy of the GNU General Public License
     18  1.1  mrg    along with GCC; see the file COPYING3.  If not see
     19  1.1  mrg    <http://www.gnu.org/licenses/>.  */
     20  1.1  mrg 
     21  1.1  mrg #define IN_TARGET_CODE 1
     22  1.1  mrg 
     23  1.1  mrg #include "config.h"
     24  1.1  mrg #include <sstream>
     25  1.1  mrg #include "system.h"
     26  1.1  mrg #include "coretypes.h"
     27  1.1  mrg #include "backend.h"
     28  1.1  mrg #include "target.h"
     29  1.1  mrg #include "rtl.h"
     30  1.1  mrg #include "tree.h"
     31  1.1  mrg #include "cfghooks.h"
     32  1.1  mrg #include "df.h"
     33  1.1  mrg #include "memmodel.h"
     34  1.1  mrg #include "tm_p.h"
     35  1.1  mrg #include "expmed.h"
     36  1.1  mrg #include "optabs.h"
     37  1.1  mrg #include "regs.h"
     38  1.1  mrg #include "emit-rtl.h"
     39  1.1  mrg #include "recog.h"
     40  1.1  mrg #include "diagnostic.h"
     41  1.1  mrg #include "alias.h"
     42  1.1  mrg #include "insn-flags.h"
     43  1.1  mrg #include "output.h"
     44  1.1  mrg #include "insn-attr.h"
     45  1.1  mrg #include "flags.h"
     46  1.1  mrg #include "dojump.h"
     47  1.1  mrg #include "explow.h"
     48  1.1  mrg #include "calls.h"
     49  1.1  mrg #include "varasm.h"
     50  1.1  mrg #include "stmt.h"
     51  1.1  mrg #include "expr.h"
     52  1.1  mrg #include "tm-preds.h"
     53  1.1  mrg #include "tm-constrs.h"
     54  1.1  mrg #include "langhooks.h"
     55  1.1  mrg #include "dbxout.h"
     56  1.1  mrg #include "cfgrtl.h"
     57  1.1  mrg #include "gimple.h"
     58  1.1  mrg #include "stor-layout.h"
     59  1.1  mrg #include "builtins.h"
     60  1.1  mrg #include "omp-general.h"
     61  1.1  mrg #include "omp-low.h"
     62  1.1  mrg #include "omp-offload.h"
     63  1.1  mrg #include "gomp-constants.h"
     64  1.1  mrg #include "dumpfile.h"
     65  1.1  mrg #include "internal-fn.h"
     66  1.1  mrg #include "gimple-iterator.h"
     67  1.1  mrg #include "stringpool.h"
     68  1.1  mrg #include "attribs.h"
     69  1.1  mrg #include "tree-vrp.h"
     70  1.1  mrg #include "tree-ssa-operands.h"
     71  1.1  mrg #include "tree-ssanames.h"
     72  1.1  mrg #include "gimplify.h"
     73  1.1  mrg #include "tree-phinodes.h"
     74  1.1  mrg #include "cfgloop.h"
     75  1.1  mrg #include "fold-const.h"
     76  1.1  mrg #include "intl.h"
     77  1.1  mrg #include "opts.h"
     78  1.1  mrg #include "tree-pretty-print.h"
     79  1.1  mrg #include "rtl-iter.h"
     80  1.1  mrg #include "cgraph.h"
     81  1.1  mrg 
     82  1.1  mrg /* This file should be included last.  */
     83  1.1  mrg #include "target-def.h"
     84  1.1  mrg 
     85  1.1  mrg #define WORKAROUND_PTXJIT_BUG 1
     86  1.1  mrg #define WORKAROUND_PTXJIT_BUG_2 1
     87  1.1  mrg #define WORKAROUND_PTXJIT_BUG_3 1
     88  1.1  mrg 
     89  1.1  mrg /* The PTX concept CTA (Concurrent Thread Array) maps on the CUDA concept thread
     90  1.1  mrg    block, which has had a maximum number of threads of 1024 since CUDA version
     91  1.1  mrg    2.x.  */
     92  1.1  mrg #define PTX_CTA_SIZE 1024
     93  1.1  mrg 
     94  1.1  mrg #define PTX_CTA_NUM_BARRIERS 16
     95  1.1  mrg #define PTX_WARP_SIZE 32
     96  1.1  mrg 
     97  1.1  mrg #define PTX_PER_CTA_BARRIER 0
     98  1.1  mrg #define PTX_NUM_PER_CTA_BARRIERS 1
     99  1.1  mrg #define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS)
    100  1.1  mrg #define PTX_NUM_PER_WORKER_BARRIERS (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS)
    101  1.1  mrg 
    102  1.1  mrg #define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
    103  1.1  mrg #define PTX_MAX_VECTOR_LENGTH PTX_CTA_SIZE
    104  1.1  mrg #define PTX_WORKER_LENGTH 32
    105  1.1  mrg #define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime.  */
    106  1.1  mrg 
    107  1.1  mrg /* The various PTX memory areas an object might reside in.  */
enum nvptx_data_area
{
  DATA_AREA_GENERIC,	/* No qualifier; generic address space.  */
  DATA_AREA_GLOBAL,	/* PTX ".global".  */
  DATA_AREA_SHARED,	/* PTX ".shared".  */
  DATA_AREA_LOCAL,	/* PTX ".local".  */
  DATA_AREA_CONST,	/* PTX ".const".  */
  DATA_AREA_PARAM,	/* PTX ".param".  */
  DATA_AREA_MAX		/* Sentinel; not a real area.  */
};
    118  1.1  mrg 
    119  1.1  mrg /*  We record the data area in the target symbol flags.  */
    120  1.1  mrg #define SYMBOL_DATA_AREA(SYM) \
    121  1.1  mrg   (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
    122  1.1  mrg 		    & 7)
    123  1.1  mrg #define SET_SYMBOL_DATA_AREA(SYM,AREA) \
    124  1.1  mrg   (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
    125  1.1  mrg 
    126  1.1  mrg /* Record the function decls we've written, and the libfuncs and function
    127  1.1  mrg    decls corresponding to them.  */
    128  1.1  mrg static std::stringstream func_decls;
    129  1.1  mrg 
    130  1.1  mrg struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
    131  1.1  mrg {
    132  1.1  mrg   static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
    133  1.1  mrg   static bool equal (rtx a, rtx b) { return a == b; }
    134  1.1  mrg };
    135  1.1  mrg 
    136  1.1  mrg static GTY((cache))
    137  1.1  mrg   hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;
    138  1.1  mrg 
    139  1.1  mrg struct tree_hasher : ggc_cache_ptr_hash<tree_node>
    140  1.1  mrg {
    141  1.1  mrg   static hashval_t hash (tree t) { return htab_hash_pointer (t); }
    142  1.1  mrg   static bool equal (tree a, tree b) { return a == b; }
    143  1.1  mrg };
    144  1.1  mrg 
    145  1.1  mrg static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
    146  1.1  mrg static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
    147  1.1  mrg 
    148  1.1  mrg /* Buffer needed to broadcast across workers and vectors.  This is
    149  1.1  mrg    used for both worker-neutering and worker broadcasting, and
   vector-neutering and broadcasting when vector_length > 32.  It is
    151  1.1  mrg    shared by all functions emitted.  The buffer is placed in shared
    152  1.1  mrg    memory.  It'd be nice if PTX supported common blocks, because then
    153  1.1  mrg    this could be shared across TUs (taking the largest size).  */
    154  1.1  mrg static unsigned oacc_bcast_size;
    155  1.1  mrg static unsigned oacc_bcast_partition;
    156  1.1  mrg static unsigned oacc_bcast_align;
    157  1.1  mrg static GTY(()) rtx oacc_bcast_sym;
    158  1.1  mrg 
    159  1.1  mrg /* Buffer needed for worker reductions.  This has to be distinct from
    160  1.1  mrg    the worker broadcast array, as both may be live concurrently.  */
    161  1.1  mrg static unsigned worker_red_size;
    162  1.1  mrg static unsigned worker_red_align;
    163  1.1  mrg static GTY(()) rtx worker_red_sym;
    164  1.1  mrg 
    165  1.1  mrg /* Buffer needed for vector reductions, when vector_length >
    166  1.1  mrg    PTX_WARP_SIZE.  This has to be distinct from the worker broadcast
    167  1.1  mrg    array, as both may be live concurrently.  */
    168  1.1  mrg static unsigned vector_red_size;
    169  1.1  mrg static unsigned vector_red_align;
    170  1.1  mrg static unsigned vector_red_partition;
    171  1.1  mrg static GTY(()) rtx vector_red_sym;
    172  1.1  mrg 
    173  1.1  mrg /* Shared memory block for gang-private variables.  */
    174  1.1  mrg static unsigned gang_private_shared_size;
    175  1.1  mrg static unsigned gang_private_shared_align;
    176  1.1  mrg static GTY(()) rtx gang_private_shared_sym;
    177  1.1  mrg static hash_map<tree_decl_hash, unsigned int> gang_private_shared_hmap;
    178  1.1  mrg 
    179  1.1  mrg /* Global lock variable, needed for 128bit worker & gang reductions.  */
    180  1.1  mrg static GTY(()) tree global_lock_var;
    181  1.1  mrg 
    182  1.1  mrg /* True if any function references __nvptx_stacks.  */
    183  1.1  mrg static bool need_softstack_decl;
    184  1.1  mrg 
    185  1.1  mrg /* True if any function references __nvptx_uni.  */
    186  1.1  mrg static bool need_unisimt_decl;
    187  1.1  mrg 
    188  1.1  mrg static int nvptx_mach_max_workers ();
    189  1.1  mrg 
    190  1.1  mrg /* Allocate a new, cleared machine_function structure.  */
    191  1.1  mrg 
    192  1.1  mrg static struct machine_function *
    193  1.1  mrg nvptx_init_machine_status (void)
    194  1.1  mrg {
    195  1.1  mrg   struct machine_function *p = ggc_cleared_alloc<machine_function> ();
    196  1.1  mrg   p->return_mode = VOIDmode;
    197  1.1  mrg   return p;
    198  1.1  mrg }
    199  1.1  mrg 
    200  1.1  mrg /* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
    201  1.1  mrg    and -fopenacc is also enabled.  */
    202  1.1  mrg 
    203  1.1  mrg static void
    204  1.1  mrg diagnose_openacc_conflict (bool optval, const char *optname)
    205  1.1  mrg {
    206  1.1  mrg   if (flag_openacc && optval)
    207  1.1  mrg     error ("option %s is not supported together with %<-fopenacc%>", optname);
    208  1.1  mrg }
    209  1.1  mrg 
static enum ptx_version
first_ptx_version_supporting_sm (enum ptx_isa sm)
{
  /* Map an ISA level to the earliest PTX version that can target it;
     used both to pick a default -mptx and to reject too-old explicit
     -mptx settings.  */
  switch (sm)
    {
    case PTX_ISA_SM30:
      return PTX_VERSION_3_0;
    case PTX_ISA_SM35:
      return PTX_VERSION_3_1;
    case PTX_ISA_SM53:
      return PTX_VERSION_4_2;
    case PTX_ISA_SM70:
      return PTX_VERSION_6_0;
    case PTX_ISA_SM75:
      return PTX_VERSION_6_3;
    case PTX_ISA_SM80:
      return PTX_VERSION_7_0;
    default:
      gcc_unreachable ();
    }
}
    231  1.1  mrg 
    232  1.1  mrg static enum ptx_version
    233  1.1  mrg default_ptx_version_option (void)
    234  1.1  mrg {
    235  1.1  mrg   enum ptx_version first
    236  1.1  mrg     = first_ptx_version_supporting_sm ((enum ptx_isa) ptx_isa_option);
    237  1.1  mrg 
    238  1.1  mrg   /* Pick a version that supports the sm.  */
    239  1.1  mrg   enum ptx_version res = first;
    240  1.1  mrg 
    241  1.1  mrg   /* Pick at least 3.1.  This has been the smallest version historically.  */
    242  1.1  mrg   res = MAX (res, PTX_VERSION_3_1);
    243  1.1  mrg 
    244  1.1  mrg   /* Pick at least 6.0, to enable using bar.warp.sync to have a way to force
    245  1.1  mrg      warp convergence.  */
    246  1.1  mrg   res = MAX (res, PTX_VERSION_6_0);
    247  1.1  mrg 
    248  1.1  mrg   /* Verify that we pick a version that supports the sm.  */
    249  1.1  mrg   gcc_assert (first <= res);
    250  1.1  mrg   return res;
    251  1.1  mrg }
    252  1.1  mrg 
    253  1.1  mrg static const char *
    254  1.1  mrg ptx_version_to_string (enum ptx_version v)
    255  1.1  mrg {
    256  1.1  mrg   switch (v)
    257  1.1  mrg     {
    258  1.1  mrg     case PTX_VERSION_3_0:
    259  1.1  mrg       return "3.0";
    260  1.1  mrg     case PTX_VERSION_3_1:
    261  1.1  mrg       return "3.1";
    262  1.1  mrg     case PTX_VERSION_4_2:
    263  1.1  mrg       return "4.2";
    264  1.1  mrg     case PTX_VERSION_6_0:
    265  1.1  mrg       return "6.0";
    266  1.1  mrg     case PTX_VERSION_6_3:
    267  1.1  mrg       return "6.3";
    268  1.1  mrg     case PTX_VERSION_7_0:
    269  1.1  mrg       return "7.0";
    270  1.1  mrg     default:
    271  1.1  mrg       gcc_unreachable ();
    272  1.1  mrg     }
    273  1.1  mrg }
    274  1.1  mrg 
    275  1.1  mrg unsigned int
    276  1.1  mrg ptx_version_to_number (enum ptx_version v, bool major_p)
    277  1.1  mrg {
    278  1.1  mrg   switch (v)
    279  1.1  mrg     {
    280  1.1  mrg     case PTX_VERSION_3_0:
    281  1.1  mrg       return major_p ? 3 : 0;
    282  1.1  mrg     case PTX_VERSION_3_1:
    283  1.1  mrg       return major_p ? 3 : 1;
    284  1.1  mrg     case PTX_VERSION_4_2:
    285  1.1  mrg       return major_p ? 4 : 2;
    286  1.1  mrg     case PTX_VERSION_6_0:
    287  1.1  mrg       return major_p ? 6 : 0;
    288  1.1  mrg     case PTX_VERSION_6_3:
    289  1.1  mrg       return major_p ? 6 : 3;
    290  1.1  mrg     case PTX_VERSION_7_0:
    291  1.1  mrg       return major_p ? 7 : 0;
    292  1.1  mrg     default:
    293  1.1  mrg       gcc_unreachable ();
    294  1.1  mrg     }
    295  1.1  mrg }
    296  1.1  mrg 
static const char *
sm_version_to_string (enum ptx_isa sm)
{
  switch (sm)
    {
      /* Expand one 'case PTX_ISA_SMXX: return "XX";' arm per ISA level
	 listed in nvptx-sm.def, so new levels are picked up
	 automatically.  */
#define NVPTX_SM(XX, SEP)			\
      case PTX_ISA_SM ## XX:			\
	return #XX;
#include "nvptx-sm.def"
#undef NVPTX_SM
    default:
      gcc_unreachable ();
    }
}
    311  1.1  mrg 
    312  1.1  mrg static void
    313  1.1  mrg handle_ptx_version_option (void)
    314  1.1  mrg {
    315  1.1  mrg   if (!OPTION_SET_P (ptx_version_option)
    316  1.1  mrg       || ptx_version_option == PTX_VERSION_default)
    317  1.1  mrg     {
    318  1.1  mrg       ptx_version_option = default_ptx_version_option ();
    319  1.1  mrg       return;
    320  1.1  mrg     }
    321  1.1  mrg 
    322  1.1  mrg   enum ptx_version first
    323  1.1  mrg     = first_ptx_version_supporting_sm ((enum ptx_isa) ptx_isa_option);
    324  1.1  mrg 
    325  1.1  mrg   if (ptx_version_option < first)
    326  1.1  mrg     error ("PTX version (%<-mptx%>) needs to be at least %s to support selected"
    327  1.1  mrg 	   " %<-misa%> (sm_%s)", ptx_version_to_string (first),
    328  1.1  mrg 	   sm_version_to_string ((enum ptx_isa)ptx_isa_option));
    329  1.1  mrg }
    330  1.1  mrg 
    331  1.1  mrg /* Implement TARGET_OPTION_OVERRIDE.  */
    332  1.1  mrg 
static void
nvptx_option_override (void)
{
  init_machine_status = nvptx_init_machine_status;

  /* Resolve -mptx (possibly derived from -misa) before anything else
     looks at it.  */
  handle_ptx_version_option ();

  /* Set toplevel_reorder, unless explicitly disabled.  We need
     reordering so that we emit necessary assembler decls of
     undeclared variables. */
  if (!OPTION_SET_P (flag_toplevel_reorder))
    flag_toplevel_reorder = 1;

  /* NOTE(review): statement (non-bind) debug markers are disabled
     unconditionally; presumably not representable here -- confirm.  */
  debug_nonbind_markers_p = 0;

  /* Set flag_no_common, unless explicitly disabled.  We fake common
     using .weak, and that's not entirely accurate, so avoid it
     unless forced.  */
  if (!OPTION_SET_P (flag_no_common))
    flag_no_common = 1;

  /* The patch area requires nops, which we don't have.  */
  HOST_WIDE_INT patch_area_size, patch_area_entry;
  parse_and_check_patch_area (flag_patchable_function_entry, false,
			      &patch_area_size, &patch_area_entry);
  if (patch_area_size > 0)
    sorry ("not generating patch area, nops not supported");

  /* Assumes that it will see only hard registers.  */
  flag_var_tracking = 0;

  /* If unset (negative), follow the -O level.  */
  if (nvptx_optimize < 0)
    nvptx_optimize = optimize > 0;

  /* Hash tables recording declared/needed fndecls and libfuncs.  */
  declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  declared_libfuncs_htab
    = hash_table<declared_libfunc_hasher>::create_ggc (17);

  /* Pre-create the .shared-memory symbols for the broadcast, reduction
     and gang-private buffers; their sizes/alignments are accumulated
     as functions are emitted.  */
  oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
  SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED);
  oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
  oacc_bcast_partition = 0;

  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
  SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  vector_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__vector_red");
  SET_SYMBOL_DATA_AREA (vector_red_sym, DATA_AREA_SHARED);
  vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
  vector_red_partition = 0;

  gang_private_shared_sym = gen_rtx_SYMBOL_REF (Pmode, "__gang_private_shared");
  SET_SYMBOL_DATA_AREA (gang_private_shared_sym, DATA_AREA_SHARED);
  gang_private_shared_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  /* These options cannot be combined with -fopenacc.  */
  diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
  diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
  diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");

  /* -mgomp implies soft stacks and uniform SIMT.  */
  if (TARGET_GOMP)
    target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
}
    397  1.1  mrg 
    398  1.1  mrg /* Return a ptx type for MODE.  If PROMOTE, then use .u32 for QImode to
   deal with ptx idiosyncrasies.  */
    400  1.1  mrg 
const char *
nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
{
  switch (mode)
    {
    case E_BLKmode:
      return ".b8";
    case E_BImode:
      return ".pred";
    case E_QImode:
      /* QImode is widened to 32 bits when PROMOTE is set; see the
	 function comment.  */
      if (promote)
	return ".u32";
      else
	return ".u8";
    case E_HImode:
      return ".u16";
    case E_SImode:
      return ".u32";
    case E_DImode:
      return ".u64";

    case E_HFmode:
      return ".f16";
    case E_SFmode:
      return ".f32";
    case E_DFmode:
      return ".f64";

    /* Vector modes map to PTX .v2 packed types.  */
    case E_V2SImode:
      return ".v2.u32";
    case E_V2DImode:
      return ".v2.u64";

    default:
      gcc_unreachable ();
    }
}
    438  1.1  mrg 
    439  1.1  mrg /* Encode the PTX data area that DECL (which might not actually be a
    440  1.1  mrg    _DECL) should reside in.  */
    441  1.1  mrg 
static void
nvptx_encode_section_info (tree decl, rtx rtl, int first)
{
  default_encode_section_info (decl, rtl, first);
  /* Classify only on the first call, and only for objects in memory;
     RTL is then a MEM whose address is the symbol we tag.  */
  if (first && MEM_P (rtl))
    {
      nvptx_data_area area = DATA_AREA_GENERIC;

      if (TREE_CONSTANT (decl))
	area = DATA_AREA_CONST;
      else if (TREE_CODE (decl) == VAR_DECL)
	{
	  if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
	    {
	      area = DATA_AREA_SHARED;
	      /* We have no way to statically initialize .shared
		 memory, so reject initializers outright.  */
	      if (DECL_INITIAL (decl))
		error ("static initialization of variable %q+D in %<.shared%>"
		       " memory is not supported", decl);
	    }
	  else
	    area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
	}

      SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
    }
}
    468  1.1  mrg 
    469  1.1  mrg /* Return the PTX name of the data area in which SYM should be
    470  1.1  mrg    placed.  The symbol must have already been processed by
   nvptx_encode_section_info, or equivalent.  */
    472  1.1  mrg 
static const char *
section_for_sym (rtx sym)
{
  nvptx_data_area area = SYMBOL_DATA_AREA (sym);
  /* Same order as nvptx_data_area enum -- keep the two in sync.
     DATA_AREA_GENERIC gets no qualifier at all.  */
  static char const *const areas[] =
    {"", ".global", ".shared", ".local", ".const", ".param"};

  return areas[area];
}
    483  1.1  mrg 
    484  1.1  mrg /* Similarly for a decl.  */
    485  1.1  mrg 
    486  1.1  mrg static const char *
    487  1.1  mrg section_for_decl (const_tree decl)
    488  1.1  mrg {
    489  1.1  mrg   return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
    490  1.1  mrg }
    491  1.1  mrg 
    492  1.1  mrg /* Check NAME for special function names and redirect them by returning a
    493  1.1  mrg    replacement.  This applies to malloc, free and realloc, for which we
    494  1.1  mrg    want to use libgcc wrappers, and call, which triggers a bug in
    495  1.1  mrg    ptxas.  We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
    496  1.1  mrg    not active in an offload compiler -- the names are all set by the
    497  1.1  mrg    host-side compiler.  */
    498  1.1  mrg 
static const char *
nvptx_name_replacement (const char *name)
{
  /* Symbols diverted to libgcc wrappers (malloc/free/realloc), plus
     "call", which triggers a ptxas bug (see function comment).  */
  static const struct { const char *orig; const char *repl; } remap[] = {
    { "call", "__nvptx_call" },
    { "malloc", "__nvptx_malloc" },
    { "free", "__nvptx_free" },
    { "realloc", "__nvptx_realloc" },
  };

  for (size_t ix = 0; ix < sizeof (remap) / sizeof (remap[0]); ix++)
    if (strcmp (name, remap[ix].orig) == 0)
      return remap[ix].repl;

  return name;
}
    512  1.1  mrg 
    513  1.1  mrg /* Return NULL if NAME contains no dot.  Otherwise return a copy of NAME
    514  1.1  mrg    with the dots replaced with dollar signs.  */
    515  1.1  mrg 
    516  1.1  mrg static char *
    517  1.1  mrg nvptx_replace_dot (const char *name)
    518  1.1  mrg {
    519  1.1  mrg   if (strchr (name, '.') == NULL)
    520  1.1  mrg     return NULL;
    521  1.1  mrg 
    522  1.1  mrg   char *p = xstrdup (name);
    523  1.1  mrg   for (size_t i = 0; i < strlen (p); ++i)
    524  1.1  mrg     if (p[i] == '.')
    525  1.1  mrg       p[i] = '$';
    526  1.1  mrg   return p;
    527  1.1  mrg }
    528  1.1  mrg 
    529  1.1  mrg /* If MODE should be treated as two registers of an inner mode, return
    530  1.1  mrg    that inner mode.  Otherwise return VOIDmode.  */
    531  1.1  mrg 
    532  1.1  mrg static machine_mode
    533  1.1  mrg maybe_split_mode (machine_mode mode)
    534  1.1  mrg {
    535  1.1  mrg   if (COMPLEX_MODE_P (mode))
    536  1.1  mrg     return GET_MODE_INNER (mode);
    537  1.1  mrg 
    538  1.1  mrg   if (mode == TImode)
    539  1.1  mrg     return DImode;
    540  1.1  mrg 
    541  1.1  mrg   return VOIDmode;
    542  1.1  mrg }
    543  1.1  mrg 
    544  1.1  mrg /* Return true if mode should be treated as two registers.  */
    545  1.1  mrg 
static bool
split_mode_p (machine_mode mode)
{
  /* A non-VOIDmode answer from maybe_split_mode means "two regs".  */
  return maybe_split_mode (mode) != VOIDmode;
}
    551  1.1  mrg 
    552  1.1  mrg /* Output a register, subreg, or register pair (with optional
    553  1.1  mrg    enclosing braces).  */
    554  1.1  mrg 
static void
output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
	    int subreg_offset = -1)
{
  if (inner_mode == VOIDmode)
    {
      /* Plain (non-split) register: hard registers print their fixed
	 names, pseudos print as %rNN.  */
      if (HARD_REGISTER_NUM_P (regno))
	fprintf (file, "%s", reg_names[regno]);
      else
	fprintf (file, "%%r%d", regno);
    }
  else if (subreg_offset >= 0)
    {
      /* One half of a split register: base name plus a $OFFSET
	 suffix.  */
      output_reg (file, regno, VOIDmode);
      fprintf (file, "$%d", subreg_offset);
    }
  else
    {
      /* Register pair: the half at offset GET_MODE_SIZE (inner_mode)
	 then the half at offset 0.  subreg_offset == -1 (the default)
	 requests enclosing braces; any other negative value omits
	 them.  */
      if (subreg_offset == -1)
	fprintf (file, "{");
      output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
      fprintf (file, ",");
      output_reg (file, regno, inner_mode, 0);
      if (subreg_offset == -1)
	fprintf (file, "}");
    }
}
    582  1.1  mrg 
    583  1.1  mrg /* Emit forking instructions for MASK.  */
    584  1.1  mrg 
    585  1.1  mrg static void
    586  1.1  mrg nvptx_emit_forking (unsigned mask, bool is_call)
    587  1.1  mrg {
    588  1.1  mrg   mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
    589  1.1  mrg 	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
    590  1.1  mrg   if (mask)
    591  1.1  mrg     {
    592  1.1  mrg       rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
    593  1.1  mrg 
    594  1.1  mrg       /* Emit fork at all levels.  This helps form SESE regions, as
    595  1.1  mrg 	 it creates a block with a single successor before entering a
    596  1.1  mrg 	 partitooned region.  That is a good candidate for the end of
    597  1.1  mrg 	 an SESE region.  */
    598  1.1  mrg       emit_insn (gen_nvptx_fork (op));
    599  1.1  mrg       emit_insn (gen_nvptx_forked (op));
    600  1.1  mrg     }
    601  1.1  mrg }
    602  1.1  mrg 
    603  1.1  mrg /* Emit joining instructions for MASK.  */
    604  1.1  mrg 
    605  1.1  mrg static void
    606  1.1  mrg nvptx_emit_joining (unsigned mask, bool is_call)
    607  1.1  mrg {
    608  1.1  mrg   mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
    609  1.1  mrg 	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
    610  1.1  mrg   if (mask)
    611  1.1  mrg     {
    612  1.1  mrg       rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
    613  1.1  mrg 
    614  1.1  mrg       /* Emit joining for all non-call pars to ensure there's a single
    615  1.1  mrg 	 predecessor for the block the join insn ends up in.  This is
    616  1.1  mrg 	 needed for skipping entire loops.  */
    617  1.1  mrg       emit_insn (gen_nvptx_joining (op));
    618  1.1  mrg       emit_insn (gen_nvptx_join (op));
    619  1.1  mrg     }
    620  1.1  mrg }
    621  1.1  mrg 
    622  1.1  mrg 
    623  1.1  mrg /* Determine whether MODE and TYPE (possibly NULL) should be passed or
    625  1.1  mrg    returned in memory.  Integer and floating types supported by the
    626  1.1  mrg    machine are passed in registers, everything else is passed in
    627  1.1  mrg    memory.  Complex types are split.  */
    628  1.1  mrg 
    629  1.1  mrg static bool
    630  1.1  mrg pass_in_memory (machine_mode mode, const_tree type, bool for_return)
    631  1.1  mrg {
    632  1.1  mrg   if (type)
    633  1.1  mrg     {
    634  1.1  mrg       if (AGGREGATE_TYPE_P (type))
    635  1.1  mrg 	return true;
    636  1.1  mrg       if (TREE_CODE (type) == VECTOR_TYPE)
    637  1.1  mrg 	return true;
    638  1.1  mrg     }
    639  1.1  mrg 
    640  1.1  mrg   if (!for_return && COMPLEX_MODE_P (mode))
    641  1.1  mrg     /* Complex types are passed as two underlying args.  */
    642  1.1  mrg     mode = GET_MODE_INNER (mode);
    643  1.1  mrg 
    644  1.1  mrg   if (GET_MODE_CLASS (mode) != MODE_INT
    645  1.1  mrg       && GET_MODE_CLASS (mode) != MODE_FLOAT)
    646  1.1  mrg     return true;
    647  1.1  mrg 
    648  1.1  mrg   if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
    649  1.1  mrg     return true;
    650  1.1  mrg 
    651  1.1  mrg   return false;
    652  1.1  mrg }
    653  1.1  mrg 
    654  1.1  mrg /* A non-memory argument of mode MODE is being passed, determine the mode it
    655  1.1  mrg    should be promoted to.  This is also used for determining return
    656  1.1  mrg    type promotion.  */
    657  1.1  mrg 
    658  1.1  mrg static machine_mode
    659  1.1  mrg promote_arg (machine_mode mode, bool prototyped)
    660  1.1  mrg {
    661  1.1  mrg   if (!prototyped && mode == SFmode)
    662  1.1  mrg     /* K&R float promotion for unprototyped functions.  */
    663  1.1  mrg     mode = DFmode;
    664  1.1  mrg   else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
    665  1.1  mrg     mode = SImode;
    666  1.1  mrg 
    667  1.1  mrg   return mode;
    668  1.1  mrg }
    669  1.1  mrg 
    670  1.1  mrg /* A non-memory return type of MODE is being returned.  Determine the
    671  1.1  mrg    mode it should be promoted to.  */
    672  1.1  mrg 
static machine_mode
promote_return (machine_mode mode)
{
  /* Returns promote like prototyped args: sub-word widening only, no
     K&R float promotion.  */
  return promote_arg (mode, true);
}
    678  1.1  mrg 
    679  1.1  mrg /* Implement TARGET_FUNCTION_ARG.  */
    680  1.1  mrg 
    681  1.1  mrg static rtx
    682  1.1  mrg nvptx_function_arg (cumulative_args_t, const function_arg_info &arg)
    683  1.1  mrg {
    684  1.1  mrg   if (arg.end_marker_p () || !arg.named)
    685  1.1  mrg     return NULL_RTX;
    686  1.1  mrg 
    687  1.1  mrg   return gen_reg_rtx (arg.mode);
    688  1.1  mrg }
    689  1.1  mrg 
/* Implement TARGET_FUNCTION_INCOMING_ARG.  */

static rtx
nvptx_function_incoming_arg (cumulative_args_t cum_v,
			     const function_arg_info &arg)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  /* Neither the end-of-arguments marker nor unnamed arguments occupy
     an incoming argument slot.  */
  if (arg.end_marker_p () || !arg.named)
    return NULL_RTX;

  /* Wrap the argument's ordinal position (cum->count) in an
     UNSPEC_ARG_REG so later passes can identify which incoming PTX
     argument this is.  */
  /* No need to deal with split modes here, the only case that can
     happen is complex modes and those are dealt with by
     TARGET_SPLIT_COMPLEX_ARG.  */
  return gen_rtx_UNSPEC (arg.mode,
			 gen_rtvec (1, GEN_INT (cum->count)),
			 UNSPEC_ARG_REG);
}
    708  1.1  mrg 
    709  1.1  mrg /* Implement TARGET_FUNCTION_ARG_ADVANCE.  */
    710  1.1  mrg 
    711  1.1  mrg static void
    712  1.1  mrg nvptx_function_arg_advance (cumulative_args_t cum_v, const function_arg_info &)
    713  1.1  mrg {
    714  1.1  mrg   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
    715  1.1  mrg 
    716  1.1  mrg   cum->count++;
    717  1.1  mrg }
    718  1.1  mrg 
    719  1.1  mrg /* Implement TARGET_FUNCTION_ARG_BOUNDARY.
    720  1.1  mrg 
    721  1.1  mrg    For nvptx This is only used for varadic args.  The type has already
    722  1.1  mrg    been promoted and/or converted to invisible reference.  */
    723  1.1  mrg 
    724  1.1  mrg static unsigned
    725  1.1  mrg nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
    726  1.1  mrg {
    727  1.1  mrg   return GET_MODE_ALIGNMENT (mode);
    728  1.1  mrg }
    729  1.1  mrg 
    730  1.1  mrg /* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
    731  1.1  mrg 
    732  1.1  mrg    For nvptx, we know how to handle functions declared as stdarg: by
    733  1.1  mrg    passing an extra pointer to the unnamed arguments.  However, the
    734  1.1  mrg    Fortran frontend can produce a different situation, where a
    735  1.1  mrg    function pointer is declared with no arguments, but the actual
    736  1.1  mrg    function and calls to it take more arguments.  In that case, we
    737  1.1  mrg    want to ensure the call matches the definition of the function.  */
    738  1.1  mrg 
    739  1.1  mrg static bool
    740  1.1  mrg nvptx_strict_argument_naming (cumulative_args_t cum_v)
    741  1.1  mrg {
    742  1.1  mrg   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
    743  1.1  mrg 
    744  1.1  mrg   return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
    745  1.1  mrg }
    746  1.1  mrg 
/* Implement TARGET_LIBCALL_VALUE.  */

static rtx
nvptx_libcall_value (machine_mode mode, const_rtx)
{
  if (!cfun || !cfun->machine->doing_call)
    /* Pretend to return in a hard reg for early uses before pseudos can be
       generated.  */
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);

  /* Within a call sequence, return values live in a fresh pseudo.  */
  return gen_reg_rtx (mode);
}
    759  1.1  mrg 
/* TARGET_FUNCTION_VALUE implementation.  Returns an RTX representing the place
   where function FUNC returns or receives a value of data type TYPE.  */

static rtx
nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
		      bool outgoing)
{
  machine_mode mode = promote_return (TYPE_MODE (type));

  if (outgoing)
    {
      gcc_assert (cfun);
      /* Record the promoted mode; write_return_type consults (and may
	 clear) this field when emitting the function prologue.  */
      cfun->machine->return_mode = mode;
      return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
    }

  /* Incoming function values use the same convention as libcall
     results.  */
  return nvptx_libcall_value (mode, NULL_RTX);
}
    778  1.1  mrg 
    779  1.1  mrg /* Implement TARGET_FUNCTION_VALUE_REGNO_P.  */
    780  1.1  mrg 
    781  1.1  mrg static bool
    782  1.1  mrg nvptx_function_value_regno_p (const unsigned int regno)
    783  1.1  mrg {
    784  1.1  mrg   return regno == NVPTX_RETURN_REGNUM;
    785  1.1  mrg }
    786  1.1  mrg 
    787  1.1  mrg /* Types with a mode other than those supported by the machine are passed by
    788  1.1  mrg    reference in memory.  */
    789  1.1  mrg 
    790  1.1  mrg static bool
    791  1.1  mrg nvptx_pass_by_reference (cumulative_args_t, const function_arg_info &arg)
    792  1.1  mrg {
    793  1.1  mrg   return pass_in_memory (arg.mode, arg.type, false);
    794  1.1  mrg }
    795  1.1  mrg 
    796  1.1  mrg /* Implement TARGET_RETURN_IN_MEMORY.  */
    797  1.1  mrg 
    798  1.1  mrg static bool
    799  1.1  mrg nvptx_return_in_memory (const_tree type, const_tree)
    800  1.1  mrg {
    801  1.1  mrg   return pass_in_memory (TYPE_MODE (type), type, true);
    802  1.1  mrg }
    803  1.1  mrg 
/* Implement TARGET_PROMOTE_FUNCTION_MODE.  */

static machine_mode
nvptx_promote_function_mode (const_tree type, machine_mode mode,
			     int *ARG_UNUSED (punsignedp),
			     const_tree funtype, int for_return)
{
  /* Treat the promotion as prototyped when this is a return value,
     when there is no front-end type (e.g. libcalls), or when FUNTYPE
     carries a prototype (TYPE_ARG_TYPES non-null).  Only genuinely
     unprototyped arguments get the K&R float->double promotion.  */
  return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
}
    813  1.1  mrg 
/* Helper for write_arg.  Emit a single PTX argument of MODE, either
   in a prototype, or as copy in a function prologue.  ARGNO is the
   index of this argument in the PTX function.  FOR_REG is negative,
   if we're emitting the PTX prototype.  It is zero if we're copying
   to an argument register and it is greater than zero if we're
   copying to a specific hard register.  Returns the next argument
   index.  */

static int
write_arg_mode (std::stringstream &s, int for_reg, int argno,
		machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);

  if (for_reg < 0)
    {
      /* Writing PTX prototype.  The first argument opens the paren,
	 subsequent ones are comma-separated.  */
      s << (argno ? ", " : " (");
      s << ".param" << ptx_type << " %in_ar" << argno;
    }
  else
    {
      /* Function prologue: declare the destination register ...  */
      s << "\t.reg" << ptx_type << " ";
      if (for_reg)
	s << reg_names[for_reg];
      else
	s << "%ar" << argno;
      s << ";\n";
      if (argno >= 0)
	{
	  /* ... and load it from the corresponding .param slot.  */
	  s << "\tld.param" << ptx_type << " ";
	  if (for_reg)
	    s << reg_names[for_reg];
	  else
	    s << "%ar" << argno;
	  s << ", [%in_ar" << argno << "];\n";
	}
    }
  return argno + 1;
}
    853  1.1  mrg 
/* Process function parameter TYPE to emit one or more PTX
   arguments. S, FOR_REG and ARGNO as for write_arg_mode.  PROTOTYPED
   is true, if this is a prototyped function, rather than an old-style
   C declaration.  Returns the next argument number to use.

   The promotion behavior here must match the regular GCC function
   parameter marshalling machinery.  */

static int
write_arg_type (std::stringstream &s, int for_reg, int argno,
		tree type, bool prototyped)
{
  machine_mode mode = TYPE_MODE (type);

  /* Nothing to emit for a void parameter list.  */
  if (mode == VOIDmode)
    return argno;

  if (pass_in_memory (mode, type, false))
    /* Passed by invisible reference: the argument is a pointer.  */
    mode = Pmode;
  else
    {
      bool split = TREE_CODE (type) == COMPLEX_TYPE;

      if (split)
	{
	  /* Complex types are sent as two separate args.  */
	  type = TREE_TYPE (type);
	  mode = TYPE_MODE (type);
	  prototyped = true;
	}

      mode = promote_arg (mode, prototyped);
      /* Emit the first half here; the second half falls through to
	 the common emission below.  */
      if (split)
	argno = write_arg_mode (s, for_reg, argno, mode);
    }

  return write_arg_mode (s, for_reg, argno, mode);
}
    892  1.1  mrg 
/* Emit a PTX return as a prototype or function prologue declaration
   for MODE.  */

static void
write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
  const char *pfx = "\t.reg";
  const char *sfx = ";\n";

  /* In a prototype the return value is a parenthesized .param; in a
     prologue it is an ordinary register declaration.  */
  if (for_proto)
    pfx = "(.param", sfx = "_out) ";

  s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
}
    908  1.1  mrg 
/* Process a function return TYPE to emit a PTX return as a prototype
   or function prologue declaration.  Returns true if return is via an
   additional pointer parameter.  The promotion behavior here must
   match the regular GCC function return mashalling.  */

static bool
write_return_type (std::stringstream &s, bool for_proto, tree type)
{
  machine_mode mode = TYPE_MODE (type);

  /* A void function declares no return value.  */
  if (mode == VOIDmode)
    return false;

  bool return_in_mem = pass_in_memory (mode, type, true);

  if (return_in_mem)
    {
      /* In a prototype, a memory return is represented purely by the
	 extra pointer argument the caller adds, not here.  */
      if (for_proto)
	return return_in_mem;

      /* Named return values can cause us to return a pointer as well
	 as expect an argument for the return location.  This is
	 optimization-level specific, so no caller can make use of
	 this data, but more importantly for us, we must ensure it
	 doesn't change the PTX prototype.  */
      mode = (machine_mode) cfun->machine->return_mode;

      if (mode == VOIDmode)
	return return_in_mem;

      /* Clear return_mode to inhibit copy of retval to non-existent
	 retval parameter.  */
      cfun->machine->return_mode = VOIDmode;
    }
  else
    mode = promote_return (mode);

  write_return_mode (s, for_proto, mode);

  return return_in_mem;
}
    950  1.1  mrg 
    951  1.1  mrg /* Look for attributes in ATTRS that would indicate we must write a function
    952  1.1  mrg    as a .entry kernel rather than a .func.  Return true if one is found.  */
    953  1.1  mrg 
    954  1.1  mrg static bool
    955  1.1  mrg write_as_kernel (tree attrs)
    956  1.1  mrg {
    957  1.1  mrg   return (lookup_attribute ("kernel", attrs) != NULL_TREE
    958  1.1  mrg 	  || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
    959  1.1  mrg 	      && lookup_attribute ("oacc function", attrs) != NULL_TREE));
    960  1.1  mrg   /* For OpenMP target regions, the corresponding kernel entry is emitted from
    961  1.1  mrg      write_omp_entry as a separate function.  */
    962  1.1  mrg }
    963  1.1  mrg 
/* Emit a linker marker for a function decl or defn.  */

static void
write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
		 const char *name)
{
  s << "\n// BEGIN";
  if (globalize)
    s << " GLOBAL";
  if (is_defn)
    s << " FUNCTION DEF: ";
  else
    s << " FUNCTION DECL: ";
  s << name << "\n";
}
    976  1.1  mrg 
/* Emit a linker marker for a variable decl or defn.  */

static void
write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
{
  fprintf (file, "\n// BEGIN%s VAR %s: ",
	   globalize ? " GLOBAL" : "",
	   is_defn ? "DEF" : "DECL");
  /* Use assemble_name_raw so NAME is emitted without further
     mangling.  */
  assemble_name_raw (file, name);
  fputs ("\n", file);
}
    988  1.1  mrg 
/* Helper function for write_fn_proto.  Emit the PTX prototype line
   (or definition header, if IS_DEFN) for DECL under the PTX-safe
   NAME into S.  */

static void
write_fn_proto_1 (std::stringstream &s, bool is_defn,
		  const char *name, const_tree decl)
{
  /* Functions carrying the "alias" attribute get no linker marker
     here.  */
  if (lookup_attribute ("alias", DECL_ATTRIBUTES (decl)) == NULL)
    write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);

  /* PTX declaration.  */
  if (DECL_EXTERNAL (decl))
    s << ".extern ";
  else if (TREE_PUBLIC (decl))
    s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
  s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");

  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);

  /* atomic_compare_exchange_$n builtins have an exceptional calling
     convention.  */
  int not_atomic_weak_arg = -1;
  if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
    switch (DECL_FUNCTION_CODE (decl))
      {
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
	/* These atomics skip the 'weak' parm in an actual library
	   call.  We must skip it in the prototype too.  */
	not_atomic_weak_arg = 3;
	break;

      default:
	break;
      }

  /* Declare the result.  */
  bool return_in_mem = write_return_type (s, true, result_type);

  s << name;

  int argno = 0;

  /* Emit argument list.  A memory return adds a leading pointer
     argument for the return slot.  */
  if (return_in_mem)
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  /* We get:
     NULL in TYPE_ARG_TYPES, for old-style functions
     NULL in DECL_ARGUMENTS, for builtin functions without another
       declaration.
     So we have to pick the best one we have.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  /* NOT_ATOMIC_WEAK_ARG counts down; when it reaches zero we are at
     the 'weak' bool parameter of the atomic builtins above, which is
     asserted and skipped.  */
  for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      if (not_atomic_weak_arg)
	argno = write_arg_type (s, -1, argno, type, prototyped);
      else
	gcc_assert (TREE_CODE (type) == BOOLEAN_TYPE);
    }

  /* Varargs are passed via an extra pointer argument.  */
  if (stdarg_p (fntype))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  /* As is the static chain, if any.  */
  if (DECL_STATIC_CHAIN (decl))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  /* Pad out 'main' to its full (int, char **) signature.  */
  if (argno < 2 && strcmp (name, "main") == 0)
    {
      if (argno == 0)
	argno = write_arg_type (s, -1, argno, integer_type_node, true);

      if (argno == 1)
	argno = write_arg_type (s, -1, argno, ptr_type_node, true);
    }

  /* write_arg_type opened the paren on the first argument; close it
     only if any argument was emitted.  */
  if (argno)
    s << ")";

  s << (is_defn ? "\n" : ";\n");
}
   1082  1.1  mrg 
/* Write a .func or .kernel declaration or definition along with
   a helper comment for use by ld.  S is the stream to write to, DECL
   the decl for the function with name NAME.  For definitions, emit
   a declaration too.  */

static void
write_fn_proto (std::stringstream &s, bool is_defn,
		const char *name, const_tree decl)
{
  /* First try a target-specific name replacement; failing that,
     rewrite any dots in NAME (nvptx_replace_dot returns a freshly
     allocated string, or NULL if no dots were found).  */
  const char *replacement = nvptx_name_replacement (name);
  char *replaced_dots = NULL;
  if (replacement != name)
    name = replacement;
  else
    {
      replaced_dots = nvptx_replace_dot (name);
      if (replaced_dots)
	name = replaced_dots;
    }
  /* Skip a leading '*', which in GCC assembler names marks a name
     that should be emitted verbatim.  */
  if (name[0] == '*')
    name++;

  if (is_defn)
    /* Emit a declaration.  The PTX assembler gets upset without it.  */
    write_fn_proto_1 (s, false, name, decl);

  write_fn_proto_1 (s, is_defn, name, decl);

  /* Free the dot-replaced copy, if one was made.  */
  if (replaced_dots)
    XDELETE (replaced_dots);
}
   1114  1.1  mrg 
/* Construct a function declaration from a call insn.  This can be
   necessary for two reasons - either we have an indirect call which
   requires a .callprototype declaration, or we have a libcall
   generated by emit_library_call for which no decl exists.  */

static void
write_fn_proto_from_insn (std::stringstream &s, const char *name,
			  rtx result, rtx pat)
{
  char *replaced_dots = NULL;

  if (!name)
    {
      /* Indirect call: emit an anonymous .callprototype.  */
      s << "\t.callprototype ";
      name = "_";
    }
  else
    {
      /* Libcall: sanitize the name as write_fn_proto does and emit
	 an extern declaration.  */
      const char *replacement = nvptx_name_replacement (name);
      if (replacement != name)
	name = replacement;
      else
	{
	  replaced_dots = nvptx_replace_dot (name);
	  if (replaced_dots)
	    name = replaced_dots;
	}
      write_fn_marker (s, false, true, name);
      s << "\t.extern .func ";
    }

  if (result != NULL_RTX)
    write_return_mode (s, true, GET_MODE (result));

  s << name;
  if (replaced_dots)
    XDELETE (replaced_dots);

  /* Element 0 of PAT is the call itself; the remaining elements
     describe the arguments.  */
  int arg_end = XVECLEN (pat, 0);
  for (int i = 1; i < arg_end; i++)
    {
      /* We don't have to deal with mode splitting & promotion here,
	 as that was already done when generating the call
	 sequence.  */
      machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));

      write_arg_mode (s, -1, i - 1, mode);
    }
  /* write_arg_mode opened the paren on the first argument; close it
     only if there was at least one.  */
  if (arg_end != 1)
    s << ")";
  s << ";\n";
}
   1167  1.1  mrg 
/* DECL is an external FUNCTION_DECL, make sure its in the fndecl hash
   table and write a ptx prototype.  These are emitted at end of
   compilation.  */

static void
nvptx_record_fndecl (tree decl)
{
  /* INSERT creates the slot if absent; a NULL slot means this decl
     has not been emitted yet.  */
  tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
  if (*slot == NULL)
    {
      *slot = decl;
      const char *name = get_fnname_from_decl (decl);
      write_fn_proto (func_decls, false, name, decl);
    }
}
   1183  1.1  mrg 
/* Record a libcall or unprototyped external function. CALLEE is the
   SYMBOL_REF.  Insert into the libfunc hash table and emit a ptx
   declaration for it.  */

static void
nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
{
  /* Only emit the declaration the first time CALLEE is seen.  */
  rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
  if (*slot == NULL)
    {
      *slot = callee;

      const char *name = XSTR (callee, 0);
      write_fn_proto_from_insn (func_decls, name, retval, pat);
    }
}
   1200  1.1  mrg 
   1201  1.1  mrg /* DECL is an external FUNCTION_DECL, that we're referencing.  If it
   1202  1.1  mrg    is prototyped, record it now.  Otherwise record it as needed at end
   1203  1.1  mrg    of compilation, when we might have more information about it.  */
   1204  1.1  mrg 
   1205  1.1  mrg void
   1206  1.1  mrg nvptx_record_needed_fndecl (tree decl)
   1207  1.1  mrg {
   1208  1.1  mrg   if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
   1209  1.1  mrg     {
   1210  1.1  mrg       tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
   1211  1.1  mrg       if (*slot == NULL)
   1212  1.1  mrg 	*slot = decl;
   1213  1.1  mrg     }
   1214  1.1  mrg   else
   1215  1.1  mrg     nvptx_record_fndecl (decl);
   1216  1.1  mrg }
   1217  1.1  mrg 
   1218  1.1  mrg /* SYM is a SYMBOL_REF.  If it refers to an external function, record
   1219  1.1  mrg    it as needed.  */
   1220  1.1  mrg 
   1221  1.1  mrg static void
   1222  1.1  mrg nvptx_maybe_record_fnsym (rtx sym)
   1223  1.1  mrg {
   1224  1.1  mrg   tree decl = SYMBOL_REF_DECL (sym);
   1225  1.1  mrg 
   1226  1.1  mrg   if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
   1227  1.1  mrg     nvptx_record_needed_fndecl (decl);
   1228  1.1  mrg }
   1229  1.1  mrg 
/* Emit a local array to hold some part of a conventional stack frame
   and initialize REGNO to point to it.  If the size is zero, it'll
   never be valid to dereference, so we can simply initialize to
   zero.  */

static void
init_frame (FILE  *file, int regno, unsigned align, unsigned size)
{
  /* Backing store: an aligned .local byte array.  */
  if (size)
    fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
	     align, reg_names[regno], size);
  /* Pointer register declaration.  */
  fprintf (file, "\t.reg.u%d %s;\n",
	   POINTER_SIZE, reg_names[regno]);
  /* Point the register at the array via cvta.local, or at zero when
     there is no array.  */
  fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
		  :  "\tmov.u%d %s, 0;\n"),
	   POINTER_SIZE, reg_names[regno], reg_names[regno]);
}
   1247  1.1  mrg 
/* Emit soft stack frame setup sequence.  FILE receives the PTX text,
   ALIGNMENT is the requested frame alignment in bytes, SIZE the frame
   size in bytes.  Reads crtl->outgoing_args_size and crtl->is_leaf;
   sets cfun->machine->has_softstack and need_softstack_decl.  */

static void
init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
{
  /* Maintain 64-bit stack alignment.  */
  unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
  size = ROUND_UP (size, keep_align);
  int bits = POINTER_SIZE;
  const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
  const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
  const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
  const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
  /* Declare the four soft-stack registers, then compute them inside a
     brace-scoped block using %fstmp temporaries.  */
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
  fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
  fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
  /* %fstmp1 = tid.y * sizeof (pointer), the byte offset into the
     __nvptx_stacks array.  */
  fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
	   bits == 64 ? ".wide" : ".lo", bits / 8);
  fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);

  /* Initialize %sspslot = &__nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);

  /* Initialize %sspprev = __nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
	   bits, reg_sspprev, reg_sspslot);

  /* Initialize %frame = %sspprev - size.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
	   bits, reg_frame, reg_sspprev, size);

  /* Apply alignment, if larger than 64.  */
  if (alignment > keep_align)
    fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
	     bits, reg_frame, reg_frame, -alignment);

  size = crtl->outgoing_args_size;
  gcc_assert (size % keep_align == 0);

  /* Initialize %stack.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
	   bits, reg_stack, reg_frame, size);

  /* Publish the new stack top to the slot; leaf functions never make
     calls that would read it, so they can skip the store.  */
  if (!crtl->is_leaf)
    fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
	     bits, reg_sspslot, reg_stack);
  fprintf (file, "\t}\n");
  cfun->machine->has_softstack = true;
  need_softstack_decl = true;
}
   1304  1.1  mrg 
/* Emit code to initialize the REGNO predicate register to indicate
   whether we are not lane zero on the NAME axis.  For the "x" axis,
   when a vector reduction buffer exists, also compute this thread's
   slice of __vector_red into cfun->machine->red_partition.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
  if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
    {
      fprintf (file, "\t\t.reg.u64\t%%t_red;\n");
      fprintf (file, "\t\t.reg.u64\t%%y64;\n");
    }
  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
  /* Predicate is true iff tid.NAME != 0.  */
  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
  if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
    {
      /* red_partition = __vector_red + tid.y * vector_red_partition.  */
      fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n");
      fprintf (file, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n");
      fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; "
	       "// vector reduction buffer\n",
	       REGNO (cfun->machine->red_partition),
	       vector_red_partition);
    }
  /* Verify vector_red_size.  */
  gcc_assert (vector_red_partition * nvptx_mach_max_workers ()
	      <= vector_red_size);
  fprintf (file, "\t}\n");
}
   1334  1.1  mrg 
/* Emit code to initialize OpenACC worker broadcast and synchronization
   registers.  */

static void
nvptx_init_oacc_workers (FILE *file)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%tidy;\n");
  if (cfun->machine->bcast_partition)
    {
      fprintf (file, "\t\t.reg.u64\t%%t_bcast;\n");
      fprintf (file, "\t\t.reg.u64\t%%y64;\n");
    }
  fprintf (file, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n");
  if (cfun->machine->bcast_partition)
    {
      /* bcast_partition = __oacc_bcast
			   + (tid.y + 1) * oacc_bcast_partition.  */
      fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n");
      fprintf (file, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n");
      fprintf (file, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n");
      fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; "
	       "// vector broadcast offset\n",
	       REGNO (cfun->machine->bcast_partition),
	       oacc_bcast_partition);
    }
  /* Verify oacc_bcast_size.  */
  gcc_assert (oacc_bcast_partition * (nvptx_mach_max_workers () + 1)
	      <= oacc_bcast_size);
  /* Barrier index is tid.y + 1, so each vector gets its own barrier.  */
  if (cfun->machine->sync_bar)
    fprintf (file, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; "
	     "// vector synchronization barrier\n",
	     REGNO (cfun->machine->sync_bar));
  fprintf (file, "\t}\n");
}
   1368  1.1  mrg 
/* Emit code to initialize predicate and master lane index registers for
   -muniform-simt code generation variant.  Allocates a fresh pseudo for
   cfun->machine->unisimt_location and sets need_unisimt_decl.  */

static void
nvptx_init_unisimt_predicate (FILE *file)
{
  cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
  int loc = REGNO (cfun->machine->unisimt_location);
  int bits = POINTER_SIZE;
  fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
  /* Compute unisimt_location = &__nvptx_uni[tid.y] (4-byte slots).  */
  fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
	   bits == 64 ? ".wide" : ".lo");
  fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
  fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
      if (cfun->machine->unisimt_outside_simt_predicate)
	{
	  /* Outside a SIMT region the loaded mask is zero.  */
	  int pred_outside_simt
	    = REGNO (cfun->machine->unisimt_outside_simt_predicate);
	  fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, 0;\n",
		   pred_outside_simt, master);
	}
      fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
      /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'.  */
      fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
      /* Compute predicate as 'tid.x == master'.  */
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
    }
  fprintf (file, "\t}\n");
  need_unisimt_decl = true;
}
   1408  1.1  mrg 
   1409  1.1  mrg /* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
   1410  1.1  mrg 
   1411  1.1  mrg    extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
   1412  1.1  mrg    void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
   1413  1.1  mrg    {
   1414  1.1  mrg      __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
   1415  1.1  mrg      __nvptx_uni[tid.y] = 0;
   1416  1.1  mrg      gomp_nvptx_main (ORIG, arg);
   1417  1.1  mrg    }
   1418  1.1  mrg    ORIG itself should not be emitted as a PTX .entry function.  */
   1419  1.1  mrg 
static void
write_omp_entry (FILE *file, const char *name, const char *orig)
{
  /* Emit the .extern declaration of gomp_nvptx_main only once per file.  */
  static bool gomp_nvptx_main_declared;
  if (!gomp_nvptx_main_declared)
    {
      gomp_nvptx_main_declared = true;
      write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
      func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
        << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
    }
  /* PR79332.  Single out this string; it confuses gcc.pot generation.  */
#define NTID_Y "%ntid.y"
  /* The kernel body template, parameterized by pointer size PS (bits),
     PS_BYTES (bytes) and the mul-add mnemonic for a 32->PS multiply.
     Note the embedded '\0' on the "mov.u..PS.. %R0, " line: it splits the
     template into two C strings; the ORIG function name is printed in
     between (see ENTRY_1/ENTRY_2 below), so %R0 ends up holding &ORIG.  */
#define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
{\n\
	.reg.u32 %r<3>;\n\
	.reg.u" PS " %R<4>;\n\
	mov.u32 %r0, %tid.y;\n\
	mov.u32 %r1, " NTID_Y ";\n\
	mov.u32 %r2, %ctaid.x;\n\
	cvt.u" PS ".u32 %R1, %r0;\n\
	" MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
	mov.u" PS " %R0, __nvptx_stacks;\n\
	" MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
	ld.param.u" PS " %R2, [%stack];\n\
	ld.param.u" PS " %R3, [%sz];\n\
	add.u" PS " %R2, %R2, %R3;\n\
	mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
	st.shared.u" PS " [%R0], %R2;\n\
	mov.u" PS " %R0, __nvptx_uni;\n\
	" MAD_PS_32 " %R0, %r0, 4, %R0;\n\
	mov.u32 %r0, 0;\n\
	st.shared.u32 [%R0], %r0;\n\
	mov.u" PS " %R0, \0;\n\
	ld.param.u" PS " %R1, [%arg];\n\
	{\n\
		.param.u" PS " %P<2>;\n\
		st.param.u" PS " [%P0], %R0;\n\
		st.param.u" PS " [%P1], %R1;\n\
		call.uni gomp_nvptx_main, (%P0, %P1);\n\
	}\n\
	ret.uni;\n\
}\n"
  /* The 32-bit mad mnemonic is padded with spaces so that both
     expansions have the same prefix length up to the embedded nul;
     that is what lets 'strlen (entry64)' below locate the nul in
     either template.  */
  static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
  static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32  ");
#undef ENTRY_TEMPLATE
#undef NTID_Y
  const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
  /* Position ENTRY_2 after the embedded nul using strlen of the prefix.  */
  const char *entry_2 = entry_1 + strlen (entry64) + 1;
  fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
  need_softstack_decl = need_unisimt_decl = true;
}
   1474  1.1  mrg 
   1475  1.1  mrg /* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
   1476  1.1  mrg    function, including local var decls and copies from the arguments to
   1477  1.1  mrg    local regs.  */
   1478  1.1  mrg 
void
nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);
  int argno = 0;

  /* For an OpenMP target entrypoint (but not an OpenACC one), emit a
     wrapper kernel under NAME and emit the function itself under
     "NAME$impl" (see write_omp_entry).  */
  if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
      && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
    {
      char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
      sprintf (buf, "%s$impl", name);
      write_omp_entry (file, name, buf);
      name = buf;
    }
  /* We construct the initial part of the function into a string
     stream, in order to share the prototype writing code.  */
  std::stringstream s;
  write_fn_proto (s, true, name, decl);
  s << "{\n";

  bool return_in_mem = write_return_type (s, false, result_type);
  /* A memory return uses an extra leading pointer argument.  */
  if (return_in_mem)
    argno = write_arg_type (s, 0, argno, ptr_type_node, true);

  /* Declare and initialize incoming arguments.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      /* Unprototyped: fall back to the decl's argument list.  */
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args != NULL_TREE; args = TREE_CHAIN (args))
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      argno = write_arg_type (s, 0, argno, type, prototyped);
    }

  /* Varargs get a trailing pointer argument tied to the arg pointer.  */
  if (stdarg_p (fntype))
    argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
			    true);

  if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
    write_arg_type (s, STATIC_CHAIN_REGNUM,
		    DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
		    true);

  fprintf (file, "%s", s.str().c_str());

  /* Usually 'crtl->is_leaf' is computed during register allocator
     initialization (which is not done on NVPTX) or for pressure-sensitive
     optimizations.  Initialize it here, except if already set.  */
  if (!crtl->is_leaf)
    crtl->is_leaf = leaf_function_p ();

  HOST_WIDE_INT sz = get_frame_size ();
  bool need_frameptr = sz || cfun->machine->has_chain;
  int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
  if (!TARGET_SOFT_STACK)
    {
      /* Declare a local var for outgoing varargs.  */
      if (cfun->machine->has_varadic)
	init_frame (file, STACK_POINTER_REGNUM,
		    UNITS_PER_WORD, crtl->outgoing_args_size);

      /* Declare a local variable for the frame.  Force its size to be
	 DImode-compatible.  */
      if (need_frameptr)
	init_frame (file, FRAME_POINTER_REGNUM, alignment,
		    ROUND_UP (sz, GET_MODE_SIZE (DImode)));
    }
  else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
	   || (cfun->machine->has_simtreg && !crtl->is_leaf))
    init_softstack_frame (file, alignment, sz);

  if (cfun->machine->has_simtreg)
    {
      /* Size and align the %simtstack_ar array.  An unknown requirement
	 (all-ones) is replaced by the nvptx_softstack_size default; one
	 extra pointer-sized slot is reserved when a soft stack is also
	 in use (for saving the stack pointer — TODO confirm).  */
      unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
      unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
      align = MAX (align, GET_MODE_SIZE (DImode));
      if (!crtl->is_leaf || cfun->calls_alloca)
	simtsz = HOST_WIDE_INT_M1U;
      if (simtsz == HOST_WIDE_INT_M1U)
	simtsz = nvptx_softstack_size;
      if (cfun->machine->has_softstack)
	simtsz += POINTER_SIZE / 8;
      simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
      /* Leave slack so the start can be realigned to ALIGN.  */
      if (align > GET_MODE_SIZE (DImode))
	simtsz += align - GET_MODE_SIZE (DImode);
      if (simtsz)
	fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
		HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
    }

  /* Restore the vector reduction partition register, if necessary.
     FIXME: Find out when and why this is necessary, and fix it.  */
  if (cfun->machine->red_partition)
    regno_reg_rtx[REGNO (cfun->machine->red_partition)]
      = cfun->machine->red_partition;

  /* Declare the pseudos we have as ptx registers.  */
  int maxregs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
    {
      if (regno_reg_rtx[i] != const0_rtx)
	{
	  machine_mode mode = PSEUDO_REGNO_MODE (i);
	  machine_mode split = maybe_split_mode (mode);

	  if (split_mode_p (mode))
	    mode = split;
	  fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
	  output_reg (file, i, split, -2);
	  fprintf (file, ";\n");
	}
    }

  /* Emit axis predicates. */
  if (cfun->machine->axis_predicate[0])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[0]), "y");
  if (cfun->machine->axis_predicate[1])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[1]), "x");
  if (cfun->machine->unisimt_predicate
      || (cfun->machine->has_simtreg && !crtl->is_leaf))
    nvptx_init_unisimt_predicate (file);
  if (cfun->machine->bcast_partition || cfun->machine->sync_bar)
    nvptx_init_oacc_workers (file);
}
   1612  1.1  mrg 
   1613  1.1  mrg /* Output code for switching uniform-simt state.  ENTERING indicates whether
   1614  1.1  mrg    we are entering or leaving non-uniform execution region.  */
   1615  1.1  mrg 
static void
nvptx_output_unisimt_switch (FILE *file, bool entering)
{
  /* A leaf function that never tests the predicate has no state to
     switch.  */
  if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
    return;
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
  /* The new mask value: all-ones inside a SIMT region, zero outside.  */
  fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
  if (cfun->machine->unisimt_outside_simt_predicate)
    {
      int pred_outside_simt
	= REGNO (cfun->machine->unisimt_outside_simt_predicate);
      fprintf (file, "\t\tmov.pred %%r%d, %d;\n", pred_outside_simt,
	       entering ? 0 : 1);
    }
  if (!crtl->is_leaf)
    {
      /* Store the new mask at the shared-memory slot located by
	 nvptx_init_unisimt_predicate.  */
      int loc = REGNO (cfun->machine->unisimt_location);
      fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
    }
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      /* Recompute the master lane (own laneid when entering, lane 0 when
	 leaving) and the 'this lane is the master' predicate.  */
      fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
      fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
	       master, entering ? "%ustmp2" : "0");
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
    }
  fprintf (file, "\t}\n");
}
   1647  1.1  mrg 
   1648  1.1  mrg /* Output code for allocating per-lane storage and switching soft-stack pointer.
   1649  1.1  mrg    ENTERING indicates whether we are entering or leaving non-uniform execution.
   1650  1.1  mrg    PTR is the register pointing to allocated storage, it is assigned to on
   1651  1.1  mrg    entering and used to restore state on leaving.  SIZE and ALIGN are used only
   1652  1.1  mrg    on entering.  */
   1653  1.1  mrg 
static void
nvptx_output_softstack_switch (FILE *file, bool entering,
			       rtx ptr, rtx size, rtx align)
{
  gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
  /* Nothing to do for a leaf function with no SIMT stack area.  */
  if (crtl->is_leaf && !cfun->machine->simt_stack_size)
    return;
  int bits = POINTER_SIZE, regno = REGNO (ptr);
  fprintf (file, "\t{\n");
  if (entering)
    {
      /* PTR = generic address of the end of %simtstack_ar.  */
      fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
	       HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
	       cfun->machine->simt_stack_size);
      /* Carve out SIZE bytes below it (rounded up to a DImode multiple
	 when SIZE is a compile-time constant).  */
      fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
      if (CONST_INT_P (size))
	fprintf (file, HOST_WIDE_INT_PRINT_DEC,
		 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
      else
	output_reg (file, REGNO (size), VOIDmode);
      fputs (";\n", file);
      /* Mask the low bits off if the requested ALIGN exceeds what the
	 rounding above already guarantees (or SIZE was not constant).  */
      if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
	fprintf (file,
		 "\t\tand.b%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
		 bits, regno, regno, UINTVAL (align));
    }
  if (cfun->machine->has_softstack)
    {
      const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
      if (entering)
	{
	  /* Save the old soft-stack pointer just below the new area,
	     then make the stack pointer point at that save slot.  */
	  fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
		   bits, regno, bits / 8, reg_stack);
	  fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
		   bits, reg_stack, regno, bits / 8);
	}
      else
	{
	  /* Reload the soft-stack pointer saved on entry.  */
	  fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
		   bits, reg_stack, regno, bits / 8);
	}
      nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
    }
  fprintf (file, "\t}\n");
}
   1699  1.1  mrg 
   1700  1.1  mrg /* Output code to enter non-uniform execution region.  DEST is a register
   1701  1.1  mrg    to hold a per-lane allocation given by SIZE and ALIGN.  */
   1702  1.1  mrg 
   1703  1.1  mrg const char *
   1704  1.1  mrg nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
   1705  1.1  mrg {
   1706  1.1  mrg   nvptx_output_unisimt_switch (asm_out_file, true);
   1707  1.1  mrg   nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
   1708  1.1  mrg   return "";
   1709  1.1  mrg }
   1710  1.1  mrg 
   1711  1.1  mrg /* Output code to leave non-uniform execution region.  SRC is the register
   1712  1.1  mrg    holding per-lane storage previously allocated by omp_simt_enter insn.  */
   1713  1.1  mrg 
   1714  1.1  mrg const char *
   1715  1.1  mrg nvptx_output_simt_exit (rtx src)
   1716  1.1  mrg {
   1717  1.1  mrg   nvptx_output_unisimt_switch (asm_out_file, false);
   1718  1.1  mrg   nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
   1719  1.1  mrg   return "";
   1720  1.1  mrg }
   1721  1.1  mrg 
   1722  1.1  mrg /* Output instruction that sets soft stack pointer in shared memory to the
   1723  1.1  mrg    value in register given by SRC_REGNO.  */
   1724  1.1  mrg 
   1725  1.1  mrg const char *
   1726  1.1  mrg nvptx_output_set_softstack (unsigned src_regno)
   1727  1.1  mrg {
   1728  1.1  mrg   if (cfun->machine->has_softstack && !crtl->is_leaf)
   1729  1.1  mrg     {
   1730  1.1  mrg       fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
   1731  1.1  mrg 	       POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
   1732  1.1  mrg       output_reg (asm_out_file, src_regno, VOIDmode);
   1733  1.1  mrg       fprintf (asm_out_file, ";\n");
   1734  1.1  mrg     }
   1735  1.1  mrg   return "";
   1736  1.1  mrg }
   1737  1.1  mrg /* Output a return instruction.  Also copy the return value to its outgoing
   1738  1.1  mrg    location.  */
   1739  1.1  mrg 
   1740  1.1  mrg const char *
   1741  1.1  mrg nvptx_output_return (void)
   1742  1.1  mrg {
   1743  1.1  mrg   machine_mode mode = (machine_mode)cfun->machine->return_mode;
   1744  1.1  mrg 
   1745  1.1  mrg   if (mode != VOIDmode)
   1746  1.1  mrg     fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
   1747  1.1  mrg 	     nvptx_ptx_type_from_mode (mode, false),
   1748  1.1  mrg 	     reg_names[NVPTX_RETURN_REGNUM],
   1749  1.1  mrg 	     reg_names[NVPTX_RETURN_REGNUM]);
   1750  1.1  mrg 
   1751  1.1  mrg   return "ret;";
   1752  1.1  mrg }
   1753  1.1  mrg 
   1754  1.1  mrg /* Terminate a function by writing a closing brace to FILE.  */
   1755  1.1  mrg 
void
nvptx_function_end (FILE *file)
{
  /* Close the function body opened by nvptx_declare_function_name.  */
  fputs ("}\n", file);
}
   1761  1.1  mrg 
   1762  1.1  mrg /* Decide whether we can make a sibling call to a function.  For ptx, we
   1764  1.1  mrg    can't.  */
   1765  1.1  mrg 
   1766  1.1  mrg static bool
   1767  1.1  mrg nvptx_function_ok_for_sibcall (tree, tree)
   1768  1.1  mrg {
   1769  1.1  mrg   return false;
   1770  1.1  mrg }
   1771  1.1  mrg 
   1772  1.1  mrg /* Return Dynamic ReAlignment Pointer RTX.  For PTX there isn't any.  */
   1773  1.1  mrg 
   1774  1.1  mrg static rtx
   1775  1.1  mrg nvptx_get_drap_rtx (void)
   1776  1.1  mrg {
   1777  1.1  mrg   if (TARGET_SOFT_STACK && stack_realign_drap)
   1778  1.1  mrg     return arg_pointer_rtx;
   1779  1.1  mrg   return NULL_RTX;
   1780  1.1  mrg }
   1781  1.1  mrg 
   1782  1.1  mrg /* Implement the TARGET_CALL_ARGS hook.  Record information about one
   1783  1.1  mrg    argument to the next call.  */
   1784  1.1  mrg 
   1785  1.1  mrg static void
   1786  1.1  mrg nvptx_call_args (rtx arg, tree fntype)
   1787  1.1  mrg {
   1788  1.1  mrg   if (!cfun->machine->doing_call)
   1789  1.1  mrg     {
   1790  1.1  mrg       cfun->machine->doing_call = true;
   1791  1.1  mrg       cfun->machine->is_varadic = false;
   1792  1.1  mrg       cfun->machine->num_args = 0;
   1793  1.1  mrg 
   1794  1.1  mrg       if (fntype && stdarg_p (fntype))
   1795  1.1  mrg 	{
   1796  1.1  mrg 	  cfun->machine->is_varadic = true;
   1797  1.1  mrg 	  cfun->machine->has_varadic = true;
   1798  1.1  mrg 	  cfun->machine->num_args++;
   1799  1.1  mrg 	}
   1800  1.1  mrg     }
   1801  1.1  mrg 
   1802  1.1  mrg   if (REG_P (arg) && arg != pc_rtx)
   1803  1.1  mrg     {
   1804  1.1  mrg       cfun->machine->num_args++;
   1805  1.1  mrg       cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
   1806  1.1  mrg 						  cfun->machine->call_args);
   1807  1.1  mrg     }
   1808  1.1  mrg }
   1809  1.1  mrg 
   1810  1.1  mrg /* Implement the corresponding END_CALL_ARGS hook.  Clear and free the
   1811  1.1  mrg    information we recorded.  */
   1812  1.1  mrg 
   1813  1.1  mrg static void
   1814  1.1  mrg nvptx_end_call_args (void)
   1815  1.1  mrg {
   1816  1.1  mrg   cfun->machine->doing_call = false;
   1817  1.1  mrg   free_EXPR_LIST_list (&cfun->machine->call_args);
   1818  1.1  mrg }
   1819  1.1  mrg 
   1820  1.1  mrg /* Emit the sequence for a call to ADDRESS, setting RETVAL.  Keep
   1821  1.1  mrg    track of whether calls involving static chains or varargs were seen
   1822  1.1  mrg    in the current function.
   1823  1.1  mrg    For libcalls, maintain a hash table of decls we have seen, and
   1824  1.1  mrg    record a function decl for later when encountering a new one.  */
   1825  1.1  mrg 
void
nvptx_expand_call (rtx retval, rtx address)
{
  rtx callee = XEXP (address, 0);
  rtx varargs = NULL_RTX;
  unsigned parallel = 0;

  /* An indirect call goes through a register.  */
  if (!call_insn_operand (callee, Pmode))
    {
      callee = force_reg (Pmode, callee);
      address = change_address (address, QImode, callee);
    }

  if (GET_CODE (callee) == SYMBOL_REF)
    {
      tree decl = SYMBOL_REF_DECL (callee);
      if (decl != NULL_TREE)
	{
	  /* Record static-chain usage for the prologue writer.  */
	  if (DECL_STATIC_CHAIN (decl))
	    cfun->machine->has_chain = true;

	  tree attr = oacc_get_fn_attrib (decl);
	  if (attr)
	    {
	      tree dims = TREE_VALUE (attr);

	      /* Compute PARALLEL, a mask of axes: start with every axis
		 set, then clear the leading axes whose recorded
		 dimension is zero.  */
	      parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
	      for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
		{
		  if (TREE_PURPOSE (dims)
		      && !integer_zerop (TREE_PURPOSE (dims)))
		    break;
		  /* Not on this axis.  */
		  parallel ^= GOMP_DIM_MASK (ix);
		  dims = TREE_CHAIN (dims);
		}
	    }
	}
    }

  unsigned nargs = cfun->machine->num_args;
  if (cfun->machine->is_varadic)
    {
      /* The implicit trailing argument is a copy of the stack
	 pointer.  */
      varargs = gen_reg_rtx (Pmode);
      emit_move_insn (varargs, stack_pointer_rtx);
    }

  /* NARGS already accounts for the varargs slot (see nvptx_call_args),
     so the PARALLEL holds the call itself plus NARGS USEs.  */
  rtvec vec = rtvec_alloc (nargs + 1);
  rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
  int vec_pos = 0;

  rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
  rtx tmp_retval = retval;
  if (retval)
    {
      /* If RETVAL is not directly usable as a ptx register, set a fresh
	 pseudo instead and copy it back after the call.  */
      if (!nvptx_register_operand (retval, GET_MODE (retval)))
	tmp_retval = gen_reg_rtx (GET_MODE (retval));
      call = gen_rtx_SET (tmp_retval, call);
    }
  XVECEXP (pat, 0, vec_pos++) = call;

  /* Construct the call insn, including a USE for each argument pseudo
     register.  These will be used when printing the insn.  */
  for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
    XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));

  if (varargs)
    XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);

  gcc_assert (vec_pos == XVECLEN (pat, 0));

  /* Bracket the call with forking/joining markers for the PARALLEL
     axes computed above.  */
  nvptx_emit_forking (parallel, true);
  emit_call_insn (pat);
  nvptx_emit_joining (parallel, true);

  if (tmp_retval != retval)
    emit_move_insn (retval, tmp_retval);
}
   1904  1.1  mrg 
   1905  1.1  mrg /* Emit a comparison COMPARE, and return the new test to be used in the
   1906  1.1  mrg    jump.  */
   1907  1.1  mrg 
   1908  1.1  mrg rtx
   1909  1.1  mrg nvptx_expand_compare (rtx compare)
   1910  1.1  mrg {
   1911  1.1  mrg   rtx pred = gen_reg_rtx (BImode);
   1912  1.1  mrg   rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
   1913  1.1  mrg 			    XEXP (compare, 0), XEXP (compare, 1));
   1914  1.1  mrg   emit_insn (gen_rtx_SET (pred, cmp));
   1915  1.1  mrg   return gen_rtx_NE (BImode, pred, const0_rtx);
   1916  1.1  mrg }
   1917  1.1  mrg 
   1918  1.1  mrg /* Expand the oacc fork & join primitive into ptx-required unspecs.  */
   1919  1.1  mrg 
   1920  1.1  mrg void
   1921  1.1  mrg nvptx_expand_oacc_fork (unsigned mode)
   1922  1.1  mrg {
   1923  1.1  mrg   nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
   1924  1.1  mrg }
   1925  1.1  mrg 
   1926  1.1  mrg void
   1927  1.1  mrg nvptx_expand_oacc_join (unsigned mode)
   1928  1.1  mrg {
   1929  1.1  mrg   nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
   1930  1.1  mrg }
   1931  1.1  mrg 
   1932  1.1  mrg /* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
   1933  1.1  mrg    objects.  */
   1934  1.1  mrg 
   1935  1.1  mrg static rtx
   1936  1.1  mrg nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
   1937  1.1  mrg {
   1938  1.1  mrg   rtx res;
   1939  1.1  mrg 
   1940  1.1  mrg   switch (GET_MODE (src))
   1941  1.1  mrg     {
   1942  1.1  mrg     case E_DImode:
   1943  1.1  mrg       res = gen_unpackdisi2 (dst0, dst1, src);
   1944  1.1  mrg       break;
   1945  1.1  mrg     case E_DFmode:
   1946  1.1  mrg       res = gen_unpackdfsi2 (dst0, dst1, src);
   1947  1.1  mrg       break;
   1948  1.1  mrg     default: gcc_unreachable ();
   1949  1.1  mrg     }
   1950  1.1  mrg   return res;
   1951  1.1  mrg }
   1952  1.1  mrg 
   1953  1.1  mrg /* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
   1954  1.1  mrg    object.  */
   1955  1.1  mrg 
   1956  1.1  mrg static rtx
   1957  1.1  mrg nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
   1958  1.1  mrg {
   1959  1.1  mrg   rtx res;
   1960  1.1  mrg 
   1961  1.1  mrg   switch (GET_MODE (dst))
   1962  1.1  mrg     {
   1963  1.1  mrg     case E_DImode:
   1964  1.1  mrg       res = gen_packsidi2 (dst, src0, src1);
   1965  1.1  mrg       break;
   1966  1.1  mrg     case E_DFmode:
   1967  1.1  mrg       res = gen_packsidf2 (dst, src0, src1);
   1968  1.1  mrg       break;
   1969  1.1  mrg     default: gcc_unreachable ();
   1970  1.1  mrg     }
   1971  1.1  mrg   return res;
   1972  1.1  mrg }
   1973  1.1  mrg 
   1974  1.1  mrg /* Generate an instruction or sequence to broadcast register REG
   1975  1.1  mrg    across the vectors of a single warp.  */
   1976  1.1  mrg 
   1977  1.1  mrg rtx
   1978  1.1  mrg nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
   1979  1.1  mrg {
   1980  1.1  mrg   rtx res;
   1981  1.1  mrg 
   1982  1.1  mrg   switch (GET_MODE (dst))
   1983  1.1  mrg     {
   1984  1.1  mrg       case E_DCmode:
   1985  1.1  mrg       case E_CDImode:
   1986  1.1  mrg 	{
   1987  1.1  mrg 	  gcc_assert (GET_CODE (dst) == CONCAT);
   1988  1.1  mrg 	  gcc_assert (GET_CODE (src) == CONCAT);
   1989  1.1  mrg 	  rtx dst_real = XEXP (dst, 0);
   1990  1.1  mrg 	  rtx dst_imag = XEXP (dst, 1);
   1991  1.1  mrg 	  rtx src_real = XEXP (src, 0);
   1992  1.1  mrg 	  rtx src_imag = XEXP (src, 1);
   1993  1.1  mrg 
   1994  1.1  mrg 	  start_sequence ();
   1995  1.1  mrg 	  emit_insn (nvptx_gen_shuffle (dst_real, src_real, idx, kind));
   1996  1.1  mrg 	  emit_insn (nvptx_gen_shuffle (dst_imag, src_imag, idx, kind));
   1997  1.1  mrg 	  res = get_insns ();
   1998  1.1  mrg 	  end_sequence ();
   1999  1.1  mrg 	}
   2000  1.1  mrg 	break;
   2001  1.1  mrg     case E_SImode:
   2002  1.1  mrg       res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
   2003  1.1  mrg       break;
   2004  1.1  mrg     case E_SFmode:
   2005  1.1  mrg       res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
   2006  1.1  mrg       break;
   2007  1.1  mrg     case E_DImode:
   2008  1.1  mrg     case E_DFmode:
   2009  1.1  mrg       {
   2010  1.1  mrg 	rtx tmp0 = gen_reg_rtx (SImode);
   2011  1.1  mrg 	rtx tmp1 = gen_reg_rtx (SImode);
   2012  1.1  mrg 
   2013  1.1  mrg 	start_sequence ();
   2014  1.1  mrg 	emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
   2015  1.1  mrg 	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
   2016  1.1  mrg 	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
   2017  1.1  mrg 	emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
   2018  1.1  mrg 	res = get_insns ();
   2019  1.1  mrg 	end_sequence ();
   2020  1.1  mrg       }
   2021  1.1  mrg       break;
   2022  1.1  mrg     case E_V2SImode:
   2023  1.1  mrg       {
   2024  1.1  mrg 	rtx src0 = gen_rtx_SUBREG (SImode, src, 0);
   2025  1.1  mrg 	rtx src1 = gen_rtx_SUBREG (SImode, src, 4);
   2026  1.1  mrg 	rtx dst0 = gen_rtx_SUBREG (SImode, dst, 0);
   2027  1.1  mrg 	rtx dst1 = gen_rtx_SUBREG (SImode, dst, 4);
   2028  1.1  mrg 	rtx tmp0 = gen_reg_rtx (SImode);
   2029  1.1  mrg 	rtx tmp1 = gen_reg_rtx (SImode);
   2030  1.1  mrg 	start_sequence ();
   2031  1.1  mrg 	emit_insn (gen_movsi (tmp0, src0));
   2032  1.1  mrg 	emit_insn (gen_movsi (tmp1, src1));
   2033  1.1  mrg 	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
   2034  1.1  mrg 	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
   2035  1.1  mrg 	emit_insn (gen_movsi (dst0, tmp0));
   2036  1.1  mrg 	emit_insn (gen_movsi (dst1, tmp1));
   2037  1.1  mrg 	res = get_insns ();
   2038  1.1  mrg 	end_sequence ();
   2039  1.1  mrg       }
   2040  1.1  mrg       break;
   2041  1.1  mrg     case E_V2DImode:
   2042  1.1  mrg       {
   2043  1.1  mrg 	rtx src0 = gen_rtx_SUBREG (DImode, src, 0);
   2044  1.1  mrg 	rtx src1 = gen_rtx_SUBREG (DImode, src, 8);
   2045  1.1  mrg 	rtx dst0 = gen_rtx_SUBREG (DImode, dst, 0);
   2046  1.1  mrg 	rtx dst1 = gen_rtx_SUBREG (DImode, dst, 8);
   2047  1.1  mrg 	rtx tmp0 = gen_reg_rtx (DImode);
   2048  1.1  mrg 	rtx tmp1 = gen_reg_rtx (DImode);
   2049  1.1  mrg 	start_sequence ();
   2050  1.1  mrg 	emit_insn (gen_movdi (tmp0, src0));
   2051  1.1  mrg 	emit_insn (gen_movdi (tmp1, src1));
   2052  1.1  mrg 	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
   2053  1.1  mrg 	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
   2054  1.1  mrg 	emit_insn (gen_movdi (dst0, tmp0));
   2055  1.1  mrg 	emit_insn (gen_movdi (dst1, tmp1));
   2056  1.1  mrg 	res = get_insns ();
   2057  1.1  mrg 	end_sequence ();
   2058  1.1  mrg       }
   2059  1.1  mrg       break;
   2060  1.1  mrg     case E_BImode:
   2061  1.1  mrg       {
   2062  1.1  mrg 	rtx tmp = gen_reg_rtx (SImode);
   2063  1.1  mrg 
   2064  1.1  mrg 	start_sequence ();
   2065  1.1  mrg 	emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
   2066  1.1  mrg 	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
   2067  1.1  mrg 	emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
   2068  1.1  mrg 	res = get_insns ();
   2069  1.1  mrg 	end_sequence ();
   2070  1.1  mrg       }
   2071  1.1  mrg       break;
   2072  1.1  mrg     case E_QImode:
   2073  1.1  mrg     case E_HImode:
   2074  1.1  mrg       {
   2075  1.1  mrg 	rtx tmp = gen_reg_rtx (SImode);
   2076  1.1  mrg 
   2077  1.1  mrg 	start_sequence ();
   2078  1.1  mrg 	emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
   2079  1.1  mrg 	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
   2080  1.1  mrg 	emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
   2081  1.1  mrg 						    tmp)));
   2082  1.1  mrg 	res = get_insns ();
   2083  1.1  mrg 	end_sequence ();
   2084  1.1  mrg       }
   2085  1.1  mrg       break;
   2086  1.1  mrg 
   2087  1.1  mrg     default:
   2088  1.1  mrg       gcc_unreachable ();
   2089  1.1  mrg     }
   2090  1.1  mrg   return res;
   2091  1.1  mrg }
   2092  1.1  mrg 
/* Generate an instruction or sequence to broadcast register REG
   across the vectors of a single warp.  Implemented as an idx-kind
   shuffle with lane index 0, i.e. every lane reads lane 0's value.  */

static rtx
nvptx_gen_warp_bcast (rtx reg)
{
  return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
}
   2101  1.1  mrg 
/* Structure used when generating a worker-level spill or fill.
   Carries the addressing state threaded through successive calls to
   nvptx_gen_shared_bcast for one propagation.  */

struct broadcast_data_t
{
  rtx base;  /* Register holding base addr of buffer.  */
  rtx ptr;  /* Iteration var, if a looping spill/fill is needed.  */
  unsigned offset; /* Running byte offset into the worker buffer.  */
};
   2110  1.1  mrg 
/* Direction of the spill/fill and looping setup/teardown indicator.
   These are bit flags; PM_read and PM_write may be combined.  */

enum propagate_mask
  {
    PM_read = 1 << 0,		/* Spill: read the register, store it.  */
    PM_write = 1 << 1,		/* Fill: load and write the register.  */
    PM_loop_begin = 1 << 2,	/* Loop setup marker (not consumed in this
				   part of the file).  */
    PM_loop_end = 1 << 3,	/* Loop teardown marker (likewise).  */

    PM_read_write = PM_read | PM_write
  };
   2122  1.1  mrg 
/* Generate instruction(s) to spill or fill register REG to/from the
   worker broadcast array.  PM indicates what is to be done, REP
   how many loop iterations will be executed (0 for not a loop).
   DATA carries the buffer base/pointer and running offset, which this
   function updates.  VECTOR is merely forwarded on the BImode
   recursion here.  */

static rtx
nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep,
			broadcast_data_t *data, bool vector)
{
  rtx  res;
  machine_mode mode = GET_MODE (reg);

  switch (mode)
    {
    case E_BImode:
      {
	/* A BImode predicate cannot be stored directly: widen it to an
	   SImode 0/1 value, recurse to spill/fill that, and on fill
	   compare back against zero to recover the predicate.  */
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	if (pm & PM_read)
	  emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
	emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector));
	if (pm & PM_write)
	  emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
	res = get_insns ();
	end_sequence ();
      }
      break;

    default:
      {
	rtx addr = data->ptr;

	if (!addr)
	  {
	    /* No iteration pointer: address the buffer directly at
	       DATA->offset, first aligning the offset for MODE.  */
	    unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;

	    oacc_bcast_align = MAX (oacc_bcast_align, align);
	    data->offset = ROUND_UP (data->offset, align);
	    addr = data->base;
	    gcc_assert (data->base != NULL);
	    if (data->offset)
	      addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
	  }

	addr = gen_rtx_MEM (mode, addr);
	if (pm == PM_read)
	  res = gen_rtx_SET (addr, reg);	/* Spill: buffer <- REG.  */
	else if (pm == PM_write)
	  res = gen_rtx_SET (reg, addr);	/* Fill: REG <- buffer.  */
	else
	  gcc_unreachable ();

	if (data->ptr)
	  {
	    /* We're using a ptr, increment it.  */
	    start_sequence ();

	    emit_insn (res);
	    emit_insn (gen_adddi3 (data->ptr, data->ptr,
				   GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
	    res = get_insns ();
	    end_sequence ();
	  }
	else
	  rep = 1;
	/* Account for the REP slots this register occupies.  */
	data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
      }
      break;
    }
  return res;
}
   2194  1.1  mrg 
   2195  1.1  mrg /* Returns true if X is a valid address for use in a memory reference.  */
   2197  1.1  mrg 
   2198  1.1  mrg static bool
   2199  1.1  mrg nvptx_legitimate_address_p (machine_mode, rtx x, bool)
   2200  1.1  mrg {
   2201  1.1  mrg   enum rtx_code code = GET_CODE (x);
   2202  1.1  mrg 
   2203  1.1  mrg   switch (code)
   2204  1.1  mrg     {
   2205  1.1  mrg     case REG:
   2206  1.1  mrg       return true;
   2207  1.1  mrg 
   2208  1.1  mrg     case PLUS:
   2209  1.1  mrg       if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
   2210  1.1  mrg 	return true;
   2211  1.1  mrg       return false;
   2212  1.1  mrg 
   2213  1.1  mrg     case CONST:
   2214  1.1  mrg     case SYMBOL_REF:
   2215  1.1  mrg     case LABEL_REF:
   2216  1.1  mrg       return true;
   2217  1.1  mrg 
   2218  1.1  mrg     default:
   2219  1.1  mrg       return false;
   2220  1.1  mrg     }
   2221  1.1  mrg }
   2222  1.1  mrg 
/* Machinery to output constant initializers.  When beginning an
   initializer, we decide on a fragment size (which is visible in ptx
   in the type used), and then all initializer data is buffered until
   a fragment is filled and ready to be written out.  */

static struct
{
  unsigned HOST_WIDE_INT mask; /* Mask for storing fragment.  */
  unsigned HOST_WIDE_INT val; /* Current fragment value.  */
  unsigned HOST_WIDE_INT remaining; /* Remaining fragments to be written
				       out (also the emitted array
				       dimension).  */
  unsigned size;  /* Fragment size to accumulate.  */
  unsigned offset;  /* Byte offset within current fragment.  */
  bool started;   /* Whether we've output any initializer.  */
} init_frag;
   2239  1.1  mrg 
   2240  1.1  mrg /* The current fragment is full,  write it out.  SYM may provide a
   2241  1.1  mrg    symbolic reference we should output,  in which case the fragment
   2242  1.1  mrg    value is the addend.  */
   2243  1.1  mrg 
   2244  1.1  mrg static void
   2245  1.1  mrg output_init_frag (rtx sym)
   2246  1.1  mrg {
   2247  1.1  mrg   fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
   2248  1.1  mrg   unsigned HOST_WIDE_INT val = init_frag.val;
   2249  1.1  mrg 
   2250  1.1  mrg   init_frag.started = true;
   2251  1.1  mrg   init_frag.val = 0;
   2252  1.1  mrg   init_frag.offset = 0;
   2253  1.1  mrg   init_frag.remaining--;
   2254  1.1  mrg 
   2255  1.1  mrg   if (sym)
   2256  1.1  mrg     {
   2257  1.1  mrg       bool function = (SYMBOL_REF_DECL (sym)
   2258  1.1  mrg 		       && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
   2259  1.1  mrg       if (!function)
   2260  1.1  mrg 	fprintf (asm_out_file, "generic(");
   2261  1.1  mrg       output_address (VOIDmode, sym);
   2262  1.1  mrg       if (!function)
   2263  1.1  mrg 	fprintf (asm_out_file, ")");
   2264  1.1  mrg       if (val)
   2265  1.1  mrg 	fprintf (asm_out_file, " + ");
   2266  1.1  mrg     }
   2267  1.1  mrg 
   2268  1.1  mrg   if (!sym || val)
   2269  1.1  mrg     fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
   2270  1.1  mrg }
   2271  1.1  mrg 
/* Add value VAL of size SIZE to the data we're emitting, and keep
   writing out chunks as they fill up.  */

static void
nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
{
  /* Remember the top bit so a full-width "shift" below can
     sign-extend instead of invoking undefined behaviour.  */
  bool negative_p
    = val & (HOST_WIDE_INT_1U << (HOST_BITS_PER_WIDE_INT - 1));

  /* Avoid undefined behaviour.  */
  if (size * BITS_PER_UNIT < HOST_BITS_PER_WIDE_INT)
    val &= (HOST_WIDE_INT_1U << (size * BITS_PER_UNIT)) - 1;

  /* PART is the byte count consumed by the previous iteration
     (zero on entry); each pass shifts those bytes out of VAL first.  */
  for (unsigned part = 0; size; size -= part)
    {
      if (part * BITS_PER_UNIT == HOST_BITS_PER_WIDE_INT)
	/* Avoid undefined behaviour.  */
	val = negative_p ? -1 : 0;
      else
	val >>= (part * BITS_PER_UNIT);
      /* Take as many bytes as both the remaining input and the
	 current fragment's free space allow.  */
      part = init_frag.size - init_frag.offset;
      part = MIN (part, size);

      /* Merge the bytes into the fragment at its current offset.  */
      unsigned HOST_WIDE_INT partial
	= val << (init_frag.offset * BITS_PER_UNIT);
      init_frag.val |= partial & init_frag.mask;
      init_frag.offset += part;

      /* Flush the fragment once it is full.  */
      if (init_frag.offset == init_frag.size)
	output_init_frag (NULL);
    }
}
   2304  1.1  mrg 
/* Target hook for assembling integer object X of size SIZE.  Returns
   true if handled here, false to defer to the generic machinery.  */

static bool
nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
{
  HOST_WIDE_INT val = 0;

  switch (GET_CODE (x))
    {
    default:
      /* Let the generic machinery figure it out, usually for a
	 CONST_WIDE_INT.  */
      return false;

    case CONST_INT:
      nvptx_assemble_value (INTVAL (x), size);
      break;

    case CONST:
      /* (const (plus (symbol_ref) (const_int))): peel off the addend
	 into VAL and fall through to emit the symbol.  */
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == PLUS);
      val = INTVAL (XEXP (x, 1));
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == SYMBOL_REF);
      gcc_fallthrough (); /* FALLTHROUGH */

    case SYMBOL_REF:
      /* A pointer must exactly fill one fragment, aligned on a
	 fragment boundary.  */
      gcc_assert (size == init_frag.size);
      if (init_frag.offset)
	sorry ("cannot emit unaligned pointers in ptx assembly");

      nvptx_maybe_record_fnsym (x);
      init_frag.val = val;	/* The addend (zero for a bare symbol).  */
      output_init_frag (x);
      break;
    }

  return true;
}
   2344  1.1  mrg 
/* Output SIZE zero bytes.  We ignore the FILE argument since the
   functions we're calling to perform the output just use
   asm_out_file.  */

void
nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
{
  /* Finish the current fragment, if it's started.  */
  if (init_frag.offset)
    {
      unsigned part = init_frag.size - init_frag.offset;
      part = MIN (part, (unsigned)size);
      size -= part;
      nvptx_assemble_value (0, part);
    }

  /* If this skip doesn't terminate the initializer, write as many
     remaining pieces as possible directly.  (If it does terminate it,
     the trailing zeros are implicit and nothing need be emitted.)  */
  if (size < init_frag.remaining * init_frag.size)
    {
      /* Whole zero fragments can bypass the byte accumulator.  */
      while (size >= init_frag.size)
	{
	  size -= init_frag.size;
	  output_init_frag (NULL_RTX);
	}
      /* Any leftover tail is smaller than one fragment.  */
      if (size)
	nvptx_assemble_value (0, size);
    }
}
   2374  1.1  mrg 
   2375  1.1  mrg /* Output a string STR with length SIZE.  As in nvptx_output_skip we
   2376  1.1  mrg    ignore the FILE arg.  */
   2377  1.1  mrg 
   2378  1.1  mrg void
   2379  1.1  mrg nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
   2380  1.1  mrg {
   2381  1.1  mrg   for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
   2382  1.1  mrg     nvptx_assemble_value (str[i], 1);
   2383  1.1  mrg }
   2384  1.1  mrg 
   2385  1.1  mrg /* Return true if TYPE is a record type where the last field is an array without
   2386  1.1  mrg    given dimension.  */
   2387  1.1  mrg 
   2388  1.1  mrg static bool
   2389  1.1  mrg flexible_array_member_type_p (const_tree type)
   2390  1.1  mrg {
   2391  1.1  mrg   if (TREE_CODE (type) != RECORD_TYPE)
   2392  1.1  mrg     return false;
   2393  1.1  mrg 
   2394  1.1  mrg   const_tree last_field = NULL_TREE;
   2395  1.1  mrg   for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
   2396  1.1  mrg     last_field = f;
   2397  1.1  mrg 
   2398  1.1  mrg   if (!last_field)
   2399  1.1  mrg     return false;
   2400  1.1  mrg 
   2401  1.1  mrg   const_tree last_field_type = TREE_TYPE (last_field);
   2402  1.1  mrg   if (TREE_CODE (last_field_type) != ARRAY_TYPE)
   2403  1.1  mrg     return false;
   2404  1.1  mrg 
   2405  1.1  mrg   return (! TYPE_DOMAIN (last_field_type)
   2406  1.1  mrg 	  || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
   2407  1.1  mrg }
   2408  1.1  mrg 
/* Emit a PTX variable decl and prepare for emission of its
   initializer.  NAME is the symbol name and SECTION the PTX data
   area. The type is TYPE, object size SIZE and alignment is ALIGN.
   The caller has already emitted any indentation and linkage
   specifier.  It is responsible for any initializer, terminating ;
   and newline.  SIZE is in bytes, ALIGN is in bits -- confusingly
   this is the opposite way round that PTX wants them!  */

static void
nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
			   const_tree type, HOST_WIDE_INT size, unsigned align,
			   bool undefined = false)
{
  /* An incomplete array (no domain) is emitted with an empty "[]"
     dimension.  */
  bool atype = (TREE_CODE (type) == ARRAY_TYPE)
    && (TYPE_DOMAIN (type) == NULL_TREE);

  /* An undefined record ending in a flexible array member gets the
     same treatment, with no sized payload.  */
  if (undefined && flexible_array_member_type_p (type))
    {
      size = 0;
      atype = true;
    }

  /* Strip array dimensions down to the element type.  */
  while (TREE_CODE (type) == ARRAY_TYPE)
    type = TREE_TYPE (type);

  if (TREE_CODE (type) == VECTOR_TYPE
      || TREE_CODE (type) == COMPLEX_TYPE)
    /* Neither vector nor complex types can contain the other.  */
    type = TREE_TYPE (type);

  unsigned HOST_WIDE_INT elt_size = int_size_in_bytes (type);

  /* Largest mode we're prepared to accept.  For BLKmode types we
     don't know if it'll contain pointer constants, so have to choose
     pointer size, otherwise we can choose DImode.  */
  machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;

  /* Reduce ELT_SIZE to a power of two no larger than ELT_MODE's
     size, by taking the lowest set bit.  */
  elt_size |= GET_MODE_SIZE (elt_mode);
  elt_size &= -elt_size; /* Extract LSB set.  */

  init_frag.size = elt_size;
  /* Avoid undefined shift behavior by using '2'.  */
  init_frag.mask = ((unsigned HOST_WIDE_INT)2
		    << (elt_size * BITS_PER_UNIT - 1)) - 1;
  init_frag.val = 0;
  init_frag.offset = 0;
  init_frag.started = false;
  /* Size might not be a multiple of elt size, if there's an
     initialized trailing struct array with smaller type than
     elt_size. */
  init_frag.remaining = (size + elt_size - 1) / elt_size;

  fprintf (file, "%s .align %d .u" HOST_WIDE_INT_PRINT_UNSIGNED " ",
	   section, align / BITS_PER_UNIT,
	   elt_size * BITS_PER_UNIT);
  assemble_name (file, name);

  if (size)
    /* We make everything an array, to simplify any initialization
       emission.  */
    fprintf (file, "[" HOST_WIDE_INT_PRINT_UNSIGNED "]", init_frag.remaining);
  else if (atype)
    fprintf (file, "[]");
}
   2473  1.1  mrg 
   2474  1.1  mrg /* Called when the initializer for a decl has been completely output through
   2475  1.1  mrg    combinations of the three functions above.  */
   2476  1.1  mrg 
   2477  1.1  mrg static void
   2478  1.1  mrg nvptx_assemble_decl_end (void)
   2479  1.1  mrg {
   2480  1.1  mrg   if (init_frag.offset)
   2481  1.1  mrg     /* This can happen with a packed struct with trailing array member.  */
   2482  1.1  mrg     nvptx_assemble_value (0, init_frag.size - init_frag.offset);
   2483  1.1  mrg   fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
   2484  1.1  mrg }
   2485  1.1  mrg 
   2486  1.1  mrg /* Output an uninitialized common or file-scope variable.  */
   2487  1.1  mrg 
   2488  1.1  mrg void
   2489  1.1  mrg nvptx_output_aligned_decl (FILE *file, const char *name,
   2490  1.1  mrg 			   const_tree decl, HOST_WIDE_INT size, unsigned align)
   2491  1.1  mrg {
   2492  1.1  mrg   write_var_marker (file, true, TREE_PUBLIC (decl), name);
   2493  1.1  mrg 
   2494  1.1  mrg   /* If this is public, it is common.  The nearest thing we have to
   2495  1.1  mrg      common is weak.  */
   2496  1.1  mrg   fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
   2497  1.1  mrg 
   2498  1.1  mrg   nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
   2499  1.1  mrg 			     TREE_TYPE (decl), size, align);
   2500  1.1  mrg   nvptx_assemble_decl_end ();
   2501  1.1  mrg }
   2502  1.1  mrg 
   2503  1.1  mrg /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME.  Begin the process of
   2504  1.1  mrg    writing a constant variable EXP with NAME and SIZE and its
   2505  1.1  mrg    initializer to FILE.  */
   2506  1.1  mrg 
   2507  1.1  mrg static void
   2508  1.1  mrg nvptx_asm_declare_constant_name (FILE *file, const char *name,
   2509  1.1  mrg 				 const_tree exp, HOST_WIDE_INT obj_size)
   2510  1.1  mrg {
   2511  1.1  mrg   write_var_marker (file, true, false, name);
   2512  1.1  mrg 
   2513  1.1  mrg   fprintf (file, "\t");
   2514  1.1  mrg 
   2515  1.1  mrg   tree type = TREE_TYPE (exp);
   2516  1.1  mrg   nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
   2517  1.1  mrg 			     TYPE_ALIGN (type));
   2518  1.1  mrg }
   2519  1.1  mrg 
   2520  1.1  mrg /* Implement the ASM_DECLARE_OBJECT_NAME macro.  Used to start writing
   2521  1.1  mrg    a variable DECL with NAME to FILE.  */
   2522  1.1  mrg 
   2523  1.1  mrg void
   2524  1.1  mrg nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
   2525  1.1  mrg {
   2526  1.1  mrg   write_var_marker (file, true, TREE_PUBLIC (decl), name);
   2527  1.1  mrg 
   2528  1.1  mrg   fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
   2529  1.1  mrg 			  : DECL_WEAK (decl) ? ".weak " : ".visible "));
   2530  1.1  mrg 
   2531  1.1  mrg   tree type = TREE_TYPE (decl);
   2532  1.1  mrg   HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
   2533  1.1  mrg   nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
   2534  1.1  mrg 			     type, obj_size, DECL_ALIGN (decl));
   2535  1.1  mrg }
   2536  1.1  mrg 
/* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing.  Visibility
   is emitted as part of each declaration itself (the .visible/.weak
   specifiers written by nvptx_declare_object_name and friends), so no
   separate globalization directive is needed.  */

static void
nvptx_globalize_label (FILE *, const char *)
{
}
   2543  1.1  mrg 
/* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL.  Write an extern
   declaration only for variable DECL with NAME to FILE.  */

static void
nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
{
  /* The middle end can place constant pool decls into the varpool as
     undefined.  Until that is fixed, catch the problem here.  */
  if (DECL_IN_CONSTANT_POOL (decl))
    return;

  /*  We support weak definitions, and hence have the right
      ASM_WEAKEN_DECL definition.  Diagnose the problem here.  */
  if (DECL_WEAK (decl))
    error_at (DECL_SOURCE_LOCATION (decl),
	      "PTX does not support weak declarations"
	      " (only weak definitions)");
  write_var_marker (file, false, TREE_PUBLIC (decl), name);

  fprintf (file, "\t.extern ");
  /* DECL_SIZE_UNIT may be absent for an incomplete type; emit a
     zero-sized declaration in that case.  */
  tree size = DECL_SIZE_UNIT (decl);
  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
			     TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
			     DECL_ALIGN (decl), true);
  nvptx_assemble_decl_end ();
}
   2570  1.1  mrg 
/* Output a pattern for a move instruction.  Returns the asm template
   string for DST := SRC, choosing between mov, bit-cast mov, cvta and
   cvt depending on the operand modes.  */

const char *
nvptx_output_mov_insn (rtx dst, rtx src)
{
  machine_mode dst_mode = GET_MODE (dst);
  machine_mode src_mode = GET_MODE (src);
  machine_mode dst_inner = (GET_CODE (dst) == SUBREG
			    ? GET_MODE (XEXP (dst, 0)) : dst_mode);
  /* NOTE(review): the non-SUBREG fallback below is DST_MODE, not
     SRC_MODE.  This looks deliberate -- constants such as CONST_INT
     carry VOIDmode, and for an ordinary move both operands share a
     mode -- but confirm before treating it as a typo.  */
  machine_mode src_inner = (GET_CODE (src) == SUBREG
			    ? GET_MODE (XEXP (src, 0)) : dst_mode);

  /* Symbolic sources in a non-generic data area need a cvta into the
     generic address space; also record referenced function symbols.  */
  rtx sym = src;
  if (GET_CODE (sym) == CONST)
    sym = XEXP (XEXP (sym, 0), 0);
  if (SYMBOL_REF_P (sym))
    {
      if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
	return "%.\tcvta%D1%t0\t%0, %1;";
      nvptx_maybe_record_fnsym (sym);
    }

  /* Same inner mode: a plain typed mov.  */
  if (src_inner == dst_inner)
    return "%.\tmov%t0\t%0, %1;";

  /* Constants: typed mov into integer destinations, bit-cast mov
     otherwise.  */
  if (CONSTANT_P (src))
    return (GET_MODE_CLASS (dst_inner) == MODE_INT
	    && GET_MODE_CLASS (src_inner) != MODE_FLOAT
	    ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");

  /* Same size, different mode: a bit-cast mov.  */
  if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
    {
      if (GET_MODE_BITSIZE (dst_mode) == 128
	  && GET_MODE_BITSIZE (src_mode) == 128)
	{
	  /* mov.b128 is not supported.  */
	  if (dst_inner == V2DImode && src_inner == TImode)
	    return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
	  else if (dst_inner == TImode && src_inner == V2DImode)
	    return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";

	  gcc_unreachable ();
	}
      return "%.\tmov.b%T0\t%0, %1;";
    }

  if (GET_MODE_BITSIZE (src_inner) == 128
      && GET_MODE_BITSIZE (src_mode) == 64)
    return "%.\tmov.b%T0\t%0, %1;";

  /* Differing sizes: a conversion is required.  */
  return "%.\tcvt%t0%t1\t%0, %1;";
}
   2623  1.1  mrg 
   2624  1.1  mrg /* Output a pre/post barrier for MEM_OPERAND according to MEMMODEL.  */
   2625  1.1  mrg 
   2626  1.1  mrg static void
   2627  1.1  mrg nvptx_output_barrier (rtx *mem_operand, int memmodel, bool pre_p)
   2628  1.1  mrg {
   2629  1.1  mrg   bool post_p = !pre_p;
   2630  1.1  mrg 
   2631  1.1  mrg   switch (memmodel)
   2632  1.1  mrg     {
   2633  1.1  mrg     case MEMMODEL_RELAXED:
   2634  1.1  mrg       return;
   2635  1.1  mrg     case MEMMODEL_CONSUME:
   2636  1.1  mrg     case MEMMODEL_ACQUIRE:
   2637  1.1  mrg     case MEMMODEL_SYNC_ACQUIRE:
   2638  1.1  mrg       if (post_p)
   2639  1.1  mrg 	break;
   2640  1.1  mrg       return;
   2641  1.1  mrg     case MEMMODEL_RELEASE:
   2642  1.1  mrg     case MEMMODEL_SYNC_RELEASE:
   2643  1.1  mrg       if (pre_p)
   2644  1.1  mrg 	break;
   2645  1.1  mrg       return;
   2646  1.1  mrg     case MEMMODEL_ACQ_REL:
   2647  1.1  mrg     case MEMMODEL_SEQ_CST:
   2648  1.1  mrg     case MEMMODEL_SYNC_SEQ_CST:
   2649  1.1  mrg       if (pre_p || post_p)
   2650  1.1  mrg 	break;
   2651  1.1  mrg       return;
   2652  1.1  mrg     default:
   2653  1.1  mrg       gcc_unreachable ();
   2654  1.1  mrg     }
   2655  1.1  mrg 
   2656  1.1  mrg   output_asm_insn ("%.\tmembar%B0;", mem_operand);
   2657  1.1  mrg }
   2658  1.1  mrg 
   2659  1.1  mrg const char *
   2660  1.1  mrg nvptx_output_atomic_insn (const char *asm_template, rtx *operands, int mem_pos,
   2661  1.1  mrg 			  int memmodel_pos)
   2662  1.1  mrg {
   2663  1.1  mrg   nvptx_output_barrier (&operands[mem_pos], INTVAL (operands[memmodel_pos]),
   2664  1.1  mrg 			true);
   2665  1.1  mrg   output_asm_insn (asm_template, operands);
   2666  1.1  mrg   nvptx_output_barrier (&operands[mem_pos], INTVAL (operands[memmodel_pos]),
   2667  1.1  mrg 			false);
   2668  1.1  mrg   return "";
   2669  1.1  mrg }
   2670  1.1  mrg 
   2671  1.1  mrg static void nvptx_print_operand (FILE *, rtx, int);
   2672  1.1  mrg 
   2673  1.1  mrg /* Output INSN, which is a call to CALLEE with result RESULT.  For ptx, this
   2674  1.1  mrg    involves writing .param declarations and in/out copies into them.  For
   2675  1.1  mrg    indirect calls, also write the .callprototype.  */
   2676  1.1  mrg 
   2677  1.1  mrg const char *
   2678  1.1  mrg nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
   2679  1.1  mrg {
   2680  1.1  mrg   char buf[16];
   2681  1.1  mrg   static int labelno;
   2682  1.1  mrg   bool needs_tgt = register_operand (callee, Pmode);
   2683  1.1  mrg   rtx pat = PATTERN (insn);
   2684  1.1  mrg   if (GET_CODE (pat) == COND_EXEC)
   2685  1.1  mrg     pat = COND_EXEC_CODE (pat);
   2686  1.1  mrg   int arg_end = XVECLEN (pat, 0);
   2687  1.1  mrg   tree decl = NULL_TREE;
   2688  1.1  mrg 
   2689  1.1  mrg   fprintf (asm_out_file, "\t{\n");
   2690  1.1  mrg   if (result != NULL)
   2691  1.1  mrg     fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
   2692  1.1  mrg 	     nvptx_ptx_type_from_mode (GET_MODE (result), false),
   2693  1.1  mrg 	     reg_names[NVPTX_RETURN_REGNUM]);
   2694  1.1  mrg 
   2695  1.1  mrg   /* Ensure we have a ptx declaration in the output if necessary.  */
   2696  1.1  mrg   if (GET_CODE (callee) == SYMBOL_REF)
   2697  1.1  mrg     {
   2698  1.1  mrg       decl = SYMBOL_REF_DECL (callee);
   2699  1.1  mrg       if (!decl
   2700  1.1  mrg 	  || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
   2701  1.1  mrg 	nvptx_record_libfunc (callee, result, pat);
   2702  1.1  mrg       else if (DECL_EXTERNAL (decl))
   2703  1.1  mrg 	nvptx_record_fndecl (decl);
   2704  1.1  mrg     }
   2705  1.1  mrg 
   2706  1.1  mrg   if (needs_tgt)
   2707  1.1  mrg     {
   2708  1.1  mrg       ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
   2709  1.1  mrg       labelno++;
   2710  1.1  mrg       ASM_OUTPUT_LABEL (asm_out_file, buf);
   2711  1.1  mrg       std::stringstream s;
   2712  1.1  mrg       write_fn_proto_from_insn (s, NULL, result, pat);
   2713  1.1  mrg       fputs (s.str().c_str(), asm_out_file);
   2714  1.1  mrg     }
   2715  1.1  mrg 
   2716  1.1  mrg   for (int argno = 1; argno < arg_end; argno++)
   2717  1.1  mrg     {
   2718  1.1  mrg       rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
   2719  1.1  mrg       machine_mode mode = GET_MODE (t);
   2720  1.1  mrg       const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
   2721  1.1  mrg 
   2722  1.1  mrg       /* Mode splitting has already been done.  */
   2723  1.1  mrg       fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
   2724  1.1  mrg 	       "\t\tst.param%s [%%out_arg%d], ",
   2725  1.1  mrg 	       ptx_type, argno, ptx_type, argno);
   2726  1.1  mrg       output_reg (asm_out_file, REGNO (t), VOIDmode);
   2727  1.1  mrg       fprintf (asm_out_file, ";\n");
   2728  1.1  mrg     }
   2729  1.1  mrg 
   2730  1.1  mrg   /* The '.' stands for the call's predicate, if any.  */
   2731  1.1  mrg   nvptx_print_operand (asm_out_file, NULL_RTX, '.');
   2732  1.1  mrg   fprintf (asm_out_file, "\t\tcall ");
   2733  1.1  mrg   if (result != NULL_RTX)
   2734  1.1  mrg     fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
   2735  1.1  mrg 
   2736  1.1  mrg   if (decl)
   2737  1.1  mrg     {
   2738  1.1  mrg       char *replaced_dots = NULL;
   2739  1.1  mrg       const char *name = get_fnname_from_decl (decl);
   2740  1.1  mrg       const char *replacement = nvptx_name_replacement (name);
   2741  1.1  mrg       if (replacement != name)
   2742  1.1  mrg 	name = replacement;
   2743  1.1  mrg       else
   2744  1.1  mrg 	{
   2745  1.1  mrg 	  replaced_dots = nvptx_replace_dot (name);
   2746  1.1  mrg 	  if (replaced_dots)
   2747  1.1  mrg 	    name = replaced_dots;
   2748  1.1  mrg 	}
   2749  1.1  mrg       assemble_name (asm_out_file, name);
   2750  1.1  mrg       if (replaced_dots)
   2751  1.1  mrg 	XDELETE (replaced_dots);
   2752  1.1  mrg     }
   2753  1.1  mrg   else
   2754  1.1  mrg     output_address (VOIDmode, callee);
   2755  1.1  mrg 
   2756  1.1  mrg   const char *open = "(";
   2757  1.1  mrg   for (int argno = 1; argno < arg_end; argno++)
   2758  1.1  mrg     {
   2759  1.1  mrg       fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
   2760  1.1  mrg       open = "";
   2761  1.1  mrg     }
   2762  1.1  mrg   if (decl && DECL_STATIC_CHAIN (decl))
   2763  1.1  mrg     {
   2764  1.1  mrg       fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
   2765  1.1  mrg       open = "";
   2766  1.1  mrg     }
   2767  1.1  mrg   if (!open[0])
   2768  1.1  mrg     fprintf (asm_out_file, ")");
   2769  1.1  mrg 
   2770  1.1  mrg   if (needs_tgt)
   2771  1.1  mrg     {
   2772  1.1  mrg       fprintf (asm_out_file, ", ");
   2773  1.1  mrg       assemble_name (asm_out_file, buf);
   2774  1.1  mrg     }
   2775  1.1  mrg   fprintf (asm_out_file, ";\n");
   2776  1.1  mrg 
   2777  1.1  mrg   if (find_reg_note (insn, REG_NORETURN, NULL))
   2778  1.1  mrg     {
   2779  1.1  mrg       /* No return functions confuse the PTX JIT, as it doesn't realize
   2780  1.1  mrg 	 the flow control barrier they imply.  It can seg fault if it
   2781  1.1  mrg 	 encounters what looks like an unexitable loop.  Emit a trailing
   2782  1.1  mrg 	 trap and exit, which it does grok.  */
   2783  1.1  mrg       fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
   2784  1.1  mrg       fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
   2785  1.1  mrg     }
   2786  1.1  mrg 
   2787  1.1  mrg   if (result)
   2788  1.1  mrg     {
   2789  1.1  mrg       static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
   2790  1.1  mrg 
   2791  1.1  mrg       if (!rval[0])
   2792  1.1  mrg 	/* We must escape the '%' that starts RETURN_REGNUM.  */
   2793  1.1  mrg 	sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
   2794  1.1  mrg 		 reg_names[NVPTX_RETURN_REGNUM]);
   2795  1.1  mrg       return rval;
   2796  1.1  mrg     }
   2797  1.1  mrg 
   2798  1.1  mrg   return "}";
   2799  1.1  mrg }
   2800  1.1  mrg 
/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.

   The two punctuation codes nvptx_print_operand understands are
   '.' (call predicate) and '#' (rounding mode).  */

static bool
nvptx_print_operand_punct_valid_p (unsigned char c)
{
  switch (c)
    {
    case '.':
    case '#':
      return true;
    default:
      return false;
    }
}
   2808  1.1  mrg 
   2809  1.1  mrg /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE.  */
   2810  1.1  mrg 
   2811  1.1  mrg static void
   2812  1.1  mrg nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
   2813  1.1  mrg {
   2814  1.1  mrg   rtx off;
   2815  1.1  mrg   if (GET_CODE (x) == CONST)
   2816  1.1  mrg     x = XEXP (x, 0);
   2817  1.1  mrg   switch (GET_CODE (x))
   2818  1.1  mrg     {
   2819  1.1  mrg     case PLUS:
   2820  1.1  mrg       off = XEXP (x, 1);
   2821  1.1  mrg       output_address (VOIDmode, XEXP (x, 0));
   2822  1.1  mrg       fprintf (file, "+");
   2823  1.1  mrg       output_address (VOIDmode, off);
   2824  1.1  mrg       break;
   2825  1.1  mrg 
   2826  1.1  mrg     case SYMBOL_REF:
   2827  1.1  mrg     case LABEL_REF:
   2828  1.1  mrg       output_addr_const (file, x);
   2829  1.1  mrg       break;
   2830  1.1  mrg 
   2831  1.1  mrg     default:
   2832  1.1  mrg       gcc_assert (GET_CODE (x) != MEM);
   2833  1.1  mrg       nvptx_print_operand (file, x, 0);
   2834  1.1  mrg       break;
   2835  1.1  mrg     }
   2836  1.1  mrg }
   2837  1.1  mrg 
/* Write assembly language output for the address ADDR to FILE.
   Implements TARGET_PRINT_OPERAND_ADDRESS; defers to the common
   address-printing worker above.  */

static void
nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
{
  nvptx_print_address_operand (file, addr, mode);
}
   2845  1.1  mrg 
/* Return the data area referenced by MEM rtx X.  The address is
   scanned for a SYMBOL_REF, whose recorded data area (if any) decides
   the answer; with no symbol in the address we cannot tell, so fall
   back to DATA_AREA_GENERIC.  */

static nvptx_data_area
nvptx_mem_data_area (const_rtx x)
{
  gcc_assert (GET_CODE (x) == MEM);

  const_rtx addr = XEXP (x, 0);
  subrtx_iterator::array_type array;
  FOR_EACH_SUBRTX (iter, array, addr, ALL)
    if (SYMBOL_REF_P (*iter))
      return SYMBOL_DATA_AREA (*iter);

  return DATA_AREA_GENERIC;
}
   2859  1.1  mrg 
   2860  1.1  mrg bool
   2861  1.1  mrg nvptx_mem_maybe_shared_p (const_rtx x)
   2862  1.1  mrg {
   2863  1.1  mrg   nvptx_data_area area = nvptx_mem_data_area (x);
   2864  1.1  mrg   return area == DATA_AREA_SHARED || area == DATA_AREA_GENERIC;
   2865  1.1  mrg }
   2866  1.1  mrg 
/* Print an operand, X, to FILE, with an optional modifier in CODE.

   Meaning of CODE:
   . -- print the predicate for the instruction or an empty string for an
        unconditional one.
   # -- print a rounding mode for the instruction

   A -- print a data area for a MEM
   B -- print a scope suffix (".cta" or ".sys") for a MEM's data area
   c -- print an opcode suffix for a comparison operator, including a type code
   D -- print a data area for a MEM operand
   H -- print the high half of a register pair holding a split-mode SUBREG
   j -- print "@" before the operand
   J -- print "@!" before the operand
   L -- print the low half of a register pair holding a split-mode SUBREG
   S -- print a shuffle kind specified by CONST_INT
   t -- print a type opcode suffix, promoting QImode to 32 bits
   T -- print a type size in bits
   u -- print a type opcode suffix without promotions.
   x -- print a destination operand that may also be a bit bucket.  */

static void
nvptx_print_operand (FILE *file, rtx x, int code)
{
  if (code == '.')
    {
      /* X is ignored; the predicate comes from the insn being output.
	 An EQ predicate tests the register against zero, hence the
	 '!' negation.  */
      x = current_insn_predicate;
      if (x)
	{
	  fputs ("@", file);
	  if (GET_CODE (x) == EQ)
	    fputs ("!", file);
	  output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
	}
      return;
    }
  else if (code == '#')
    {
      /* Round-to-nearest-even is the only rounding mode emitted.  */
      fputs (".rn", file);
      return;
    }

  enum rtx_code x_code = GET_CODE (x);
  machine_mode mode = GET_MODE (x);

  switch (code)
    {
    case 'x':
      /* A destination whose value is unused becomes the PTX bit
	 bucket '_'.  */
      if (current_output_insn != NULL
	  && find_reg_note (current_output_insn, REG_UNUSED, x) != NULL_RTX)
	{
	  fputs ("_", file);
	  return;
	}
      goto common;
    case 'B':
      if (SYMBOL_REF_P (XEXP (x, 0)))
	switch (SYMBOL_DATA_AREA (XEXP (x, 0)))
	  {
	  case DATA_AREA_GENERIC:
	    /* Assume worst-case: global.  */
	    gcc_fallthrough (); /* FALLTHROUGH.  */
	  case DATA_AREA_GLOBAL:
	    break;
	  case DATA_AREA_SHARED:
	    fputs (".cta", file);
	    return;
	  case DATA_AREA_LOCAL:
	  case DATA_AREA_CONST:
	  case DATA_AREA_PARAM:
	  default:
	    gcc_unreachable ();
	  }

      /* There are 2 cases where membar.sys differs from membar.gl:
	 - host accesses global memory (f.i. systemwide atomics)
	 - 2 or more devices are setup in peer-to-peer mode, and one
	   peer can access global memory of other peer.
	 Neither are currently supported by openMP/OpenACC on nvptx, but
	 that could change, so we default to membar.sys.  We could support
	 this more optimally by adding DATA_AREA_SYS and then emitting
	 .gl for DATA_AREA_GLOBAL and .sys for DATA_AREA_SYS.  */
      fputs (".sys", file);
      return;

    case 'A':
      x = XEXP (x, 0);
      gcc_fallthrough (); /* FALLTHROUGH. */

    case 'D':
      /* Strip any CONST/PLUS wrapper to expose the symbol, if any.  */
      if (GET_CODE (x) == CONST)
	x = XEXP (x, 0);
      if (GET_CODE (x) == PLUS)
	x = XEXP (x, 0);

      if (GET_CODE (x) == SYMBOL_REF)
	fputs (section_for_sym (x), file);
      break;

    case 't':
    case 'u':
      if (x_code == SUBREG)
	{
	  /* Type the operand after the SUBREG has been resolved:
	     vector element, split half, or the inner mode itself.  */
	  machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
	  if (VECTOR_MODE_P (inner_mode)
	      && (GET_MODE_SIZE (mode)
		  <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
	    mode = GET_MODE_INNER (inner_mode);
	  else if (split_mode_p (inner_mode))
	    mode = maybe_split_mode (inner_mode);
	  else
	    mode = inner_mode;
	}
      fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
      break;

    case 'H':
    case 'L':
      {
	rtx inner_x = SUBREG_REG (x);
	machine_mode inner_mode = GET_MODE (inner_x);
	machine_mode split = maybe_split_mode (inner_mode);

	/* 'H' selects the upper half of the register pair by
	   offsetting half the inner mode's size; 'L' the lower.  */
	output_reg (file, REGNO (inner_x), split,
		    (code == 'H'
		     ? GET_MODE_SIZE (inner_mode) / 2
		     : 0));
      }
      break;

    case 'S':
      {
	nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
	/* Same order as nvptx_shuffle_kind.  */
	static const char *const kinds[] =
	  {".up", ".down", ".bfly", ".idx"};
	fputs (kinds[kind], file);
      }
      break;

    case 'T':
      fprintf (file, "%d", GET_MODE_BITSIZE (mode));
      break;

    case 'j':
      fprintf (file, "@");
      goto common;

    case 'J':
      fprintf (file, "@!");
      goto common;

    case 'c':
      /* The comparison's type comes from its operands, not from the
	 BImode result.  */
      mode = GET_MODE (XEXP (x, 0));
      switch (x_code)
	{
	case EQ:
	  fputs (".eq", file);
	  break;
	case NE:
	  if (FLOAT_MODE_P (mode))
	    fputs (".neu", file);
	  else
	    fputs (".ne", file);
	  break;
	case LE:
	case LEU:
	  fputs (".le", file);
	  break;
	case GE:
	case GEU:
	  fputs (".ge", file);
	  break;
	case LT:
	case LTU:
	  fputs (".lt", file);
	  break;
	case GT:
	case GTU:
	  fputs (".gt", file);
	  break;
	case LTGT:
	  fputs (".ne", file);
	  break;
	case UNEQ:
	  fputs (".equ", file);
	  break;
	case UNLE:
	  fputs (".leu", file);
	  break;
	case UNGE:
	  fputs (".geu", file);
	  break;
	case UNLT:
	  fputs (".ltu", file);
	  break;
	case UNGT:
	  fputs (".gtu", file);
	  break;
	case UNORDERED:
	  fputs (".nan", file);
	  break;
	case ORDERED:
	  fputs (".num", file);
	  break;
	default:
	  gcc_unreachable ();
	}
      /* Float compares and the sign-insensitive integer compares get
	 the generic type suffix; the rest are printed as signed.  */
      if (FLOAT_MODE_P (mode)
	  || x_code == EQ || x_code == NE
	  || x_code == GEU || x_code == GTU
	  || x_code == LEU || x_code == LTU)
	fputs (nvptx_ptx_type_from_mode (mode, true), file);
      else
	fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
      break;
    default:
    common:
      switch (x_code)
	{
	case SUBREG:
	  {
	    rtx inner_x = SUBREG_REG (x);
	    machine_mode inner_mode = GET_MODE (inner_x);
	    machine_mode split = maybe_split_mode (inner_mode);

	    if (VECTOR_MODE_P (inner_mode)
		&& (GET_MODE_SIZE (mode)
		    <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
	      {
		/* Vector element access: print "reg.x" or "reg.y"
		   depending on the subreg byte offset.  */
		output_reg (file, REGNO (inner_x), VOIDmode);
		fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
	      }
	    else if (split_mode_p (inner_mode)
		&& (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
	      output_reg (file, REGNO (inner_x), split);
	    else
	      output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
	  }
	  break;

	case REG:
	  output_reg (file, REGNO (x), maybe_split_mode (mode));
	  break;

	case MEM:
	  fputc ('[', file);
	  nvptx_print_address_operand (file, XEXP (x, 0), mode);
	  fputc (']', file);
	  break;

	case CONST_INT:
	  output_addr_const (file, x);
	  break;

	case CONST:
	case SYMBOL_REF:
	case LABEL_REF:
	  /* We could use output_addr_const, but that can print things like
	     "x-8", which breaks ptxas.  Need to ensure it is output as
	     "x+-8".  */
	  nvptx_print_address_operand (file, x, VOIDmode);
	  break;

	case CONST_DOUBLE:
	  /* Print the raw bit image using PTX's 0f/0d hex-float
	     syntax; mask to 32 bits since 'long' may be wider.  */
	  long vals[2];
	  real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
	  vals[0] &= 0xffffffff;
	  vals[1] &= 0xffffffff;
	  if (mode == SFmode)
	    fprintf (file, "0f%08lx", vals[0]);
	  else
	    fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
	  break;

	case CONST_VECTOR:
	  {
	    unsigned n = CONST_VECTOR_NUNITS (x);
	    fprintf (file, "{ ");
	    for (unsigned i = 0; i < n; ++i)
	      {
		if (i != 0)
		  fprintf (file, ", ");

		rtx elem = CONST_VECTOR_ELT (x, i);
		output_addr_const (file, elem);
	      }
	    fprintf (file, " }");
	  }
	  break;

	default:
	  output_addr_const (file, x);
	}
    }
}
   3158  1.1  mrg 
   3159  1.1  mrg /* Record replacement regs used to deal with subreg operands.  */
struct reg_replace
{
  /* Lazily-grown pool of replacement pseudos, at most one per insn
     operand.  */
  rtx replacement[MAX_RECOG_OPERANDS];
  /* Mode shared by every register in this pool.  */
  machine_mode mode;
  /* Number of pool slots that have ever been allocated.  */
  int n_allocated;
  /* Number of pool slots handed out for the insn currently being
     processed; reset per insn so registers are reused.  */
  int n_in_use;
};
   3168  1.1  mrg 
   3169  1.1  mrg /* Allocate or reuse a replacement in R and return the rtx.  */
   3170  1.1  mrg 
   3171  1.1  mrg static rtx
   3172  1.1  mrg get_replacement (struct reg_replace *r)
   3173  1.1  mrg {
   3174  1.1  mrg   if (r->n_allocated == r->n_in_use)
   3175  1.1  mrg     r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
   3176  1.1  mrg   return r->replacement[r->n_in_use++];
   3177  1.1  mrg }
   3178  1.1  mrg 
/* Clean up subreg operands.  In ptx assembly, everything is typed, and
   the presence of subregs would break the rules for most instructions.
   Replace them with a suitable new register of the right size, plus
   conversion copyin/copyout instructions.  */

static void
nvptx_reorg_subreg (void)
{
  /* One replacement pool per scalar integer mode.  */
  struct reg_replace qiregs, hiregs, siregs, diregs;
  rtx_insn *insn, *next;

  qiregs.n_allocated = 0;
  hiregs.n_allocated = 0;
  siregs.n_allocated = 0;
  diregs.n_allocated = 0;
  qiregs.mode = QImode;
  hiregs.mode = HImode;
  siregs.mode = SImode;
  diregs.mode = DImode;

  for (insn = get_insns (); insn; insn = next)
    {
      next = NEXT_INSN (insn);
      /* Skip insns we do not rewrite: debug insns, asms, and bare
	 USE/CLOBBER patterns.  */
      if (!NONDEBUG_INSN_P (insn)
	  || asm_noperands (PATTERN (insn)) >= 0
	  || GET_CODE (PATTERN (insn)) == USE
	  || GET_CODE (PATTERN (insn)) == CLOBBER)
	continue;

      /* Reset the pools so replacement registers are reused from one
	 insn to the next.  */
      qiregs.n_in_use = 0;
      hiregs.n_in_use = 0;
      siregs.n_in_use = 0;
      diregs.n_in_use = 0;
      extract_insn (insn);
      enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);

      for (int i = 0; i < recog_data.n_operands; i++)
	{
	  rtx op = recog_data.operand[i];
	  if (GET_CODE (op) != SUBREG)
	    continue;

	  rtx inner = SUBREG_REG (op);

	  machine_mode outer_mode = GET_MODE (op);
	  machine_mode inner_mode = GET_MODE (inner);
	  gcc_assert (s_ok);
	  /* Subregs that do not widen (inner at least as precise as
	     outer) are left alone.  */
	  if (s_ok
	      && (GET_MODE_PRECISION (inner_mode)
		  >= GET_MODE_PRECISION (outer_mode)))
	    continue;
	  gcc_assert (SCALAR_INT_MODE_P (outer_mode));
	  struct reg_replace *r = (outer_mode == QImode ? &qiregs
				   : outer_mode == HImode ? &hiregs
				   : outer_mode == SImode ? &siregs
				   : &diregs);
	  rtx new_reg = get_replacement (r);

	  /* For inputs (and in-outs), copy the value into the
	     replacement with a conversion before the insn.  */
	  if (recog_data.operand_type[i] != OP_OUT)
	    {
	      enum rtx_code code;
	      if (GET_MODE_PRECISION (inner_mode)
		  < GET_MODE_PRECISION (outer_mode))
		code = ZERO_EXTEND;
	      else
		code = TRUNCATE;

	      rtx pat = gen_rtx_SET (new_reg,
				     gen_rtx_fmt_e (code, outer_mode, inner));
	      emit_insn_before (pat, insn);
	    }

	  /* For outputs (and in-outs), copy the result back with the
	     inverse conversion after the insn.  */
	  if (recog_data.operand_type[i] != OP_IN)
	    {
	      enum rtx_code code;
	      if (GET_MODE_PRECISION (inner_mode)
		  < GET_MODE_PRECISION (outer_mode))
		code = TRUNCATE;
	      else
		code = ZERO_EXTEND;

	      rtx pat = gen_rtx_SET (inner,
				     gen_rtx_fmt_e (code, inner_mode, new_reg));
	      emit_insn_after (pat, insn);
	    }
	  /* Finally splice the replacement register into the insn.  */
	  validate_change (insn, recog_data.operand_loc[i], new_reg, false);
	}
    }
}
   3268  1.1  mrg 
   3269  1.1  mrg /* Return a SImode "master lane index" register for uniform-simt, allocating on
   3270  1.1  mrg    first use.  */
   3271  1.1  mrg 
   3272  1.1  mrg static rtx
   3273  1.1  mrg nvptx_get_unisimt_master ()
   3274  1.1  mrg {
   3275  1.1  mrg   rtx &master = cfun->machine->unisimt_master;
   3276  1.1  mrg   return master ? master : master = gen_reg_rtx (SImode);
   3277  1.1  mrg }
   3278  1.1  mrg 
   3279  1.1  mrg /* Return a BImode "predicate" register for uniform-simt, similar to above.  */
   3280  1.1  mrg 
   3281  1.1  mrg static rtx
   3282  1.1  mrg nvptx_get_unisimt_predicate ()
   3283  1.1  mrg {
   3284  1.1  mrg   rtx &pred = cfun->machine->unisimt_predicate;
   3285  1.1  mrg   return pred ? pred : pred = gen_reg_rtx (BImode);
   3286  1.1  mrg }
   3287  1.1  mrg 
   3288  1.1  mrg static rtx
   3289  1.1  mrg nvptx_get_unisimt_outside_simt_predicate ()
   3290  1.1  mrg {
   3291  1.1  mrg   rtx &pred = cfun->machine->unisimt_outside_simt_predicate;
   3292  1.1  mrg   return pred ? pred : pred = gen_reg_rtx (BImode);
   3293  1.1  mrg }
   3294  1.1  mrg 
   3295  1.1  mrg /* Return true if given call insn references one of the functions provided by
   3296  1.1  mrg    the CUDA runtime: malloc, free, vprintf.  */
   3297  1.1  mrg 
   3298  1.1  mrg static bool
   3299  1.1  mrg nvptx_call_insn_is_syscall_p (rtx_insn *insn)
   3300  1.1  mrg {
   3301  1.1  mrg   rtx pat = PATTERN (insn);
   3302  1.1  mrg   gcc_checking_assert (GET_CODE (pat) == PARALLEL);
   3303  1.1  mrg   pat = XVECEXP (pat, 0, 0);
   3304  1.1  mrg   if (GET_CODE (pat) == SET)
   3305  1.1  mrg     pat = SET_SRC (pat);
   3306  1.1  mrg   gcc_checking_assert (GET_CODE (pat) == CALL
   3307  1.1  mrg 		       && GET_CODE (XEXP (pat, 0)) == MEM);
   3308  1.1  mrg   rtx addr = XEXP (XEXP (pat, 0), 0);
   3309  1.1  mrg   if (GET_CODE (addr) != SYMBOL_REF)
   3310  1.1  mrg     return false;
   3311  1.1  mrg   const char *name = XSTR (addr, 0);
   3312  1.1  mrg   /* Ordinary malloc/free are redirected to __nvptx_{malloc,free), so only the
   3313  1.1  mrg      references with forced assembler name refer to PTX syscalls.  For vprintf,
   3314  1.1  mrg      accept both normal and forced-assembler-name references.  */
   3315  1.1  mrg   return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
   3316  1.1  mrg 	  || !strcmp (name, "*malloc")
   3317  1.1  mrg 	  || !strcmp (name, "*free"));
   3318  1.1  mrg }
   3319  1.1  mrg 
   3320  1.1  mrg /* If SET subexpression of INSN sets a register, emit a shuffle instruction to
   3321  1.1  mrg    propagate its value from lane MASTER to current lane.  */
   3322  1.1  mrg 
   3323  1.1  mrg static bool
   3324  1.1  mrg nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
   3325  1.1  mrg {
   3326  1.1  mrg   rtx reg;
   3327  1.1  mrg   if (GET_CODE (set) == SET
   3328  1.1  mrg       && REG_P (reg = SET_DEST (set))
   3329  1.1  mrg       && find_reg_note (insn, REG_UNUSED, reg) == NULL_RTX)
   3330  1.1  mrg     {
   3331  1.1  mrg       emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX),
   3332  1.1  mrg 		       insn);
   3333  1.1  mrg       return true;
   3334  1.1  mrg     }
   3335  1.1  mrg 
   3336  1.1  mrg   return false;
   3337  1.1  mrg }
   3338  1.1  mrg 
   3339  1.1  mrg static void
   3340  1.1  mrg predicate_insn (rtx_insn *insn, rtx pred)
   3341  1.1  mrg {
   3342  1.1  mrg   rtx pat = PATTERN (insn);
   3343  1.1  mrg   pred = gen_rtx_NE (BImode, pred, const0_rtx);
   3344  1.1  mrg   pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
   3345  1.1  mrg   bool changed_p = validate_change (insn, &PATTERN (insn), pat, false);
   3346  1.1  mrg   gcc_assert (changed_p);
   3347  1.1  mrg }
   3348  1.1  mrg 
/* Adjust code for uniform-simt code generation variant by making atomics and
   "syscalls" conditionally executed, and inserting shuffle-based propagation
   for registers being set.  */

static void
nvptx_reorg_uniform_simt ()
{
  rtx_insn *insn, *next;

  for (insn = get_insns (); insn; insn = next)
    {
      next = NEXT_INSN (insn);

      /* Skip NOTE, USE, etc.  */
      if (!INSN_P (insn) || recog_memoized (insn) == -1)
	continue;

      /* Only syscalls and atomics are rewritten; everything else is
	 left untouched.  */
      if (CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
	{
	  /* Handle syscall.  */
	}
      else if (get_attr_atomic (insn))
	{
	  /* Handle atomic insn.  */
	}
      else
	continue;

      /* Emit a shuffle after the insn for every register it sets, so
	 the value is propagated from the master lane.  */
      rtx pat = PATTERN (insn);
      rtx master = nvptx_get_unisimt_master ();
      bool shuffle_p = false;
      switch (GET_CODE (pat))
       {
       case PARALLEL:
	 for (int i = 0; i < XVECLEN (pat, 0); i++)
	   shuffle_p
	     |= nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
	 break;
       case SET:
	 shuffle_p |= nvptx_unisimt_handle_set (pat, insn, master);
	 break;
       default:
	 gcc_unreachable ();
       }

      if (shuffle_p && TARGET_PTX_6_0)
	{
	  /* The shuffle is a sync, so uniformity is guaranteed.  */
	}
      else
	{
	  if (TARGET_PTX_6_0)
	    {
	      gcc_assert (!shuffle_p);
	      /* Emit after the insn, to guarantee uniformity.  */
	      emit_insn_after (gen_nvptx_warpsync (), insn);
	    }
	  else
	    {
	      /* Emit after the insn (and before the shuffle, if there are any)
		 to check uniformity.  */
	      emit_insn_after (gen_nvptx_uniform_warp_check (), insn);
	    }
	}

      /* Guard the insn itself with the uniform-simt predicate...  */
      rtx pred = nvptx_get_unisimt_predicate ();
      predicate_insn (insn, pred);

      /* ...and guard the helper insns emitted after it (shuffles,
	 warp sync/check) with the outside-simt predicate, allocated
	 only if any such insns exist.  */
      pred = NULL_RTX;
      for (rtx_insn *post = NEXT_INSN (insn); post != next;
	   post = NEXT_INSN (post))
	{
	  if (pred == NULL_RTX)
	    pred = nvptx_get_unisimt_outside_simt_predicate ();
	  predicate_insn (post, pred);
	}
    }
}
   3427  1.1  mrg 
/* Offloading function attributes.  */

struct offload_attrs
{
  /* Partitioning mask.  */
  unsigned mask;
  /* Launch geometry.  init_axis_dim treats num_workers == 0 as
     "unspecified"; presumably the same convention holds for the other
     dimensions -- confirm against populate_offload_attrs.  */
  int num_gangs;
  int num_workers;
  int vector_length;
};
   3437  1.1  mrg 
/* Define entries for cfun->machine->axis_dim.  */

#define MACH_VECTOR_LENGTH 0
#define MACH_MAX_WORKERS 1

/* Forward declaration of the worker that fills in an offload_attrs;
   needed by init_axis_dim below.  */
static void populate_offload_attrs (offload_attrs *oa);
   3444  1.1  mrg 
   3445  1.1  mrg static void
   3446  1.1  mrg init_axis_dim (void)
   3447  1.1  mrg {
   3448  1.1  mrg   offload_attrs oa;
   3449  1.1  mrg   int max_workers;
   3450  1.1  mrg 
   3451  1.1  mrg   populate_offload_attrs (&oa);
   3452  1.1  mrg 
   3453  1.1  mrg   if (oa.num_workers == 0)
   3454  1.1  mrg     max_workers = PTX_CTA_SIZE / oa.vector_length;
   3455  1.1  mrg   else
   3456  1.1  mrg     max_workers = oa.num_workers;
   3457  1.1  mrg 
   3458  1.1  mrg   cfun->machine->axis_dim[MACH_VECTOR_LENGTH] = oa.vector_length;
   3459  1.1  mrg   cfun->machine->axis_dim[MACH_MAX_WORKERS] = max_workers;
   3460  1.1  mrg   cfun->machine->axis_dim_init_p = true;
   3461  1.1  mrg }
   3462  1.1  mrg 
   3463  1.1  mrg static int ATTRIBUTE_UNUSED
   3464  1.1  mrg nvptx_mach_max_workers ()
   3465  1.1  mrg {
   3466  1.1  mrg   if (!cfun->machine->axis_dim_init_p)
   3467  1.1  mrg     init_axis_dim ();
   3468  1.1  mrg   return cfun->machine->axis_dim[MACH_MAX_WORKERS];
   3469  1.1  mrg }
   3470  1.1  mrg 
   3471  1.1  mrg static int ATTRIBUTE_UNUSED
   3472  1.1  mrg nvptx_mach_vector_length ()
   3473  1.1  mrg {
   3474  1.1  mrg   if (!cfun->machine->axis_dim_init_p)
   3475  1.1  mrg     init_axis_dim ();
   3476  1.1  mrg   return cfun->machine->axis_dim[MACH_VECTOR_LENGTH];
   3477  1.1  mrg }
   3478  1.1  mrg 
/* Loop structure of the function.  The entire function is described as
   a NULL loop.  */
/* See also 'gcc/omp-oacc-neuter-broadcast.cc:struct parallel_g'.  */

struct parallel
{
  /* Parent parallel.  */
  parallel *parent;

  /* Next sibling parallel.  */
  parallel *next;

  /* First child parallel.  */
  parallel *inner;

  /* Partitioning mask of the parallel.  */
  unsigned mask;

  /* Partitioning used within inner parallels. */
  unsigned inner_mask;

  /* Location of parallel forked and join.  The forked is the first
     block in the parallel and the join is the first block after
     the partition.  */
  basic_block forked_block;
  basic_block join_block;

  /* Insns marking the same boundaries.  NOTE(review): presumably the
     -ed forms are inside the partition and the -ing forms outside, by
     analogy with the block comment below -- confirm against the code
     that sets them.  */
  rtx_insn *forked_insn;
  rtx_insn *join_insn;

  rtx_insn *fork_insn;
  rtx_insn *joining_insn;

  /* Basic blocks in this parallel, but not in child parallels.  The
     FORKED and JOINING blocks are in the partition.  The FORK and JOIN
     blocks are not.  */
  auto_vec<basic_block> blocks;

public:
  parallel (parallel *parent, unsigned mode);
  ~parallel ();
};
   3521  1.1  mrg 
/* Constructor links the new parallel into its parent's chain of
   children.  */
   3524  1.1  mrg 
   3525  1.1  mrg parallel::parallel (parallel *parent_, unsigned mask_)
   3526  1.1  mrg   :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
   3527  1.1  mrg {
   3528  1.1  mrg   forked_block = join_block = 0;
   3529  1.1  mrg   forked_insn = join_insn = 0;
   3530  1.1  mrg   fork_insn = joining_insn = 0;
   3531  1.1  mrg 
   3532  1.1  mrg   if (parent)
   3533  1.1  mrg     {
   3534  1.1  mrg       next = parent->inner;
   3535  1.1  mrg       parent->inner = this;
   3536  1.1  mrg     }
   3537  1.1  mrg }
   3538  1.1  mrg 
parallel::~parallel ()
{
  /* Recursively destroy all nested parallels and the remainder of
     the sibling chain.  */
  delete inner;
  delete next;
}
   3544  1.1  mrg 
/* Map of basic blocks to the first insn of interest (forked, join or
   return) that they contain; populated by nvptx_split_blocks.  */
typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;

/* A tuple of an insn of interest and the BB in which it resides.  */
typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
typedef auto_vec<insn_bb_t> insn_bb_vec_t;
   3551  1.1  mrg 
/* Split basic blocks such that the forked and join unspecs are at
   the start of their basic blocks.  Thus afterwards each block will
   3554  1.1  mrg    have a single partitioning mode.  We also do the same for return
   3555  1.1  mrg    insns, as they are executed by every thread.  Return the
   3556  1.1  mrg    partitioning mode of the function as a whole.  Populate MAP with
   3557  1.1  mrg    head and tail blocks.  We also clear the BB visited flag, which is
   3558  1.1  mrg    used when finding partitions.  */
   3559  1.1  mrg /* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_split_blocks'.  */
   3560  1.1  mrg 
static void
nvptx_split_blocks (bb_insn_map_t *map)
{
  insn_bb_vec_t worklist;
  basic_block block;
  rtx_insn *insn;

  /* Locate all the reorg instructions of interest.  */
  FOR_ALL_BB_FN (block, cfun)
    {
      bool seen_insn = false;

      /* Clear visited flag, for use by parallel locator.  */
      block->flags &= ~BB_VISITED;

      FOR_BB_INSNS (block, insn)
	{
	  if (!INSN_P (insn))
	    continue;
	  switch (recog_memoized (insn))
	    {
	    default:
	      /* An ordinary insn; anything of interest after it will
		 need its block split.  */
	      seen_insn = true;
	      continue;
	    case CODE_FOR_nvptx_forked:
	    case CODE_FOR_nvptx_join:
	      break;

	    case CODE_FOR_return:
	      /* We also need to split just before return insns, as
		 that insn needs executing by all threads, but the
		 block it is in probably does not.  */
	      break;
	    }

	  if (seen_insn)
	    /* We've found an instruction that must be at the start of
	       a block, but isn't.  Add it to the worklist.  */
	    worklist.safe_push (insn_bb_t (insn, block));
	  else
	    /* It was already the first instruction.  Just add it to
	       the map.  */
	    map->get_or_insert (block) = insn;
	  seen_insn = true;
	}
    }

  /* Split blocks on the worklist.  Splitting is deferred to here so
     the scan above iterates over the original, unmodified CFG.  */
  unsigned ix;
  insn_bb_t *elt;
  basic_block remap = 0;
  for (ix = 0; worklist.iterate (ix, &elt); ix++)
    {
      if (remap != elt->second)
	{
	  /* First split point within a new original block; later
	     split points in the same original block continue from
	     the tail block produced by the previous split.  */
	  block = elt->second;
	  remap = block;
	}

      /* Split block before insn.  The insn is in the new block.  */
      edge e = split_block (block, PREV_INSN (elt->first));

      block = e->dest;
      map->get_or_insert (block) = elt->first;
    }
}
   3627  1.1  mrg 
   3628  1.1  mrg /* Return true if MASK contains parallelism that requires shared
   3629  1.1  mrg    memory to broadcast.  */
   3630  1.1  mrg 
   3631  1.1  mrg static bool
   3632  1.1  mrg nvptx_needs_shared_bcast (unsigned mask)
   3633  1.1  mrg {
   3634  1.1  mrg   bool worker = mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
   3635  1.1  mrg   bool large_vector = (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
   3636  1.1  mrg     && nvptx_mach_vector_length () != PTX_WARP_SIZE;
   3637  1.1  mrg 
   3638  1.1  mrg   return worker || large_vector;
   3639  1.1  mrg }
   3640  1.1  mrg 
   3641  1.1  mrg /* BLOCK is a basic block containing a head or tail instruction.
   3642  1.1  mrg    Locate the associated prehead or pretail instruction, which must be
   3643  1.1  mrg    in the single predecessor block.  */
   3644  1.1  mrg 
static rtx_insn *
nvptx_discover_pre (basic_block block, int expected)
{
  gcc_assert (block->preds->length () == 1);
  basic_block pre_block = (*block->preds)[0]->src;
  rtx_insn *pre_insn;

  /* Scan backwards from the predecessor's end for its last real
     insn; it must exist before we run off the block's head.  */
  for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
       pre_insn = PREV_INSN (pre_insn))
    gcc_assert (pre_insn != BB_HEAD (pre_block));

  /* That insn must be the EXPECTED fork/joining marker.  */
  gcc_assert (recog_memoized (pre_insn) == expected);
  return pre_insn;
}
   3659  1.1  mrg 
   3660  1.1  mrg /* Dump this parallel and all its inner parallels.  */
   3661  1.1  mrg /* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_dump_pars'.  */
   3662  1.1  mrg 
   3663  1.1  mrg static void
   3664  1.1  mrg nvptx_dump_pars (parallel *par, unsigned depth)
   3665  1.1  mrg {
   3666  1.1  mrg   fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
   3667  1.1  mrg 	   depth, par->mask,
   3668  1.1  mrg 	   par->forked_block ? par->forked_block->index : -1,
   3669  1.1  mrg 	   par->join_block ? par->join_block->index : -1);
   3670  1.1  mrg 
   3671  1.1  mrg   fprintf (dump_file, "    blocks:");
   3672  1.1  mrg 
   3673  1.1  mrg   basic_block block;
   3674  1.1  mrg   for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
   3675  1.1  mrg     fprintf (dump_file, " %d", block->index);
   3676  1.1  mrg   fprintf (dump_file, "\n");
   3677  1.1  mrg   if (par->inner)
   3678  1.1  mrg     nvptx_dump_pars (par->inner, depth + 1);
   3679  1.1  mrg 
   3680  1.1  mrg   if (par->next)
   3681  1.1  mrg     nvptx_dump_pars (par->next, depth);
   3682  1.1  mrg }
   3683  1.1  mrg 
   3684  1.1  mrg /* If BLOCK contains a fork/join marker, process it to create or
   3685  1.1  mrg    terminate a loop structure.  Add this block to the current loop,
   3686  1.1  mrg    and then walk successor blocks.   */
   3687  1.1  mrg /* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_find_par'.  */
   3688  1.1  mrg 
static parallel *
nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
{
  /* Each block is visited at most once; BB_VISITED was cleared by
     nvptx_split_blocks.  */
  if (block->flags & BB_VISITED)
    return par;
  block->flags |= BB_VISITED;

  if (rtx_insn **endp = map->get (block))
    {
      rtx_insn *end = *endp;

      /* This is a block head or tail, or return instruction.  */
      switch (recog_memoized (end))
	{
	case CODE_FOR_return:
	  /* Return instructions are in their own block, and we
	     don't need to do anything more.  */
	  return par;

	case CODE_FOR_nvptx_forked:
	  /* Loop head, create a new inner loop and add it into
	     our parent's child list.  */
	  {
	    /* The partitioning mask is the first element of the
	       unspec vector.  */
	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));

	    gcc_assert (mask);
	    par = new parallel (par, mask);
	    par->forked_block = block;
	    par->forked_insn = end;
	    if (nvptx_needs_shared_bcast (mask))
	      par->fork_insn
		= nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
	  }
	  break;

	case CODE_FOR_nvptx_join:
	  /* A loop tail.  Finish the current loop and return to
	     parent.  */
	  {
	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));

	    /* The join must match the parallel we are closing.  */
	    gcc_assert (par->mask == mask);
	    gcc_assert (par->join_block == NULL);
	    par->join_block = block;
	    par->join_insn = end;
	    if (nvptx_needs_shared_bcast (mask))
	      par->joining_insn
		= nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
	    par = par->parent;
	  }
	  break;

	default:
	  gcc_unreachable ();
	}
    }

  if (par)
    /* Add this block onto the current loop's list of blocks.  */
    par->blocks.safe_push (block);
  else
    /* This must be the entry block.  Create a NULL parallel.  */
    par = new parallel (0, 0);

  /* Walk successor blocks.  */
  edge e;
  edge_iterator ei;

  FOR_EACH_EDGE (e, ei, block->succs)
    nvptx_find_par (map, par, e->dest);

  return par;
}
   3762  1.1  mrg 
   3763  1.1  mrg /* DFS walk the CFG looking for fork & join markers.  Construct
   3764  1.1  mrg    loop structures as we go.  MAP is a mapping of basic blocks
   3765  1.1  mrg    to head & tail markers, discovered when splitting blocks.  This
   3766  1.1  mrg    speeds up the discovery.  We rely on the BB visited flag having
   3767  1.1  mrg    been cleared when splitting blocks.  */
   3768  1.1  mrg /* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_discover_pars'.  */
   3769  1.1  mrg 
   3770  1.1  mrg static parallel *
   3771  1.1  mrg nvptx_discover_pars (bb_insn_map_t *map)
   3772  1.1  mrg {
   3773  1.1  mrg   basic_block block;
   3774  1.1  mrg 
   3775  1.1  mrg   /* Mark exit blocks as visited.  */
   3776  1.1  mrg   block = EXIT_BLOCK_PTR_FOR_FN (cfun);
   3777  1.1  mrg   block->flags |= BB_VISITED;
   3778  1.1  mrg 
   3779  1.1  mrg   /* And entry block as not.  */
   3780  1.1  mrg   block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
   3781  1.1  mrg   block->flags &= ~BB_VISITED;
   3782  1.1  mrg 
   3783  1.1  mrg   parallel *par = nvptx_find_par (map, 0, block);
   3784  1.1  mrg 
   3785  1.1  mrg   if (dump_file)
   3786  1.1  mrg     {
   3787  1.1  mrg       fprintf (dump_file, "\nLoops\n");
   3788  1.1  mrg       nvptx_dump_pars (par, 0);
   3789  1.1  mrg       fprintf (dump_file, "\n");
   3790  1.1  mrg     }
   3791  1.1  mrg 
   3792  1.1  mrg   return par;
   3793  1.1  mrg }
   3794  1.1  mrg 
   3795  1.1  mrg /* Analyse a group of BBs within a partitioned region and create N
   3796  1.1  mrg    Single-Entry-Single-Exit regions.  Some of those regions will be
   3797  1.1  mrg    trivial ones consisting of a single BB.  The blocks of a
   3798  1.1  mrg    partitioned region might form a set of disjoint graphs -- because
   the region encloses a differently partitioned sub region.
   3800  1.1  mrg 
   3801  1.1  mrg    We use the linear time algorithm described in 'Finding Regions Fast:
   3802  1.1  mrg    Single Entry Single Exit and control Regions in Linear Time'
   3803  1.1  mrg    Johnson, Pearson & Pingali.  That algorithm deals with complete
   3804  1.1  mrg    CFGs, where a back edge is inserted from END to START, and thus the
   3805  1.1  mrg    problem becomes one of finding equivalent loops.
   3806  1.1  mrg 
   3807  1.1  mrg    In this case we have a partial CFG.  We complete it by redirecting
   3808  1.1  mrg    any incoming edge to the graph to be from an arbitrary external BB,
   and similarly redirecting any outgoing edge to be to that BB.
   3810  1.1  mrg    Thus we end up with a closed graph.
   3811  1.1  mrg 
   3812  1.1  mrg    The algorithm works by building a spanning tree of an undirected
   3813  1.1  mrg    graph and keeping track of back edges from nodes further from the
   3814  1.1  mrg    root in the tree to nodes nearer to the root in the tree.  In the
   3815  1.1  mrg    description below, the root is up and the tree grows downwards.
   3816  1.1  mrg 
   3817  1.1  mrg    We avoid having to deal with degenerate back-edges to the same
   3818  1.1  mrg    block, by splitting each BB into 3 -- one for input edges, one for
   3819  1.1  mrg    the node itself and one for the output edges.  Such back edges are
   3820  1.1  mrg    referred to as 'Brackets'.  Cycle equivalent nodes will have the
   3821  1.1  mrg    same set of brackets.
   3822  1.1  mrg 
   3823  1.1  mrg    Determining bracket equivalency is done by maintaining a list of
   3824  1.1  mrg    brackets in such a manner that the list length and final bracket
   3825  1.1  mrg    uniquely identify the set.
   3826  1.1  mrg 
   3827  1.1  mrg    We use coloring to mark all BBs with cycle equivalency with the
   3828  1.1  mrg    same color.  This is the output of the 'Finding Regions Fast'
   3829  1.1  mrg    algorithm.  Notice it doesn't actually find the set of nodes within
   a particular region, just unordered sets of nodes that are the
   3831  1.1  mrg    entries and exits of SESE regions.
   3832  1.1  mrg 
   3833  1.1  mrg    After determining cycle equivalency, we need to find the minimal
   3834  1.1  mrg    set of SESE regions.  Do this with a DFS coloring walk of the
   3835  1.1  mrg    complete graph.  We're either 'looking' or 'coloring'.  When
   3836  1.1  mrg    looking, and we're in the subgraph, we start coloring the color of
   3837  1.1  mrg    the current node, and remember that node as the start of the
   3838  1.1  mrg    current color's SESE region.  Every time we go to a new node, we
   decrement the count of nodes with that color.  If it reaches zero,
   3840  1.1  mrg    we remember that node as the end of the current color's SESE region
   3841  1.1  mrg    and return to 'looking'.  Otherwise we color the node the current
   3842  1.1  mrg    color.
   3843  1.1  mrg 
   3844  1.1  mrg    This way we end up with coloring the inside of non-trivial SESE
   3845  1.1  mrg    regions with the color of that region.  */
   3846  1.1  mrg 
/* A pair of BBs.  We use this to represent SESE regions.  */
typedef std::pair<basic_block, basic_block> bb_pair_t;
typedef auto_vec<bb_pair_t> bb_pair_vec_t;

/* A node in the undirected CFG.  The discriminator SECOND indicates just
   above or just below the BB indicated by FIRST.  */
typedef std::pair<basic_block, int> pseudo_node_t;
   3854  1.1  mrg 
/* A bracket indicates an edge towards the root of the spanning tree of the
   undirected graph.  Each bracket has a color, determined
   from the current set of brackets.  */
struct bracket
{
  pseudo_node_t back; /* Back target */

  /* Current color and size of set.  Both start as ~0u, meaning no
     color has been assigned yet.  */
  unsigned color;
  unsigned size;

  bracket (pseudo_node_t back_)
  : back (back_), color (~0u), size (~0u)
  {
  }

  /* Return this bracket's color for a bracket list of LENGTH
     elements.  A fresh color (a new zero-count slot in COLOR_COUNTS)
     is allocated whenever the recorded set size differs from LENGTH;
     either way the count of nodes using the returned color is
     incremented.  */
  unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
  {
    if (length != size)
      {
	size = length;
	color = color_counts.length ();
	color_counts.quick_push (0);
      }
    color_counts[color]++;
    return color;
  }
};
   3883  1.1  mrg 
   3884  1.1  mrg typedef auto_vec<bracket> bracket_vec_t;
   3885  1.1  mrg 
   3886  1.1  mrg /* Basic block info for finding SESE regions.    */
   3887  1.1  mrg 
struct bb_sese
{
  int node;  /* Node number in spanning tree.  */
  int parent; /* Parent node number.  */

  /* The algorithm splits each node A into Ai, A', Ao. The incoming
     edges arrive at pseudo-node Ai and the outgoing edges leave at
     pseudo-node Ao.  We have to remember which way we arrived at a
     particular node when generating the spanning tree.  dir > 0 means
     we arrived at Ai, dir < 0 means we arrived at Ao.  */
  int dir;

  /* Lowest numbered pseudo-node reached via a backedge from this
     node, or any descendant.  */
  pseudo_node_t high;

  int color;  /* Cycle-equivalence color.  */

  /* Stack of brackets for this node.  */
  bracket_vec_t brackets;

  bb_sese (unsigned node_, unsigned p, int dir_)
  :node (node_), parent (p), dir (dir_)
  {
  }
  ~bb_sese ();

  /* Push a bracket ending at BACK.  */
  void push (const pseudo_node_t &back)
  {
    if (dump_file)
      fprintf (dump_file, "Pushing backedge %d:%+d\n",
	       back.first ? back.first->index : 0, back.second);
    brackets.safe_push (bracket (back));
  }

  /* Destructively take over CHILD's brackets.  */
  void append (bb_sese *child);
  /* Drop all brackets terminating at the given pseudo node.  */
  void remove (const pseudo_node_t &);

  /* Set node's color from the current bracket set: the topmost
     bracket plus the list length uniquely identify the set.  */
  void set_color (auto_vec<unsigned> &color_counts)
  {
    color = brackets.last ().get_color (color_counts, brackets.length ());
  }
};
   3933  1.1  mrg 
bb_sese::~bb_sese ()
{
  /* Nothing explicit to do; the brackets auto_vec releases its own
     storage.  */
}
   3937  1.1  mrg 
   3938  1.1  mrg /* Destructively append CHILD's brackets.  */
   3939  1.1  mrg 
   3940  1.1  mrg void
   3941  1.1  mrg bb_sese::append (bb_sese *child)
   3942  1.1  mrg {
   3943  1.1  mrg   if (int len = child->brackets.length ())
   3944  1.1  mrg     {
   3945  1.1  mrg       int ix;
   3946  1.1  mrg 
   3947  1.1  mrg       if (dump_file)
   3948  1.1  mrg 	{
   3949  1.1  mrg 	  for (ix = 0; ix < len; ix++)
   3950  1.1  mrg 	    {
   3951  1.1  mrg 	      const pseudo_node_t &pseudo = child->brackets[ix].back;
   3952  1.1  mrg 	      fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
   3953  1.1  mrg 		       child->node, pseudo.first ? pseudo.first->index : 0,
   3954  1.1  mrg 		       pseudo.second);
   3955  1.1  mrg 	    }
   3956  1.1  mrg 	}
   3957  1.1  mrg       if (!brackets.length ())
   3958  1.1  mrg 	std::swap (brackets, child->brackets);
   3959  1.1  mrg       else
   3960  1.1  mrg 	{
   3961  1.1  mrg 	  brackets.reserve (len);
   3962  1.1  mrg 	  for (ix = 0; ix < len; ix++)
   3963  1.1  mrg 	    brackets.quick_push (child->brackets[ix]);
   3964  1.1  mrg 	}
   3965  1.1  mrg     }
   3966  1.1  mrg }
   3967  1.1  mrg 
   3968  1.1  mrg /* Remove brackets that terminate at PSEUDO.  */
   3969  1.1  mrg 
void
bb_sese::remove (const pseudo_node_t &pseudo)
{
  unsigned removed = 0;
  int len = brackets.length ();

  /* Single pass: shift each surviving bracket down over the removed
     ones, preserving relative order.  */
  for (int ix = 0; ix < len; ix++)
    {
      if (brackets[ix].back == pseudo)
	{
	  if (dump_file)
	    fprintf (dump_file, "Removing backedge %d:%+d\n",
		     pseudo.first ? pseudo.first->index : 0, pseudo.second);
	  removed++;
	}
      else if (removed)
	brackets[ix-removed] = brackets[ix];
    }
  /* Trim the now-unused tail entries.  */
  while (removed--)
    brackets.pop ();
}
   3991  1.1  mrg 
/* Accessors for BB's aux pointer, which holds the block's bb_sese
   data (NULL when the block is not part of the SESE walk).  */
#define BB_SET_SESE(B, S) ((B)->aux = (S))
#define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
   3995  1.1  mrg 
   3996  1.1  mrg /* DFS walk creating SESE data structures.  Only cover nodes with
   3997  1.1  mrg    BB_VISITED set.  Append discovered blocks to LIST.  We number in
   3998  1.1  mrg    increments of 3 so that the above and below pseudo nodes can be
   3999  1.1  mrg    implicitly numbered too.  */
   4000  1.1  mrg 
static int
nvptx_sese_number (int n, int p, int dir, basic_block b,
		   auto_vec<basic_block> *list)
{
  /* Already numbered -- nothing more to do.  */
  if (BB_GET_SESE (b))
    return n;

  if (dump_file)
    fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
	     b->index, n, p, dir);

  BB_SET_SESE (b, new bb_sese (n, p, dir));
  p = n;

  /* Step by 3 so the implicit above/below pseudo nodes get numbers
     too (see the comment ahead of this function).  */
  n += 3;
  list->quick_push (b);

  /* First walk the nodes on the 'other side' of this node, then walk
     the nodes on the same side.  */
  for (unsigned ix = 2; ix; ix--)
    {
      /* Pick succs/preds and the matching edge field so one loop body
	 handles both directions generically.  */
      vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
      size_t offset = (dir > 0 ? offsetof (edge_def, dest)
		       : offsetof (edge_def, src));
      edge e;
      edge_iterator ei;

      FOR_EACH_EDGE (e, ei, edges)
	{
	  basic_block target = *(basic_block *)((char *)e + offset);

	  /* Only number blocks that are part of the region of
	     interest (marked BB_VISITED by the caller).  */
	  if (target->flags & BB_VISITED)
	    n = nvptx_sese_number (n, p, dir, target, list);
	}
      dir = -dir;
    }
  return n;
}
   4039  1.1  mrg 
   4040  1.1  mrg /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
   4041  1.1  mrg    EDGES are the outgoing edges and OFFSET is the offset to the src
   4042  1.1  mrg    or dst block on the edges.   */
   4043  1.1  mrg 
static void
nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
		   vec<edge, va_gc> *edges, size_t offset)
{
  edge e;
  edge_iterator ei;
  int hi_back = depth;
  pseudo_node_t node_back (nullptr, depth);
  int hi_child = depth;
  pseudo_node_t node_child (nullptr, depth);
  basic_block child = NULL;
  unsigned num_children = 0;
  /* Orientation factor combining our arrival direction with the side
     being processed; used below to recognize child nodes.  */
  int usd = -dir * sese->dir;

  if (dump_file)
    fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
	     me->index, sese->node, dir);

  if (dir < 0)
    {
      /* This is the above pseudo-child.  It has the BB itself as an
	 additional child node.  */
      node_child = sese->high;
      hi_child = node_child.second;
      if (node_child.first)
	hi_child += BB_GET_SESE (node_child.first)->node;
      num_children++;
    }

  /* Examine each edge.
     - if it is a child (a) append its bracket list and (b) record
          whether it is the child with the highest reaching bracket.
     - if it is an edge to ancestor, record whether it's the highest
          reaching backlink.  */
  FOR_EACH_EDGE (e, ei, edges)
    {
      basic_block target = *(basic_block *)((char *)e + offset);

      if (bb_sese *t_sese = BB_GET_SESE (target))
	{
	  if (t_sese->parent == sese->node && !(t_sese->dir + usd))
	    {
	      /* Child node.  Append its bracket list.  */
	      num_children++;
	      sese->append (t_sese);

	      /* Compare its hi value.  */
	      int t_hi = t_sese->high.second;

	      if (basic_block child_hi_block = t_sese->high.first)
		t_hi += BB_GET_SESE (child_hi_block)->node;

	      if (hi_child > t_hi)
		{
		  hi_child = t_hi;
		  node_child = t_sese->high;
		  child = target;
		}
	    }
	  else if (t_sese->node < sese->node + dir
		   && !(dir < 0 && sese->parent == t_sese->node))
	    {
	      /* Non-parental ancestor node -- a backlink.  */
	      int d = usd * t_sese->dir;
	      int back = t_sese->node + d;

	      if (hi_back > back)
		{
		  hi_back = back;
		  node_back = pseudo_node_t (target, d);
		}
	    }
	}
      else
	{ /* Fallen off graph, backlink to entry node.  */
	  hi_back = 0;
	  node_back = pseudo_node_t (nullptr, 0);
	}
    }

  /* Remove any brackets that terminate at this pseudo node.  */
  sese->remove (pseudo_node_t (me, dir));

  /* Now push any backlinks from this pseudo node.  */
  FOR_EACH_EDGE (e, ei, edges)
    {
      basic_block target = *(basic_block *)((char *)e + offset);
      if (bb_sese *t_sese = BB_GET_SESE (target))
	{
	  if (t_sese->node < sese->node + dir
	      && !(dir < 0 && sese->parent == t_sese->node))
	    /* Non-parental ancestor node -- backedge from me.  */
	    sese->push (pseudo_node_t (target, usd * t_sese->dir));
	}
      else
	{
	  /* Back edge to entry node.  */
	  sese->push (pseudo_node_t (nullptr, 0));
	}
    }

  /* If this node leads directly or indirectly to a no-return region of
     the graph, then fake a backedge to entry node.  */
  if (!sese->brackets.length () || !edges || !edges->length ())
    {
      hi_back = 0;
      node_back = pseudo_node_t (nullptr, 0);
      sese->push (node_back);
    }

  /* Record the highest reaching backedge from us or a descendant.  */
  sese->high = hi_back < hi_child ? node_back : node_child;

  if (num_children > 1)
    {
      /* There is more than one child -- this is a Y shaped piece of
	 spanning tree.  We have to insert a fake backedge from this
	 node to the highest ancestor reached by not-the-highest
	 reaching child.  Note that there may be multiple children
	 with backedges to the same highest node.  That's ok and we
	 insert the edge to that highest node.  */
      hi_child = depth;
      if (dir < 0 && child)
	{
	  node_child = sese->high;
	  hi_child = node_child.second;
	  if (node_child.first)
	    hi_child += BB_GET_SESE (node_child.first)->node;
	}

      FOR_EACH_EDGE (e, ei, edges)
	{
	  basic_block target = *(basic_block *)((char *)e + offset);

	  if (target == child)
	    /* Ignore the highest child.  */
	    continue;

	  bb_sese *t_sese = BB_GET_SESE (target);
	  if (!t_sese)
	    continue;
	  if (t_sese->parent != sese->node)
	    /* Not a child.  */
	    continue;

	  /* Compare its hi value.  */
	  int t_hi = t_sese->high.second;

	  if (basic_block child_hi_block = t_sese->high.first)
	    t_hi += BB_GET_SESE (child_hi_block)->node;

	  if (hi_child > t_hi)
	    {
	      hi_child = t_hi;
	      node_child = t_sese->high;
	    }
	}

      sese->push (node_child);
    }
}
   4205  1.1  mrg 
   4206  1.1  mrg 
/* DFS walk of BB graph.  Color node BLOCK according to COLORING then
   proceed to successors.  Set SESE entry and exit nodes of
   REGIONS.

   COLOR_COUNTS holds the number of not-yet-visited blocks of each
   color; REGIONS[color] receives the (entry, exit) block pair for
   that color's region.  COLORING is the color currently being
   painted, or negative when we are merely looking for the next
   region entry.  */

static void
nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
		  basic_block block, int coloring)
{
  bb_sese *sese = BB_GET_SESE (block);

  if (block->flags & BB_VISITED)
    {
      /* If we've already encountered this block, either we must not
	 be coloring, or it must have been colored the current color.  */
      gcc_assert (coloring < 0 || (sese && coloring == sese->color));
      return;
    }

  block->flags |= BB_VISITED;

  if (sese)
    {
      if (coloring < 0)
	{
	  /* Start coloring a region.  This block is the region's
	     entry node.  */
	  regions[sese->color].first = block;
	  coloring = sese->color;
	}

      /* Decrement the outstanding count for this block's color; when
	 it hits zero and matches the active coloring, this is the
	 region's exit node.  */
      if (!--color_counts[sese->color] && sese->color == coloring)
	{
	  /* Found final block of SESE region.  */
	  regions[sese->color].second = block;
	  coloring = -1;
	}
      else
	/* Color the node, so we can assert on revisiting the node
	   that the graph is indeed SESE.  */
	sese->color = coloring;
    }
  else
    /* Fallen off the subgraph, we cannot be coloring.  */
    gcc_assert (coloring < 0);

  /* Walk each successor block.  */
  if (block->succs && block->succs->length ())
    {
      edge e;
      edge_iterator ei;

      FOR_EACH_EDGE (e, ei, block->succs)
	nvptx_sese_color (color_counts, regions, e->dest, coloring);
    }
  else
    /* A dead end (no successors) must not occur mid-region.  */
    gcc_assert (coloring < 0);
}
   4263  1.1  mrg 
/* Find minimal set of SESE regions covering BLOCKS.  REGIONS might
   end up with NULL entries in it.

   The algorithm: (1) number the blocks of each connected subgraph by
   a DFS spanning tree, (2) walk that tree in reverse order computing
   cycle-equivalence classes ("colors") via pseudo back edges, then
   (3) recolor the whole-function CFG with a DFS to locate the entry
   and exit block of each color's region.  Allocated bb_sese state is
   freed before returning.  */

static void
nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
{
  basic_block block;
  int ix;

  /* First clear each BB of the whole function.  */
  FOR_ALL_BB_FN (block, cfun)
    {
      block->flags &= ~BB_VISITED;
      BB_SET_SESE (block, 0);
    }

  /* Mark blocks in the function that are in this graph.  */
  for (ix = 0; blocks.iterate (ix, &block); ix++)
    block->flags |= BB_VISITED;

  /* Counts of nodes assigned to each color.  There cannot be more
     colors than blocks (and hopefully there will be fewer).  */
  auto_vec<unsigned> color_counts;
  color_counts.reserve (blocks.length ());

  /* Worklist of nodes in the spanning tree.  Again, there cannot be
     more nodes in the tree than blocks (there will be fewer if the
     CFG of blocks is disjoint).  */
  auto_vec<basic_block> spanlist;
  spanlist.reserve (blocks.length ());

  /* Make sure every block has its cycle class determined.  BLOCKS
     may span several disjoint subgraphs; each unvisited block seeds
     a new solve.  */
  for (ix = 0; blocks.iterate (ix, &block); ix++)
    {
      if (BB_GET_SESE (block))
	/* We already met this block in an earlier graph solve.  */
	continue;

      if (dump_file)
	fprintf (dump_file, "Searching graph starting at %d\n", block->index);

      /* Number the nodes reachable from block initial DFS order.  */
      int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);

      /* Now walk in reverse DFS order to find cycle equivalents.  */
      while (spanlist.length ())
	{
	  block = spanlist.pop ();
	  bb_sese *sese = BB_GET_SESE (block);

	  /* Do the pseudo node below.  Edge direction depends on
	     whether the spanning tree entered this block forwards or
	     backwards.  */
	  nvptx_sese_pseudo (block, sese, depth, +1,
			     sese->dir > 0 ? block->succs : block->preds,
			     (sese->dir > 0 ? offsetof (edge_def, dest)
			      : offsetof (edge_def, src)));
	  sese->set_color (color_counts);
	  /* Do the pseudo node above.  */
	  nvptx_sese_pseudo (block, sese, depth, -1,
			     sese->dir < 0 ? block->succs : block->preds,
			     (sese->dir < 0 ? offsetof (edge_def, dest)
			      : offsetof (edge_def, src)));
	}
      if (dump_file)
	fprintf (dump_file, "\n");
    }

  if (dump_file)
    {
      unsigned count;
      const char *comma = "";

      fprintf (dump_file, "Found %d cycle equivalents\n",
	       color_counts.length ());
      for (ix = 0; color_counts.iterate (ix, &count); ix++)
	{
	  fprintf (dump_file, "%s%d[%d]={", comma, ix, count);

	  comma = "";
	  for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
	    if (BB_GET_SESE (block)->color == ix)
	      {
		block->flags |= BB_VISITED;
		fprintf (dump_file, "%s%d", comma, block->index);
		comma=",";
	      }
	  fprintf (dump_file, "}");
	  comma = ", ";
	}
      fprintf (dump_file, "\n");
   }

  /* Now we've colored every block in the subgraph.  We now need to
     determine the minimal set of SESE regions that cover that
     subgraph.  Do this with a DFS walk of the complete function.
     During the walk we're either 'looking' or 'coloring'.  When we
     reach the last node of a particular color, we stop coloring and
     return to looking.  */

  /* There cannot be more SESE regions than colors.  */
  regions.reserve (color_counts.length ());
  for (ix = color_counts.length (); ix--;)
    regions.quick_push (bb_pair_t (0, 0));

  /* Clear BB_VISITED again so nvptx_sese_color can use it as its own
     walked-marker (the dump loop above may also have set it).  */
  for (ix = 0; blocks.iterate (ix, &block); ix++)
    block->flags &= ~BB_VISITED;

  nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);

  if (dump_file)
    {
      const char *comma = "";
      int len = regions.length ();

      fprintf (dump_file, "SESE regions:");
      for (ix = 0; ix != len; ix++)
	{
	  basic_block from = regions[ix].first;
	  basic_block to = regions[ix].second;

	  if (from)
	    {
	      fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
	      if (to != from)
		fprintf (dump_file, "->%d", to->index);

	      int color = BB_GET_SESE (from)->color;

	      /* Print the blocks within the region (excluding ends).  */
	      FOR_EACH_BB_FN (block, cfun)
		{
		  bb_sese *sese = BB_GET_SESE (block);

		  if (sese && sese->color == color
		      && block != from && block != to)
		    fprintf (dump_file, ".%d", block->index);
		}
	      fprintf (dump_file, "}");
	    }
	  comma = ",";
	}
      fprintf (dump_file, "\n\n");
    }

  /* Release the per-block SESE state allocated during numbering.  */
  for (ix = 0; blocks.iterate (ix, &block); ix++)
    delete BB_GET_SESE (block);
}
   4410  1.1  mrg 
   4411  1.1  mrg #undef BB_SET_SESE
   4412  1.1  mrg #undef BB_GET_SESE
   4413  1.1  mrg 
   4414  1.1  mrg /* Propagate live state at the start of a partitioned region.  IS_CALL
   4415  1.1  mrg    indicates whether the propagation is for a (partitioned) call
   4416  1.1  mrg    instruction.  BLOCK provides the live register information, and
   4417  1.1  mrg    might not contain INSN. Propagation is inserted just after INSN. RW
   4418  1.1  mrg    indicates whether we are reading and/or writing state.  This
   separation is needed for worker-level propagation where we
   4420  1.1  mrg    essentially do a spill & fill.  FN is the underlying worker
   4421  1.1  mrg    function to generate the propagation instructions for single
   4422  1.1  mrg    register.  DATA is user data.
   4423  1.1  mrg 
   4424  1.1  mrg    Returns true if we didn't emit any instructions.
   4425  1.1  mrg 
   4426  1.1  mrg    We propagate the live register set for non-calls and the entire
   4427  1.1  mrg    frame for calls and non-calls.  We could do better by (a)
   4428  1.1  mrg    propagating just the live set that is used within the partitioned
   4429  1.1  mrg    regions and (b) only propagating stack entries that are used.  The
   4430  1.1  mrg    latter might be quite hard to determine.  */
   4431  1.1  mrg 
   4432  1.1  mrg typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool);
   4433  1.1  mrg 
static bool
nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
		 propagate_mask rw, propagator_fn fn, void *data, bool vector)
{
  /* Registers live on entry to BLOCK are the ones that need
     propagating.  */
  bitmap live = DF_LIVE_IN (block);
  bitmap_iterator iterator;
  unsigned ix;
  bool empty = true;

  /* Copy the frame array.  */
  HOST_WIDE_INT fs = get_frame_size ();
  if (fs)
    {
      rtx tmp = gen_reg_rtx (DImode);
      rtx idx = NULL_RTX;
      rtx ptr = gen_reg_rtx (Pmode);
      rtx pred = NULL_RTX;
      rtx_code_label *label = NULL;

      empty = false;
      /* The frame size might not be DImode compatible, but the frame
	 array's declaration will be.  So it's ok to round up here.  */
      fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
      /* Detect single iteration loop.  FS == 0 below means "no loop
	 needed, propagate the single DImode chunk straight-line".  */
      if (fs == 1)
	fs = 0;

      /* Build the copy sequence in a detached insn list, then insert
	 it after INSN in one go.  */
      start_sequence ();
      emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
      if (fs)
	{
	  /* Loop over the frame, one DImode chunk per iteration,
	     counting IDX down from FS to zero.  */
	  idx = gen_reg_rtx (SImode);
	  pred = gen_reg_rtx (BImode);
	  label = gen_label_rtx ();

	  emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
	  /* Allow worker function to initialize anything needed.  */
	  rtx init = fn (tmp, PM_loop_begin, fs, data, vector);
	  if (init)
	    emit_insn (init);
	  emit_label (label);
	  LABEL_NUSES (label)++;
	  emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
	}
      /* Loop body: optionally read the chunk, let FN propagate it,
	 optionally write it back (RW selects spill vs fill).  */
      if (rw & PM_read)
	emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
      emit_insn (fn (tmp, rw, fs, data, vector));
      if (rw & PM_write)
	emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
      if (fs)
	{
	  /* Advance PTR and branch back while IDX is non-zero.  */
	  emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
	  emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
	  emit_insn (gen_br_true_uni (pred, label));
	  rtx fini = fn (tmp, PM_loop_end, fs, data, vector);
	  if (fini)
	    emit_insn (fini);
	  emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
	}
      /* Clobbers mark the helper registers dead for later passes.  */
      emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
      emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
      rtx cpy = get_insns ();
      end_sequence ();
      insn = emit_insn_after (cpy, insn);
    }

  if (!is_call)
    /* Copy live registers.  Hard registers are skipped; only pseudos
       are propagated.  */
    EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
      {
	rtx reg = regno_reg_rtx[ix];

	if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
	  {
	    rtx bcast = fn (reg, rw, 0, data, vector);

	    insn = emit_insn_after (bcast, insn);
	    empty = false;
	  }
      }
  return empty;
}
   4516  1.1  mrg 
   4517  1.1  mrg /* Worker for nvptx_warp_propagate.  */
   4518  1.1  mrg 
   4519  1.1  mrg static rtx
   4520  1.1  mrg warp_prop_gen (rtx reg, propagate_mask pm,
   4521  1.1  mrg 	       unsigned ARG_UNUSED (count), void *ARG_UNUSED (data),
   4522  1.1  mrg 	       bool ARG_UNUSED (vector))
   4523  1.1  mrg {
   4524  1.1  mrg   if (!(pm & PM_read_write))
   4525  1.1  mrg     return 0;
   4526  1.1  mrg 
   4527  1.1  mrg   return nvptx_gen_warp_bcast (reg);
   4528  1.1  mrg }
   4529  1.1  mrg 
   4530  1.1  mrg /* Propagate state that is live at start of BLOCK across the vectors
   4531  1.1  mrg    of a single warp.  Propagation is inserted just after INSN.
   4532  1.1  mrg    IS_CALL and return as for nvptx_propagate.  */
   4533  1.1  mrg 
   4534  1.1  mrg static bool
   4535  1.1  mrg nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn)
   4536  1.1  mrg {
   4537  1.1  mrg   return nvptx_propagate (is_call, block, insn, PM_read_write,
   4538  1.1  mrg 			  warp_prop_gen, 0, false);
   4539  1.1  mrg }
   4540  1.1  mrg 
/* Worker for nvptx_shared_propagate.  REG is the register to
   propagate, PM selects the phase, REP is the loop repeat count and
   DATA_ points at the broadcast_data_t tracking the shared-memory
   staging buffer.  Returns the insn (sequence) to emit, or NULL.  */

static rtx
shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_,
		 bool vector)
{
  broadcast_data_t *data = (broadcast_data_t *)data_;

  if (pm & PM_loop_begin)
    {
      /* Starting a loop, initialize pointer.    */
      unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;

      /* Keep the shared buffer aligned for the widest mode seen, and
	 align our offset within it correspondingly.  */
      oacc_bcast_align = MAX (oacc_bcast_align, align);
      data->offset = ROUND_UP (data->offset, align);

      data->ptr = gen_reg_rtx (Pmode);

      return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
    }
  else if (pm & PM_loop_end)
    {
      /* Loop done; clobber the pointer so later passes know it is
	 dead, and forget it.  */
      rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
      data->ptr = NULL_RTX;
      return clobber;
    }
  else
    /* Ordinary propagation step: spill/fill REG via shared memory.  */
    return nvptx_gen_shared_bcast (reg, pm, rep, data, vector);
}
   4570  1.1  mrg 
/* Spill or fill live state that is live at start of BLOCK.  PRE_P
   indicates if this is just before partitioned mode (do spill), or
   just after it starts (do fill). Sequence is inserted just after
   INSN.  IS_CALL and return as for nvptx_propagate.  VECTOR selects
   vector-level (as opposed to worker-level) propagation.  */

static bool
nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
			rtx_insn *insn, bool vector)
{
  broadcast_data_t data;

  data.base = gen_reg_rtx (Pmode);
  data.offset = 0;
  data.ptr = NULL_RTX;

  /* PM_read spills into the shared buffer, PM_write fills from it.
     DATA.offset accumulates the buffer space used.  */
  bool empty = nvptx_propagate (is_call, block, insn,
				pre_p ? PM_read : PM_write, shared_prop_gen,
				&data, vector);
  gcc_assert (empty == !data.offset);
  if (data.offset)
    {
      rtx bcast_sym = oacc_bcast_sym;

      /* Stuff was emitted, initialize the base pointer now.  */
      if (vector && nvptx_mach_max_workers () > 1)
	{
	  /* Vector-level broadcast with multiple workers needs a
	     per-partition buffer and a sync barrier, created lazily
	     once per function.  */
	  if (!cfun->machine->bcast_partition)
	    {
	      /* It would be nice to place this register in
		 DATA_AREA_SHARED.  */
	      cfun->machine->bcast_partition = gen_reg_rtx (DImode);
	    }
	  if (!cfun->machine->sync_bar)
	    cfun->machine->sync_bar = gen_reg_rtx (SImode);

	  bcast_sym = cfun->machine->bcast_partition;
	}

      rtx init = gen_rtx_SET (data.base, bcast_sym);
      emit_insn_after (init, insn);

      /* Grow the global broadcast buffer bookkeeping: one partition
	 per worker (plus one) when vector length exceeds the warp
	 size, otherwise a single partition.  */
      unsigned int psize = ROUND_UP (data.offset, oacc_bcast_align);
      unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
			   ? nvptx_mach_max_workers () + 1
			   : 1);

      oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
      oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
    }
  return empty;
}
   4622  1.1  mrg 
   4623  1.1  mrg /* Emit a CTA-level synchronization barrier.  LOCK is the barrier number,
   4624  1.1  mrg    which is an integer or a register.  THREADS is the number of threads
   4625  1.1  mrg    controlled by the barrier.  */
   4626  1.1  mrg 
   4627  1.1  mrg static rtx
   4628  1.1  mrg nvptx_cta_sync (rtx lock, int threads)
   4629  1.1  mrg {
   4630  1.1  mrg   return gen_nvptx_barsync (lock, GEN_INT (threads));
   4631  1.1  mrg }
   4632  1.1  mrg 
   4633  1.1  mrg #if WORKAROUND_PTXJIT_BUG
   4634  1.1  mrg /* Return first real insn in BB, or return NULL_RTX if BB does not contain
   4635  1.1  mrg    real insns.  */
   4636  1.1  mrg 
   4637  1.1  mrg static rtx_insn *
   4638  1.1  mrg bb_first_real_insn (basic_block bb)
   4639  1.1  mrg {
   4640  1.1  mrg   rtx_insn *insn;
   4641  1.1  mrg 
   4642  1.1  mrg   /* Find first insn of from block.  */
   4643  1.1  mrg   FOR_BB_INSNS (bb, insn)
   4644  1.1  mrg     if (INSN_P (insn))
   4645  1.1  mrg       return insn;
   4646  1.1  mrg 
   4647  1.1  mrg   return 0;
   4648  1.1  mrg }
   4649  1.1  mrg #endif
   4650  1.1  mrg 
   4651  1.1  mrg /* Return true if INSN needs neutering.  */
   4652  1.1  mrg 
   4653  1.1  mrg static bool
   4654  1.1  mrg needs_neutering_p (rtx_insn *insn)
   4655  1.1  mrg {
   4656  1.1  mrg   if (!INSN_P (insn))
   4657  1.1  mrg     return false;
   4658  1.1  mrg 
   4659  1.1  mrg   switch (recog_memoized (insn))
   4660  1.1  mrg     {
   4661  1.1  mrg     case CODE_FOR_nvptx_fork:
   4662  1.1  mrg     case CODE_FOR_nvptx_forked:
   4663  1.1  mrg     case CODE_FOR_nvptx_joining:
   4664  1.1  mrg     case CODE_FOR_nvptx_join:
   4665  1.1  mrg     case CODE_FOR_nvptx_barsync:
   4666  1.1  mrg       return false;
   4667  1.1  mrg     default:
   4668  1.1  mrg       return true;
   4669  1.1  mrg     }
   4670  1.1  mrg }
   4671  1.1  mrg 
/* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM.

   Walks forward from the head of FROM (possibly falling through an
   unconditional jump into a single-succ/single-pred chain) asserting
   that neutering jumps and their landing labels nest correctly and
   that no barrier insn appears inside a neutered range.  Returns true
   if any of the labels was seen on this path.  */

static bool
verify_neutering_jumps (basic_block from,
			rtx_insn *vector_jump, rtx_insn *worker_jump,
			rtx_insn *vector_label, rtx_insn *worker_label)
{
  basic_block bb = from;
  rtx_insn *insn = BB_HEAD (bb);
  bool seen_worker_jump = false;
  bool seen_vector_jump = false;
  bool seen_worker_label = false;
  bool seen_vector_label = false;
  /* *_neutered track whether we are currently between a neutering
     jump and its label.  */
  bool worker_neutered = false;
  bool vector_neutered = false;
  while (true)
    {
      if (insn == worker_jump)
	{
	  seen_worker_jump = true;
	  worker_neutered = true;
	  /* The worker jump must come before the vector jump.  */
	  gcc_assert (!vector_neutered);
	}
      else if (insn == vector_jump)
	{
	  seen_vector_jump = true;
	  vector_neutered = true;
	}
      else if (insn == worker_label)
	{
	  seen_worker_label = true;
	  gcc_assert (worker_neutered);
	  worker_neutered = false;
	}
      else if (insn == vector_label)
	{
	  seen_vector_label = true;
	  gcc_assert (vector_neutered);
	  vector_neutered = false;
	}
      else if (INSN_P (insn))
	switch (recog_memoized (insn))
	  {
	  case CODE_FOR_nvptx_barsync:
	    /* A barrier inside a neutered region would deadlock.  */
	    gcc_assert (!vector_neutered && !worker_neutered);
	    break;
	  default:
	    break;
	  }

      if (insn != BB_END (bb))
	insn = NEXT_INSN (insn);
      else if (JUMP_P (insn) && single_succ_p (bb)
	       && !seen_vector_jump && !seen_worker_jump)
	{
	  /* Haven't found the jumps yet; follow the unconditional
	     fallthrough chain.  */
	  bb = single_succ (bb);
	  insn = BB_HEAD (bb);
	}
      else
	break;
    }

  gcc_assert (!(vector_jump && !seen_vector_jump));
  gcc_assert (!(worker_jump && !seen_worker_jump));

  if (seen_vector_label || seen_worker_label)
    {
      /* If one label was found here, any other expected label must
	 have been found too.  */
      gcc_assert (!(vector_label && !seen_vector_label));
      gcc_assert (!(worker_label && !seen_worker_label));

      return true;
    }

  return false;
}
   4747  1.1  mrg 
/* Verify position of VECTOR_LABEL and WORKER_LABEL in TO.

   Walks backward from the end of TO, asserting that the worker label
   (if present) appears after the vector label and that no barrier
   insn occurs after either label.  */

static void
verify_neutering_labels (basic_block to, rtx_insn *vector_label,
			 rtx_insn *worker_label)
{
  basic_block bb = to;
  rtx_insn *insn = BB_END (bb);
  bool seen_worker_label = false;
  bool seen_vector_label = false;
  while (true)
    {
      if (insn == worker_label)
	{
	  seen_worker_label = true;
	  /* Walking backwards, the worker label must be met before
	     the vector label.  */
	  gcc_assert (!seen_vector_label);
	}
      else if (insn == vector_label)
	seen_vector_label = true;
      else if (INSN_P (insn))
	switch (recog_memoized (insn))
	  {
	  case CODE_FOR_nvptx_barsync:
	    /* No barrier may follow (in forward order) either label.  */
	    gcc_assert (!seen_vector_label && !seen_worker_label);
	    break;
	  }

      if (insn != BB_HEAD (bb))
	insn = PREV_INSN (insn);
      else
	break;
    }

  /* Every expected label must have been found in TO.  */
  gcc_assert (!(vector_label && !seen_vector_label));
  gcc_assert (!(worker_label && !seen_worker_label));
}
   4784  1.1  mrg 
   4785  1.1  mrg /* Single neutering according to MASK.  FROM is the incoming block and
   4786  1.1  mrg    TO is the outgoing block.  These may be the same block. Insert at
   4787  1.1  mrg    start of FROM:
   4788  1.1  mrg 
   4789  1.1  mrg      if (tid.<axis>) goto end.
   4790  1.1  mrg 
   4791  1.1  mrg    and insert before ending branch of TO (if there is such an insn):
   4792  1.1  mrg 
   4793  1.1  mrg      end:
   4794  1.1  mrg      <possibly-broadcast-cond>
   4795  1.1  mrg      <branch>
   4796  1.1  mrg 
   We currently only use different FROM and TO when skipping an entire
   4798  1.1  mrg    loop.  We could do more if we detected superblocks.  */
   4799  1.1  mrg 
   4800  1.1  mrg static void
   4801  1.1  mrg nvptx_single (unsigned mask, basic_block from, basic_block to)
   4802  1.1  mrg {
   4803  1.1  mrg   rtx_insn *head = BB_HEAD (from);
   4804  1.1  mrg   rtx_insn *tail = BB_END (to);
   4805  1.1  mrg   unsigned skip_mask = mask;
   4806  1.1  mrg 
   4807  1.1  mrg   while (true)
   4808  1.1  mrg     {
   4809  1.1  mrg       /* Find first insn of from block.  */
   4810  1.1  mrg       while (head != BB_END (from) && !needs_neutering_p (head))
   4811  1.1  mrg 	head = NEXT_INSN (head);
   4812  1.1  mrg 
   4813  1.1  mrg       if (from == to)
   4814  1.1  mrg 	break;
   4815  1.1  mrg 
   4816  1.1  mrg       if (!(JUMP_P (head) && single_succ_p (from)))
   4817  1.1  mrg 	break;
   4818  1.1  mrg 
   4819  1.1  mrg       basic_block jump_target = single_succ (from);
   4820  1.1  mrg       if (!single_pred_p (jump_target))
   4821  1.1  mrg 	break;
   4822  1.1  mrg 
   4823  1.1  mrg       from = jump_target;
   4824  1.1  mrg       head = BB_HEAD (from);
   4825  1.1  mrg     }
   4826  1.1  mrg 
   4827  1.1  mrg   /* Find last insn of to block */
   4828  1.1  mrg   rtx_insn *limit = from == to ? head : BB_HEAD (to);
   4829  1.1  mrg   while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
   4830  1.1  mrg     tail = PREV_INSN (tail);
   4831  1.1  mrg 
   4832  1.1  mrg   /* Detect if tail is a branch.  */
   4833  1.1  mrg   rtx tail_branch = NULL_RTX;
   4834  1.1  mrg   rtx cond_branch = NULL_RTX;
   4835  1.1  mrg   if (tail && INSN_P (tail))
   4836  1.1  mrg     {
   4837  1.1  mrg       tail_branch = PATTERN (tail);
   4838  1.1  mrg       if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
   4839  1.1  mrg 	tail_branch = NULL_RTX;
   4840  1.1  mrg       else
   4841  1.1  mrg 	{
   4842  1.1  mrg 	  cond_branch = SET_SRC (tail_branch);
   4843  1.1  mrg 	  if (GET_CODE (cond_branch) != IF_THEN_ELSE)
   4844  1.1  mrg 	    cond_branch = NULL_RTX;
   4845  1.1  mrg 	}
   4846  1.1  mrg     }
   4847  1.1  mrg 
   4848  1.1  mrg   if (tail == head)
   4849  1.1  mrg     {
   4850  1.1  mrg       /* If this is empty, do nothing.  */
   4851  1.1  mrg       if (!head || !needs_neutering_p (head))
   4852  1.1  mrg 	return;
   4853  1.1  mrg 
   4854  1.1  mrg       if (cond_branch)
   4855  1.1  mrg 	{
   4856  1.1  mrg 	  /* If we're only doing vector single, there's no need to
   4857  1.1  mrg 	     emit skip code because we'll not insert anything.  */
   4858  1.1  mrg 	  if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
   4859  1.1  mrg 	    skip_mask = 0;
   4860  1.1  mrg 	}
   4861  1.1  mrg       else if (tail_branch)
   4862  1.1  mrg 	/* Block with only unconditional branch.  Nothing to do.  */
   4863  1.1  mrg 	return;
   4864  1.1  mrg     }
   4865  1.1  mrg 
   4866  1.1  mrg   /* Insert the vector test inside the worker test.  */
   4867  1.1  mrg   unsigned mode;
   4868  1.1  mrg   rtx_insn *before = tail;
   4869  1.1  mrg   rtx_insn *neuter_start = NULL;
   4870  1.1  mrg   rtx_insn *worker_label = NULL, *vector_label = NULL;
   4871  1.1  mrg   rtx_insn *worker_jump = NULL, *vector_jump = NULL;
   4872  1.1  mrg   rtx_insn *warp_sync = NULL;
   4873  1.1  mrg   for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
   4874  1.1  mrg     if (GOMP_DIM_MASK (mode) & skip_mask)
   4875  1.1  mrg       {
   4876  1.1  mrg 	rtx_code_label *label = gen_label_rtx ();
   4877  1.1  mrg 	rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
   4878  1.1  mrg 	rtx_insn **mode_jump
   4879  1.1  mrg 	  = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
   4880  1.1  mrg 	rtx_insn **mode_label
   4881  1.1  mrg 	  = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;
   4882  1.1  mrg 
   4883  1.1  mrg 	if (!pred)
   4884  1.1  mrg 	  {
   4885  1.1  mrg 	    pred = gen_reg_rtx (BImode);
   4886  1.1  mrg 	    cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
   4887  1.1  mrg 	  }
   4888  1.1  mrg 
   4889  1.1  mrg 	rtx br;
   4890  1.1  mrg 	if (mode == GOMP_DIM_VECTOR)
   4891  1.1  mrg 	  br = gen_br_true (pred, label);
   4892  1.1  mrg 	else
   4893  1.1  mrg 	  br = gen_br_true_uni (pred, label);
   4894  1.1  mrg 	if (neuter_start)
   4895  1.1  mrg 	  neuter_start = emit_insn_after (br, neuter_start);
   4896  1.1  mrg 	else
   4897  1.1  mrg 	  neuter_start = emit_insn_before (br, head);
   4898  1.1  mrg 	*mode_jump = neuter_start;
   4899  1.1  mrg 
   4900  1.1  mrg 	LABEL_NUSES (label)++;
   4901  1.1  mrg 	rtx_insn *label_insn;
   4902  1.1  mrg 	if (tail_branch)
   4903  1.1  mrg 	  {
   4904  1.1  mrg 	    label_insn = emit_label_before (label, before);
   4905  1.1  mrg 	    if (mode == GOMP_DIM_VECTOR)
   4906  1.1  mrg 	      {
   4907  1.1  mrg 		if (TARGET_PTX_6_0)
   4908  1.1  mrg 		  warp_sync = emit_insn_after (gen_nvptx_warpsync (),
   4909  1.1  mrg 					       label_insn);
   4910  1.1  mrg 		else
   4911  1.1  mrg 		  warp_sync = emit_insn_after (gen_nvptx_uniform_warp_check (),
   4912  1.1  mrg 					       label_insn);
   4913  1.1  mrg 	      }
   4914  1.1  mrg 	    before = label_insn;
   4915  1.1  mrg 	  }
   4916  1.1  mrg 	else
   4917  1.1  mrg 	  {
   4918  1.1  mrg 	    label_insn = emit_label_after (label, tail);
   4919  1.1  mrg 	    if (mode == GOMP_DIM_VECTOR)
   4920  1.1  mrg 	      {
   4921  1.1  mrg 		if (TARGET_PTX_6_0)
   4922  1.1  mrg 		  warp_sync = emit_insn_after (gen_nvptx_warpsync (),
   4923  1.1  mrg 					       label_insn);
   4924  1.1  mrg 		else
   4925  1.1  mrg 		  warp_sync = emit_insn_after (gen_nvptx_uniform_warp_check (),
   4926  1.1  mrg 					       label_insn);
   4927  1.1  mrg 	      }
   4928  1.1  mrg 	    if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
   4929  1.1  mrg 		&& CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
   4930  1.1  mrg 	      emit_insn_after (gen_exit (), label_insn);
   4931  1.1  mrg 	  }
   4932  1.1  mrg 
   4933  1.1  mrg 	*mode_label = label_insn;
   4934  1.1  mrg       }
   4935  1.1  mrg 
   4936  1.1  mrg   /* Now deal with propagating the branch condition.  */
   4937  1.1  mrg   if (cond_branch)
   4938  1.1  mrg     {
   4939  1.1  mrg       rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
   4940  1.1  mrg 
   4941  1.1  mrg       if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask
   4942  1.1  mrg 	  && nvptx_mach_vector_length () == PTX_WARP_SIZE)
   4943  1.1  mrg 	{
   4944  1.1  mrg 	  /* Vector mode only, do a shuffle.  */
   4945  1.1  mrg #if WORKAROUND_PTXJIT_BUG
   4946  1.1  mrg 	  /* The branch condition %rcond is propagated like this:
   4947  1.1  mrg 
   4948  1.1  mrg 		{
   4949  1.1  mrg 		    .reg .u32 %x;
   4950  1.1  mrg 		    mov.u32 %x,%tid.x;
   4951  1.1  mrg 		    setp.ne.u32 %rnotvzero,%x,0;
   4952  1.1  mrg 		 }
   4953  1.1  mrg 
   4954  1.1  mrg 		 @%rnotvzero bra Lskip;
   4955  1.1  mrg 		 setp.<op>.<type> %rcond,op1,op2;
   4956  1.1  mrg 		 Lskip:
   4957  1.1  mrg 		 selp.u32 %rcondu32,1,0,%rcond;
   4958  1.1  mrg 		 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
   4959  1.1  mrg 		 setp.ne.u32 %rcond,%rcondu32,0;
   4960  1.1  mrg 
   4961  1.1  mrg 	     There seems to be a bug in the ptx JIT compiler (observed at driver
   4962  1.1  mrg 	     version 381.22, at -O1 and higher for sm_61), that drops the shfl
   4963  1.1  mrg 	     unless %rcond is initialized to something before 'bra Lskip'.  The
   4964  1.1  mrg 	     bug is not observed with ptxas from cuda 8.0.61.
   4965  1.1  mrg 
   4966  1.1  mrg 	     It is true that the code is non-trivial: at Lskip, %rcond is
   4967  1.1  mrg 	     uninitialized in threads 1-31, and after the selp the same holds
   4968  1.1  mrg 	     for %rcondu32.  But shfl propagates the defined value in thread 0
   4969  1.1  mrg 	     to threads 1-31, so after the shfl %rcondu32 is defined in threads
   4970  1.1  mrg 	     0-31, and after the setp.ne %rcond is defined in threads 0-31.
   4971  1.1  mrg 
   4972  1.1  mrg 	     There is nothing in the PTX spec to suggest that this is wrong, or
   4973  1.1  mrg 	     to explain why the extra initialization is needed.  So, we classify
   4974  1.1  mrg 	     it as a JIT bug, and the extra initialization as workaround:
   4975  1.1  mrg 
   4976  1.1  mrg 		{
   4977  1.1  mrg 		    .reg .u32 %x;
   4978  1.1  mrg 		    mov.u32 %x,%tid.x;
   4979  1.1  mrg 		    setp.ne.u32 %rnotvzero,%x,0;
   4980  1.1  mrg 		}
   4981  1.1  mrg 
   4982  1.1  mrg 		+.reg .pred %rcond2;
   4983  1.1  mrg 		+setp.eq.u32 %rcond2, 1, 0;
   4984  1.1  mrg 
   4985  1.1  mrg 		 @%rnotvzero bra Lskip;
   4986  1.1  mrg 		 setp.<op>.<type> %rcond,op1,op2;
   4987  1.1  mrg 		+mov.pred %rcond2, %rcond;
   4988  1.1  mrg 		 Lskip:
   4989  1.1  mrg 		+mov.pred %rcond, %rcond2;
   4990  1.1  mrg 		 selp.u32 %rcondu32,1,0,%rcond;
   4991  1.1  mrg 		 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
   4992  1.1  mrg 		 setp.ne.u32 %rcond,%rcondu32,0;
   4993  1.1  mrg 	  */
   4994  1.1  mrg 	  rtx_insn *label = PREV_INSN (tail);
   4995  1.1  mrg 	  if (label == warp_sync)
   4996  1.1  mrg 	    label = PREV_INSN (label);
   4997  1.1  mrg 	  gcc_assert (label && LABEL_P (label));
   4998  1.1  mrg 	  rtx tmp = gen_reg_rtx (BImode);
   4999  1.1  mrg 	  emit_insn_before (gen_movbi (tmp, const0_rtx),
   5000  1.1  mrg 			    bb_first_real_insn (from));
   5001  1.1  mrg 	  emit_insn_before (gen_rtx_SET (tmp, pvar), label);
   5002  1.1  mrg 	  emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
   5003  1.1  mrg #endif
   5004  1.1  mrg 	  emit_insn_before (nvptx_gen_warp_bcast (pvar), tail);
   5005  1.1  mrg 	}
   5006  1.1  mrg       else
   5007  1.1  mrg 	{
   5008  1.1  mrg 	  /* Includes worker mode, do spill & fill.  By construction
   5009  1.1  mrg 	     we should never have worker mode only. */
   5010  1.1  mrg 	  broadcast_data_t data;
   5011  1.1  mrg 	  unsigned size = GET_MODE_SIZE (SImode);
   5012  1.1  mrg 	  bool vector = (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) != 0;
   5013  1.1  mrg 	  bool worker = (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask) != 0;
   5014  1.1  mrg 	  rtx barrier = GEN_INT (0);
   5015  1.1  mrg 	  int threads = 0;
   5016  1.1  mrg 
   5017  1.1  mrg 	  data.base = oacc_bcast_sym;
   5018  1.1  mrg 	  data.ptr = 0;
   5019  1.1  mrg 
   5020  1.1  mrg 	  bool use_partitioning_p = (vector && !worker
   5021  1.1  mrg 				     && nvptx_mach_max_workers () > 1
   5022  1.1  mrg 				     && cfun->machine->bcast_partition);
   5023  1.1  mrg 	  if (use_partitioning_p)
   5024  1.1  mrg 	    {
   5025  1.1  mrg 	      data.base = cfun->machine->bcast_partition;
   5026  1.1  mrg 	      barrier = cfun->machine->sync_bar;
   5027  1.1  mrg 	      threads = nvptx_mach_vector_length ();
   5028  1.1  mrg 	    }
   5029  1.1  mrg 	  gcc_assert (data.base != NULL);
   5030  1.1  mrg 	  gcc_assert (barrier);
   5031  1.1  mrg 
   5032  1.1  mrg 	  unsigned int psize = ROUND_UP (size, oacc_bcast_align);
   5033  1.1  mrg 	  unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
   5034  1.1  mrg 			       ? nvptx_mach_max_workers () + 1
   5035  1.1  mrg 			       : 1);
   5036  1.1  mrg 
   5037  1.1  mrg 	  oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
   5038  1.1  mrg 	  oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
   5039  1.1  mrg 
   5040  1.1  mrg 	  data.offset = 0;
   5041  1.1  mrg 	  emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
   5042  1.1  mrg 						    vector),
   5043  1.1  mrg 			    before);
   5044  1.1  mrg 
   5045  1.1  mrg 	  /* Barrier so other workers can see the write.  */
   5046  1.1  mrg 	  emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
   5047  1.1  mrg 	  data.offset = 0;
   5048  1.1  mrg 	  emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
   5049  1.1  mrg 						    vector),
   5050  1.1  mrg 			    tail);
   5051  1.1  mrg 	  /* This barrier is needed to avoid worker zero clobbering
   5052  1.1  mrg 	     the broadcast buffer before all the other workers have
   5053  1.1  mrg 	     had a chance to read this instance of it.  */
   5054  1.1  mrg 	  emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
   5055  1.1  mrg 	}
   5056  1.1  mrg 
   5057  1.1  mrg       extract_insn (tail);
   5058  1.1  mrg       rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
   5059  1.1  mrg 				 UNSPEC_BR_UNIFIED);
   5060  1.1  mrg       validate_change (tail, recog_data.operand_loc[0], unsp, false);
   5061  1.1  mrg     }
   5062  1.1  mrg 
   5063  1.1  mrg   bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump,
   5064  1.1  mrg 					    vector_label, worker_label);
   5065  1.1  mrg   if (!seen_label)
   5066  1.1  mrg     verify_neutering_labels (to, vector_label, worker_label);
   5067  1.1  mrg }
   5068  1.1  mrg 
   5069  1.1  mrg /* PAR is a parallel that is being skipped in its entirety according to
   5070  1.1  mrg    MASK.  Treat this as skipping a superblock starting at forked
   5071  1.1  mrg    and ending at joining.  */
   5072  1.1  mrg 
   5073  1.1  mrg static void
   5074  1.1  mrg nvptx_skip_par (unsigned mask, parallel *par)
   5075  1.1  mrg {
   5076  1.1  mrg   basic_block tail = par->join_block;
   5077  1.1  mrg   gcc_assert (tail->preds->length () == 1);
   5078  1.1  mrg 
   5079  1.1  mrg   basic_block pre_tail = (*tail->preds)[0]->src;
   5080  1.1  mrg   gcc_assert (pre_tail->succs->length () == 1);
   5081  1.1  mrg 
   5082  1.1  mrg   nvptx_single (mask, par->forked_block, pre_tail);
   5083  1.1  mrg }
   5084  1.1  mrg 
/* If PAR has a single inner parallel and PAR itself only contains
   empty entry and exit blocks, swallow the inner PAR.  */

static void
nvptx_optimize_inner (parallel *par)
{
  parallel *inner = par->inner;

  /* We mustn't be the outer dummy par.  */
  if (!par->mask)
    return;

  /* We must have a single inner par.  */
  if (!inner || inner->next)
    return;

  /* We must only contain 2 blocks ourselves -- the head and tail of
     the inner par.  */
  if (par->blocks.length () != 2)
    return;

  /* We must be disjoint partitioning.  As we only have vector and
     worker partitioning, this is sufficient to guarantee the pars
     have adjacent partitioning.  */
  if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
    /* This indicates malformed code generation.  */
    return;

  /* The outer forked insn should be immediately followed by the inner
     fork insn.  */
  rtx_insn *forked = par->forked_insn;
  rtx_insn *fork = BB_END (par->forked_block);

  if (NEXT_INSN (forked) != fork)
    return;
  gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);

  /* The outer joining insn must immediately follow the inner join
     insn.  */
  rtx_insn *joining = par->joining_insn;
  rtx_insn *join = inner->join_insn;
  if (NEXT_INSN (join) != joining)
    return;

  /* Preconditions met.  Swallow the inner par.  */
  if (dump_file)
    fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
	     inner->mask, inner->forked_block->index,
	     inner->join_block->index,
	     par->mask, par->forked_block->index, par->join_block->index);

  /* Merge only the dimension bits; the GOMP_DIM_MAX bit (used elsewhere
     as a call marker) is masked out.  */
  par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);

  /* Transfer the inner par's blocks to PAR.  Note the pop/quick_push
     pairing appends them in reverse of INNER's order.  */
  par->blocks.reserve (inner->blocks.length ());
  while (inner->blocks.length ())
    par->blocks.quick_push (inner->blocks.pop ());

  /* Adopt INNER's children and detach them before deleting INNER, so
     the destructor does not take them down too.  */
  par->inner = inner->inner;
  inner->inner = NULL;

  delete inner;
}
   5147  1.1  mrg 
/* Process the parallel PAR and all its contained
   parallels.  We do everything but the neutering.  Return mask of
   partitioned modes used within this parallel.  */

static unsigned
nvptx_process_pars (parallel *par)
{
  if (nvptx_optimize)
    nvptx_optimize_inner (par);

  unsigned inner_mask = par->mask;

  /* Do the inner parallels first.  */
  if (par->inner)
    {
      par->inner_mask = nvptx_process_pars (par->inner);
      inner_mask |= par->inner_mask;
    }

  /* The GOMP_DIM_MAX bit of the mask marks a (partitioned) call rather
     than an actual partitioned region.  */
  bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
  bool worker = (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER));
  /* A vector longer than the warp cannot use warp shuffles and is
     handled like worker partitioning below.  */
  bool large_vector = ((par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
		      && nvptx_mach_vector_length () > PTX_WARP_SIZE);

  if (worker || large_vector)
    {
      /* Propagate state through shared memory at the region entry
	 (forked insn) and exit (fork insn); the second call reports
	 whether anything actually needed propagating.  */
      nvptx_shared_propagate (false, is_call, par->forked_block,
			      par->forked_insn, !worker);
      bool no_prop_p
	= nvptx_shared_propagate (true, is_call, par->forked_block,
				  par->fork_insn, !worker);
      /* An empty loop body: the forked insn is immediately followed by
	 the joining insn.  */
      bool empty_loop_p
	= !is_call && (NEXT_INSN (par->forked_insn)
		       && NEXT_INSN (par->forked_insn) == par->joining_insn);
      rtx barrier = GEN_INT (0);
      int threads = 0;

      /* For large-vector (non-worker) regions use the function's named
	 sync barrier, scoped to the vector's thread count.  */
      if (!worker && cfun->machine->sync_bar)
	{
	  barrier = cfun->machine->sync_bar;
	  threads = nvptx_mach_vector_length ();
	}

      /* Skip the barriers when nothing was propagated and the region
	 is an empty loop or a call.  */
      if (no_prop_p && empty_loop_p)
	;
      else if (no_prop_p && is_call)
	;
      else
	{
	  /* Insert begin and end synchronizations.  */
	  emit_insn_before (nvptx_cta_sync (barrier, threads),
			    par->forked_insn);
	  emit_insn_before (nvptx_cta_sync (barrier, threads), par->join_insn);
	}
    }
  else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
    /* Warp-sized vector partitioning: propagate within the warp, no
       CTA barrier needed.  */
    nvptx_warp_propagate (is_call, par->forked_block, par->forked_insn);

  /* Now do siblings.  */
  if (par->next)
    inner_mask |= nvptx_process_pars (par->next);
  return inner_mask;
}
   5211  1.1  mrg 
/* Neuter the parallel described by PAR.  We recurse in depth-first
   order.  MODES are the partitioning of the execution and OUTER is
   the partitioning of the parallels we are contained in.  */

static void
nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
{
  /* The worker/vector axes this parallel itself partitions.  */
  unsigned me = (par->mask
		 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
		    | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
  unsigned  skip_mask = 0, neuter_mask = 0;

  /* Depth-first: handle inner parallels before deciding about this
     one; they see this parallel's axes as part of OUTER.  */
  if (par->inner)
    nvptx_neuter_pars (par->inner, modes, outer | me);

  /* Per axis, decide between neutering individual blocks of this
     parallel (neuter_mask) and skipping the parallel wholesale
     (skip_mask).  */
  for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
    {
      if ((outer | me) & GOMP_DIM_MASK (mode))
	{} /* Mode is partitioned: no neutering.  */
      else if (!(modes & GOMP_DIM_MASK (mode)))
	{} /* Mode is not used: nothing to do.  */
      else if (par->inner_mask & GOMP_DIM_MASK (mode)
	       || !par->forked_insn)
	/* Partitioned in inner parallels, or we're not a partitioned
	   at all: neuter individual blocks.  */
	neuter_mask |= GOMP_DIM_MASK (mode);
      else if (!par->parent || !par->parent->forked_insn
	       || par->parent->inner_mask & GOMP_DIM_MASK (mode))
	/* Parent isn't a parallel or contains this paralleling: skip
	   parallel at this level.  */
	skip_mask |= GOMP_DIM_MASK (mode);
      else
	{} /* Parent will skip this parallel itself.  */
    }

  if (neuter_mask)
    {
      int ix, len;

      if (nvptx_optimize)
	{
	  /* Neuter whole SESE regions.  */
	  bb_pair_vec_t regions;

	  nvptx_find_sese (par->blocks, regions);
	  len = regions.length ();
	  for (ix = 0; ix != len; ix++)
	    {
	      basic_block from = regions[ix].first;
	      basic_block to = regions[ix].second;

	      /* A null FROM marks an empty region entry; its TO must be
		 null too.  */
	      if (from)
		nvptx_single (neuter_mask, from, to);
	      else
		gcc_assert (!to);
	    }
	}
      else
	{
	  /* Neuter each BB individually.  */
	  len = par->blocks.length ();
	  for (ix = 0; ix != len; ix++)
	    {
	      basic_block block = par->blocks[ix];

	      nvptx_single (neuter_mask, block, block);
	    }
	}
    }

  if (skip_mask)
    nvptx_skip_par (skip_mask, par);

  /* Siblings are neutered under the same OUTER partitioning.  */
  if (par->next)
    nvptx_neuter_pars (par->next, modes, outer);
}
   5288  1.1  mrg 
   5289  1.1  mrg static void
   5290  1.1  mrg populate_offload_attrs (offload_attrs *oa)
   5291  1.1  mrg {
   5292  1.1  mrg   tree attr = oacc_get_fn_attrib (current_function_decl);
   5293  1.1  mrg   tree dims = TREE_VALUE (attr);
   5294  1.1  mrg   unsigned ix;
   5295  1.1  mrg 
   5296  1.1  mrg   oa->mask = 0;
   5297  1.1  mrg 
   5298  1.1  mrg   for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
   5299  1.1  mrg     {
   5300  1.1  mrg       tree t = TREE_VALUE (dims);
   5301  1.1  mrg       int size = (t == NULL_TREE) ? -1 : TREE_INT_CST_LOW (t);
   5302  1.1  mrg       tree allowed = TREE_PURPOSE (dims);
   5303  1.1  mrg 
   5304  1.1  mrg       if (size != 1 && !(allowed && integer_zerop (allowed)))
   5305  1.1  mrg 	oa->mask |= GOMP_DIM_MASK (ix);
   5306  1.1  mrg 
   5307  1.1  mrg       switch (ix)
   5308  1.1  mrg 	{
   5309  1.1  mrg 	case GOMP_DIM_GANG:
   5310  1.1  mrg 	  oa->num_gangs = size;
   5311  1.1  mrg 	  break;
   5312  1.1  mrg 
   5313  1.1  mrg 	case GOMP_DIM_WORKER:
   5314  1.1  mrg 	  oa->num_workers = size;
   5315  1.1  mrg 	  break;
   5316  1.1  mrg 
   5317  1.1  mrg 	case GOMP_DIM_VECTOR:
   5318  1.1  mrg 	  oa->vector_length = size;
   5319  1.1  mrg 	  break;
   5320  1.1  mrg 	}
   5321  1.1  mrg     }
   5322  1.1  mrg }
   5323  1.1  mrg 
   5324  1.1  mrg #if WORKAROUND_PTXJIT_BUG_2
   5325  1.1  mrg /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT.  This variant
   5326  1.1  mrg    is needed in the nvptx target because the branches generated for
   5327  1.1  mrg    parititioning are NONJUMP_INSN_P, not JUMP_P.  */
   5328  1.1  mrg 
   5329  1.1  mrg static rtx
   5330  1.1  mrg nvptx_pc_set (const rtx_insn *insn, bool strict = true)
   5331  1.1  mrg {
   5332  1.1  mrg   rtx pat;
   5333  1.1  mrg   if ((strict && !JUMP_P (insn))
   5334  1.1  mrg       || (!strict && !INSN_P (insn)))
   5335  1.1  mrg     return NULL_RTX;
   5336  1.1  mrg   pat = PATTERN (insn);
   5337  1.1  mrg 
   5338  1.1  mrg   /* The set is allowed to appear either as the insn pattern or
   5339  1.1  mrg      the first set in a PARALLEL.  */
   5340  1.1  mrg   if (GET_CODE (pat) == PARALLEL)
   5341  1.1  mrg     pat = XVECEXP (pat, 0, 0);
   5342  1.1  mrg   if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
   5343  1.1  mrg     return pat;
   5344  1.1  mrg 
   5345  1.1  mrg   return NULL_RTX;
   5346  1.1  mrg }
   5347  1.1  mrg 
   5348  1.1  mrg /* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT.  */
   5349  1.1  mrg 
   5350  1.1  mrg static rtx
   5351  1.1  mrg nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
   5352  1.1  mrg {
   5353  1.1  mrg   rtx x = nvptx_pc_set (insn, strict);
   5354  1.1  mrg 
   5355  1.1  mrg   if (!x)
   5356  1.1  mrg     return NULL_RTX;
   5357  1.1  mrg   x = SET_SRC (x);
   5358  1.1  mrg   if (GET_CODE (x) == LABEL_REF)
   5359  1.1  mrg     return x;
   5360  1.1  mrg   if (GET_CODE (x) != IF_THEN_ELSE)
   5361  1.1  mrg     return NULL_RTX;
   5362  1.1  mrg   if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
   5363  1.1  mrg     return XEXP (x, 1);
   5364  1.1  mrg   if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
   5365  1.1  mrg     return XEXP (x, 2);
   5366  1.1  mrg   return NULL_RTX;
   5367  1.1  mrg }
   5368  1.1  mrg 
   5369  1.1  mrg /* Insert a dummy ptx insn when encountering a branch to a label with no ptx
   5370  1.1  mrg    insn inbetween the branch and the label.  This works around a JIT bug
   5371  1.1  mrg    observed at driver version 384.111, at -O0 for sm_50.  */
   5372  1.1  mrg 
   5373  1.1  mrg static void
   5374  1.1  mrg prevent_branch_around_nothing (void)
   5375  1.1  mrg {
   5376  1.1  mrg   rtx_insn *seen_label = NULL;
   5377  1.1  mrg     for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
   5378  1.1  mrg       {
   5379  1.1  mrg 	if (INSN_P (insn) && condjump_p (insn))
   5380  1.1  mrg 	  {
   5381  1.1  mrg 	    seen_label = label_ref_label (nvptx_condjump_label (insn, false));
   5382  1.1  mrg 	    continue;
   5383  1.1  mrg 	  }
   5384  1.1  mrg 
   5385  1.1  mrg 	if (seen_label == NULL)
   5386  1.1  mrg 	  continue;
   5387  1.1  mrg 
   5388  1.1  mrg 	if (NOTE_P (insn) || DEBUG_INSN_P (insn))
   5389  1.1  mrg 	  continue;
   5390  1.1  mrg 
   5391  1.1  mrg 	if (INSN_P (insn))
   5392  1.1  mrg 	  switch (recog_memoized (insn))
   5393  1.1  mrg 	    {
   5394  1.1  mrg 	    case CODE_FOR_nvptx_fork:
   5395  1.1  mrg 	    case CODE_FOR_nvptx_forked:
   5396  1.1  mrg 	    case CODE_FOR_nvptx_joining:
   5397  1.1  mrg 	    case CODE_FOR_nvptx_join:
   5398  1.1  mrg 	    case CODE_FOR_nop:
   5399  1.1  mrg 	      continue;
   5400  1.1  mrg 	    case -1:
   5401  1.1  mrg 	      /* Handle asm ("") and similar.  */
   5402  1.1  mrg 	      if (GET_CODE (PATTERN (insn)) == ASM_INPUT
   5403  1.1  mrg 		  || GET_CODE (PATTERN (insn)) == ASM_OPERANDS
   5404  1.1  mrg 		  || (GET_CODE (PATTERN (insn)) == PARALLEL
   5405  1.1  mrg 		      && asm_noperands (PATTERN (insn)) >= 0))
   5406  1.1  mrg 		continue;
   5407  1.1  mrg 	      /* FALLTHROUGH.  */
   5408  1.1  mrg 	    default:
   5409  1.1  mrg 	      seen_label = NULL;
   5410  1.1  mrg 	      continue;
   5411  1.1  mrg 	    }
   5412  1.1  mrg 
   5413  1.1  mrg 	if (LABEL_P (insn) && insn == seen_label)
   5414  1.1  mrg 	  emit_insn_before (gen_fake_nop (), insn);
   5415  1.1  mrg 
   5416  1.1  mrg 	seen_label = NULL;
   5417  1.1  mrg       }
   5418  1.1  mrg   }
   5419  1.1  mrg #endif
   5420  1.1  mrg 
   5421  1.1  mrg #ifdef WORKAROUND_PTXJIT_BUG_3
   5422  1.1  mrg /* Insert two membar.cta insns inbetween two subsequent bar.sync insns.  This
   5423  1.1  mrg    works around a hang observed at driver version 390.48 for sm_50.  */
   5424  1.1  mrg 
   5425  1.1  mrg static void
   5426  1.1  mrg workaround_barsyncs (void)
   5427  1.1  mrg {
   5428  1.1  mrg   bool seen_barsync = false;
   5429  1.1  mrg   for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
   5430  1.1  mrg     {
   5431  1.1  mrg       if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync)
   5432  1.1  mrg 	{
   5433  1.1  mrg 	  if (seen_barsync)
   5434  1.1  mrg 	    {
   5435  1.1  mrg 	      emit_insn_before (gen_nvptx_membar_cta (), insn);
   5436  1.1  mrg 	      emit_insn_before (gen_nvptx_membar_cta (), insn);
   5437  1.1  mrg 	    }
   5438  1.1  mrg 
   5439  1.1  mrg 	  seen_barsync = true;
   5440  1.1  mrg 	  continue;
   5441  1.1  mrg 	}
   5442  1.1  mrg 
   5443  1.1  mrg       if (!seen_barsync)
   5444  1.1  mrg 	continue;
   5445  1.1  mrg 
   5446  1.1  mrg       if (NOTE_P (insn) || DEBUG_INSN_P (insn))
   5447  1.1  mrg 	continue;
   5448  1.1  mrg       else if (INSN_P (insn))
   5449  1.1  mrg 	switch (recog_memoized (insn))
   5450  1.1  mrg 	  {
   5451  1.1  mrg 	  case CODE_FOR_nvptx_fork:
   5452  1.1  mrg 	  case CODE_FOR_nvptx_forked:
   5453  1.1  mrg 	  case CODE_FOR_nvptx_joining:
   5454  1.1  mrg 	  case CODE_FOR_nvptx_join:
   5455  1.1  mrg 	    continue;
   5456  1.1  mrg 	  default:
   5457  1.1  mrg 	    break;
   5458  1.1  mrg 	  }
   5459  1.1  mrg 
   5460  1.1  mrg       seen_barsync = false;
   5461  1.1  mrg     }
   5462  1.1  mrg }
   5463  1.1  mrg #endif
   5464  1.1  mrg 
   5465  1.1  mrg static rtx
   5466  1.1  mrg gen_comment (const char *s)
   5467  1.1  mrg {
   5468  1.1  mrg   const char *sep = " ";
   5469  1.1  mrg   size_t len = strlen (ASM_COMMENT_START) + strlen (sep) + strlen (s) + 1;
   5470  1.1  mrg   char *comment = (char *) alloca (len);
   5471  1.1  mrg   snprintf (comment, len, "%s%s%s", ASM_COMMENT_START, sep, s);
   5472  1.1  mrg   return gen_rtx_ASM_INPUT_loc (VOIDmode, ggc_strdup (comment),
   5473  1.1  mrg 				DECL_SOURCE_LOCATION (cfun->decl));
   5474  1.1  mrg }
   5475  1.1  mrg 
   5476  1.1  mrg /* Initialize all declared regs at function entry.
   5477  1.1  mrg    Advantage   : Fool-proof.
   5478  1.1  mrg    Disadvantage: Potentially creates a lot of long live ranges and adds a lot
   5479  1.1  mrg 		 of insns.  */
   5480  1.1  mrg 
   5481  1.1  mrg static void
   5482  1.1  mrg workaround_uninit_method_1 (void)
   5483  1.1  mrg {
   5484  1.1  mrg   rtx_insn *first = get_insns ();
   5485  1.1  mrg   rtx_insn *insert_here = NULL;
   5486  1.1  mrg 
   5487  1.1  mrg   for (int ix = LAST_VIRTUAL_REGISTER + 1; ix < max_reg_num (); ix++)
   5488  1.1  mrg     {
   5489  1.1  mrg       rtx reg = regno_reg_rtx[ix];
   5490  1.1  mrg 
   5491  1.1  mrg       /* Skip undeclared registers.  */
   5492  1.1  mrg       if (reg == const0_rtx)
   5493  1.1  mrg 	continue;
   5494  1.1  mrg 
   5495  1.1  mrg       gcc_assert (CONST0_RTX (GET_MODE (reg)));
   5496  1.1  mrg 
   5497  1.1  mrg       start_sequence ();
   5498  1.1  mrg       if (nvptx_comment && first != NULL)
   5499  1.1  mrg 	emit_insn (gen_comment ("Start: Added by -minit-regs=1"));
   5500  1.1  mrg       emit_move_insn (reg, CONST0_RTX (GET_MODE (reg)));
   5501  1.1  mrg       rtx_insn *inits = get_insns ();
   5502  1.1  mrg       end_sequence ();
   5503  1.1  mrg 
   5504  1.1  mrg       if (dump_file && (dump_flags & TDF_DETAILS))
   5505  1.1  mrg 	for (rtx_insn *init = inits; init != NULL; init = NEXT_INSN (init))
   5506  1.1  mrg 	  fprintf (dump_file, "Default init of reg %u inserted: insn %u\n",
   5507  1.1  mrg 		   ix, INSN_UID (init));
   5508  1.1  mrg 
   5509  1.1  mrg       if (first != NULL)
   5510  1.1  mrg 	{
   5511  1.1  mrg 	  insert_here = emit_insn_before (inits, first);
   5512  1.1  mrg 	  first = NULL;
   5513  1.1  mrg 	}
   5514  1.1  mrg       else
   5515  1.1  mrg 	insert_here = emit_insn_after (inits, insert_here);
   5516  1.1  mrg     }
   5517  1.1  mrg 
   5518  1.1  mrg   if (nvptx_comment && insert_here != NULL)
   5519  1.1  mrg     emit_insn_after (gen_comment ("End: Added by -minit-regs=1"), insert_here);
   5520  1.1  mrg }
   5521  1.1  mrg 
   5522  1.1  mrg /* Find uses of regs that are not defined on all incoming paths, and insert a
   5523  1.1  mrg    corresponding def at function entry.
   5524  1.1  mrg    Advantage   : Simple.
   5525  1.1  mrg    Disadvantage: Potentially creates long live ranges.
   5526  1.1  mrg 		 May not catch all cases.  F.i. a clobber cuts a live range in
   5527  1.1  mrg 		 the compiler and may prevent entry_lr_in from being set for a
   5528  1.1  mrg 		 reg, but the clobber does not translate to a ptx insn, so in
   5529  1.1  mrg 		 ptx there still may be an uninitialized ptx reg.  See f.i.
   5530  1.1  mrg 		 gcc.c-torture/compile/20020926-1.c.  */
   5531  1.1  mrg 
static void
workaround_uninit_method_2 (void)
{
  /* Pseudos that are live into the entry block are exactly the ones that
     may be read before being written on some path.  */
  auto_bitmap entry_pseudo_uninit;
  {
    /* Mask covering hard and virtual registers; only pseudos get
       initializations.  */
    auto_bitmap not_pseudo;
    bitmap_set_range (not_pseudo, 0, LAST_VIRTUAL_REGISTER);

    bitmap entry_lr_in = DF_LR_IN (ENTRY_BLOCK_PTR_FOR_FN (cfun));
    bitmap_and_compl (entry_pseudo_uninit, entry_lr_in, not_pseudo);
  }

  rtx_insn *first = get_insns ();
  rtx_insn *insert_here = NULL;

  bitmap_iterator iterator;
  unsigned ix;
  EXECUTE_IF_SET_IN_BITMAP (entry_pseudo_uninit, 0, ix, iterator)
    {
      rtx reg = regno_reg_rtx[ix];
      /* We rely on having a zero constant available for the reg's mode.  */
      gcc_assert (CONST0_RTX (GET_MODE (reg)));

      start_sequence ();
      /* Emit the start marker only once, in front of the first batch.  */
      if (nvptx_comment && first != NULL)
	emit_insn (gen_comment ("Start: Added by -minit-regs=2:"));
      emit_move_insn (reg, CONST0_RTX (GET_MODE (reg)));
      rtx_insn *inits = get_insns ();
      end_sequence ();

      if (dump_file && (dump_flags & TDF_DETAILS))
	for (rtx_insn *init = inits; init != NULL; init = NEXT_INSN (init))
	  fprintf (dump_file, "Missing init of reg %u inserted: insn %u\n",
		   ix, INSN_UID (init));

      /* The first batch goes before the function's original first insn;
	 subsequent batches are chained after the previously inserted
	 ones, keeping all inits together at function entry.  */
      if (first != NULL)
	{
	  insert_here = emit_insn_before (inits, first);
	  first = NULL;
	}
      else
	insert_here = emit_insn_after (inits, insert_here);
    }

  /* Close the bracketing comment after the last inserted init, if any.  */
  if (nvptx_comment && insert_here != NULL)
    emit_insn_after (gen_comment ("End: Added by -minit-regs=2"), insert_here);
}
   5578  1.1  mrg 
   5579  1.1  mrg /* Find uses of regs that are not defined on all incoming paths, and insert a
   5580  1.1  mrg    corresponding def on those.
   5581  1.1  mrg    Advantage   : Doesn't create long live ranges.
   5582  1.1  mrg    Disadvantage: More complex, and potentially also more defs.  */
   5583  1.1  mrg 
static void
workaround_uninit_method_3 (void)
{
  /* Mask covering hard and virtual registers; only pseudos get
     initializations.  */
  auto_bitmap not_pseudo;
  bitmap_set_range (not_pseudo, 0, LAST_VIRTUAL_REGISTER);

  basic_block bb;
  FOR_EACH_BB_FN (bb, cfun)
    {
      /* With a single predecessor there are no distinct incoming paths to
	 patch up on this block's edges.  */
      if (single_pred_p (bb))
	continue;

      /* Pseudos live into BB but not must-initialized (MIR) there --
	 i.e. potentially uninitialized on at least one incoming path.  */
      auto_bitmap bb_pseudo_uninit;
      bitmap_and_compl (bb_pseudo_uninit, DF_LIVE_IN (bb), DF_MIR_IN (bb));
      bitmap_and_compl_into (bb_pseudo_uninit, not_pseudo);

      bitmap_iterator iterator;
      unsigned ix;
      EXECUTE_IF_SET_IN_BITMAP (bb_pseudo_uninit, 0, ix, iterator)
	{
	  /* Classify predecessors: does the reg reach us defined from
	     some of them and undefined from others?  */
	  bool have_false = false;
	  bool have_true = false;

	  edge e;
	  edge_iterator ei;
	  FOR_EACH_EDGE (e, ei, bb->preds)
	    {
	      if (bitmap_bit_p (DF_LIVE_OUT (e->src), ix))
		have_true = true;
	      else
		have_false = true;
	    }
	  /* Only act on the mixed case; if all predecessors agree (all
	     defined, or all undefined), skip this reg here.  */
	  if (have_false ^ have_true)
	    continue;

	  FOR_EACH_EDGE (e, ei, bb->preds)
	    {
	      /* Skip the edges on which the reg already arrives
		 defined.  */
	      if (bitmap_bit_p (DF_LIVE_OUT (e->src), ix))
		continue;

	      rtx reg = regno_reg_rtx[ix];
	      /* We rely on having a zero constant available for the
		 reg's mode.  */
	      gcc_assert (CONST0_RTX (GET_MODE (reg)));

	      start_sequence ();
	      emit_move_insn (reg, CONST0_RTX (GET_MODE (reg)));
	      rtx_insn *inits = get_insns ();
	      end_sequence ();

	      if (dump_file && (dump_flags & TDF_DETAILS))
		for (rtx_insn *init = inits; init != NULL;
		     init = NEXT_INSN (init))
		  fprintf (dump_file,
			   "Missing init of reg %u inserted on edge: %d -> %d:"
			   " insn %u\n", ix, e->src->index, e->dest->index,
			   INSN_UID (init));

	      /* Queue the init on the edge; committed below.  */
	      insert_insn_on_edge (inits, e);
	    }
	}
    }

  /* Optionally bracket each queued edge sequence with marker comments
     before committing the insertions.  */
  if (nvptx_comment)
    FOR_EACH_BB_FN (bb, cfun)
      {
	if (single_pred_p (bb))
	  continue;

	edge e;
	edge_iterator ei;
	FOR_EACH_EDGE (e, ei, bb->preds)
	  {
	    if (e->insns.r == NULL_RTX)
	      continue;
	    start_sequence ();
	    emit_insn (gen_comment ("Start: Added by -minit-regs=3:"));
	    emit_insn (e->insns.r);
	    emit_insn (gen_comment ("End: Added by -minit-regs=3:"));
	    e->insns.r = get_insns ();
	    end_sequence ();
	  }
      }

  commit_edge_insertions ();
}
   5668  1.1  mrg 
   5669  1.1  mrg static void
   5670  1.1  mrg workaround_uninit (void)
   5671  1.1  mrg {
   5672  1.1  mrg   switch (nvptx_init_regs)
   5673  1.1  mrg     {
   5674  1.1  mrg     case 0:
   5675  1.1  mrg       /* Skip.  */
   5676  1.1  mrg       break;
   5677  1.1  mrg     case 1:
   5678  1.1  mrg       workaround_uninit_method_1 ();
   5679  1.1  mrg       break;
   5680  1.1  mrg     case 2:
   5681  1.1  mrg       workaround_uninit_method_2 ();
   5682  1.1  mrg       break;
   5683  1.1  mrg     case 3:
   5684  1.1  mrg       workaround_uninit_method_3 ();
   5685  1.1  mrg       break;
   5686  1.1  mrg     default:
   5687  1.1  mrg       gcc_unreachable ();
   5688  1.1  mrg     }
   5689  1.1  mrg }
   5690  1.1  mrg 
   5691  1.1  mrg /* PTX-specific reorganization
   5692  1.1  mrg    - Split blocks at fork and join instructions
   5693  1.1  mrg    - Compute live registers
   5694  1.1  mrg    - Mark now-unused registers, so function begin doesn't declare
   5695  1.1  mrg    unused registers.
   5696  1.1  mrg    - Insert state propagation when entering partitioned mode
   5697  1.1  mrg    - Insert neutering instructions when in single mode
   5698  1.1  mrg    - Replace subregs with suitable sequences.
   5699  1.1  mrg */
   5700  1.1  mrg 
static void
nvptx_reorg (void)
{
  /* We are freeing block_for_insn in the toplev to keep compatibility
     with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
  compute_bb_for_insn ();

  thread_prologue_and_epilogue_insns ();

  /* Split blocks and record interesting unspecs.  */
  bb_insn_map_t bb_insn_map;

  nvptx_split_blocks (&bb_insn_map);

  /* Compute live regs */
  df_clear_flags (DF_LR_RUN_DCE);
  df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
  df_live_add_problem ();
  df_live_set_all_dirty ();
  /* workaround_uninit_method_3 additionally needs the MIR
     (must-initialized regs) problem.  */
  if (nvptx_init_regs == 3)
    df_mir_add_problem ();
  df_analyze ();
  regstat_init_n_sets_and_refs ();

  if (dump_file)
    df_dump (dump_file);

  /* Mark unused regs as unused.  */
  int max_regs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
    if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
      regno_reg_rtx[i] = const0_rtx;

  /* Insert initializations for potentially-uninitialized regs, per
     -minit-regs.  */
  workaround_uninit ();

  /* Determine launch dimensions of the function.  If it is not an
     offloaded function  (i.e. this is a regular compiler), the
     function has no neutering.  */
  tree attr = oacc_get_fn_attrib (current_function_decl);
  if (attr)
    {
      /* If we determined this mask before RTL expansion, we could
	 elide emission of some levels of forks and joins.  */
      offload_attrs oa;

      populate_offload_attrs (&oa);

      /* If there is worker neutering, there must be vector
	 neutering.  Otherwise the hardware will fail.  */
      gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
		  || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));

      /* Discover & process partitioned regions.  */
      parallel *pars = nvptx_discover_pars (&bb_insn_map);
      nvptx_process_pars (pars);
      nvptx_neuter_pars (pars, oa.mask, 0);
      delete pars;
    }

  /* Replace subregs.  */
  nvptx_reorg_subreg ();

  if (TARGET_UNIFORM_SIMT)
    nvptx_reorg_uniform_simt ();

#if WORKAROUND_PTXJIT_BUG_2
  prevent_branch_around_nothing ();
#endif

#ifdef WORKAROUND_PTXJIT_BUG_3
  workaround_barsyncs ();
#endif

  regstat_free_n_sets_and_refs ();

  df_finish_pass (true);
}
   5778  1.1  mrg 
   5779  1.1  mrg /* Handle a "kernel" attribute; arguments as in
   5781  1.1  mrg    struct attribute_spec.handler.  */
   5782  1.1  mrg 
   5783  1.1  mrg static tree
   5784  1.1  mrg nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
   5785  1.1  mrg 			       int ARG_UNUSED (flags), bool *no_add_attrs)
   5786  1.1  mrg {
   5787  1.1  mrg   tree decl = *node;
   5788  1.1  mrg 
   5789  1.1  mrg   if (TREE_CODE (decl) != FUNCTION_DECL)
   5790  1.1  mrg     {
   5791  1.1  mrg       error ("%qE attribute only applies to functions", name);
   5792  1.1  mrg       *no_add_attrs = true;
   5793  1.1  mrg     }
   5794  1.1  mrg   else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
   5795  1.1  mrg     {
   5796  1.1  mrg       error ("%qE attribute requires a void return type", name);
   5797  1.1  mrg       *no_add_attrs = true;
   5798  1.1  mrg     }
   5799  1.1  mrg 
   5800  1.1  mrg   return NULL_TREE;
   5801  1.1  mrg }
   5802  1.1  mrg 
   5803  1.1  mrg /* Handle a "shared" attribute; arguments as in
   5804  1.1  mrg    struct attribute_spec.handler.  */
   5805  1.1  mrg 
   5806  1.1  mrg static tree
   5807  1.1  mrg nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
   5808  1.1  mrg 			       int ARG_UNUSED (flags), bool *no_add_attrs)
   5809  1.1  mrg {
   5810  1.1  mrg   tree decl = *node;
   5811  1.1  mrg 
   5812  1.1  mrg   if (TREE_CODE (decl) != VAR_DECL)
   5813  1.1  mrg     {
   5814  1.1  mrg       error ("%qE attribute only applies to variables", name);
   5815  1.1  mrg       *no_add_attrs = true;
   5816  1.1  mrg     }
   5817  1.1  mrg   else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
   5818  1.1  mrg     {
   5819  1.1  mrg       error ("%qE attribute not allowed with auto storage class", name);
   5820  1.1  mrg       *no_add_attrs = true;
   5821  1.1  mrg     }
   5822  1.1  mrg 
   5823  1.1  mrg   return NULL_TREE;
   5824  1.1  mrg }
   5825  1.1  mrg 
/* Table of valid machine attributes.  */
static const struct attribute_spec nvptx_attribute_table[] =
{
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
       affects_type_identity, handler, exclude } */
  /* "kernel": validated by nvptx_handle_kernel_attribute (functions with
     void return type only).  */
  { "kernel", 0, 0, true, false,  false, false, nvptx_handle_kernel_attribute,
    NULL },
  /* "shared": validated by nvptx_handle_shared_attribute (variables with
     non-auto storage only).  */
  { "shared", 0, 0, true, false,  false, false, nvptx_handle_shared_attribute,
    NULL },
  /* Sentinel terminating the table.  */
  { NULL, 0, 0, false, false, false, false, NULL, NULL }
};
   5837  1.1  mrg 
   5838  1.1  mrg /* Limit vector alignments to BIGGEST_ALIGNMENT.  */
   5840  1.1  mrg 
   5841  1.1  mrg static HOST_WIDE_INT
   5842  1.1  mrg nvptx_vector_alignment (const_tree type)
   5843  1.1  mrg {
   5844  1.1  mrg   unsigned HOST_WIDE_INT align;
   5845  1.1  mrg   tree size = TYPE_SIZE (type);
   5846  1.1  mrg 
   5847  1.1  mrg   /* Ensure align is not bigger than BIGGEST_ALIGNMENT.  */
   5848  1.1  mrg   if (tree_fits_uhwi_p (size))
   5849  1.1  mrg     {
   5850  1.1  mrg       align = tree_to_uhwi (size);
   5851  1.1  mrg       align = MIN (align, BIGGEST_ALIGNMENT);
   5852  1.1  mrg     }
   5853  1.1  mrg   else
   5854  1.1  mrg     align = BIGGEST_ALIGNMENT;
   5855  1.1  mrg 
   5856  1.1  mrg   /* Ensure align is not smaller than mode alignment.  */
   5857  1.1  mrg   align = MAX (align, GET_MODE_ALIGNMENT (TYPE_MODE (type)));
   5858  1.1  mrg 
   5859  1.1  mrg   return align;
   5860  1.1  mrg }
   5861  1.1  mrg 
   5862  1.1  mrg /* Indicate that INSN cannot be duplicated.   */
   5863  1.1  mrg 
   5864  1.1  mrg static bool
   5865  1.1  mrg nvptx_cannot_copy_insn_p (rtx_insn *insn)
   5866  1.1  mrg {
   5867  1.1  mrg   switch (recog_memoized (insn))
   5868  1.1  mrg     {
   5869  1.1  mrg     case CODE_FOR_nvptx_shufflesi:
   5870  1.1  mrg     case CODE_FOR_nvptx_shufflesf:
   5871  1.1  mrg     case CODE_FOR_nvptx_barsync:
   5872  1.1  mrg     case CODE_FOR_nvptx_fork:
   5873  1.1  mrg     case CODE_FOR_nvptx_forked:
   5874  1.1  mrg     case CODE_FOR_nvptx_joining:
   5875  1.1  mrg     case CODE_FOR_nvptx_join:
   5876  1.1  mrg       return true;
   5877  1.1  mrg     default:
   5878  1.1  mrg       return false;
   5879  1.1  mrg     }
   5880  1.1  mrg }
   5881  1.1  mrg 
   5882  1.1  mrg /* Section anchors do not work.  Initialization for flag_section_anchor
   5883  1.1  mrg    probes the existence of the anchoring target hooks and prevents
   5884  1.1  mrg    anchoring if they don't exist.  However, we may be being used with
   5885  1.1  mrg    a host-side compiler that does support anchoring, and hence see
   5886  1.1  mrg    the anchor flag set (as it's not recalculated).  So provide an
   5887  1.1  mrg    implementation denying anchoring.  */
   5888  1.1  mrg 
   5889  1.1  mrg static bool
   5890  1.1  mrg nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
   5891  1.1  mrg {
   5892  1.1  mrg   return false;
   5893  1.1  mrg }
   5894  1.1  mrg 
/* Record a symbol for mkoffload to enter into the mapping table.  */

static void
nvptx_record_offload_symbol (tree decl)
{
  switch (TREE_CODE (decl))
    {
    case VAR_DECL:
      /* Variables get a VAR_MAP line carrying their assembler name.  */
      fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
	       IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
      break;

    case FUNCTION_DECL:
      {
	tree attr = oacc_get_fn_attrib (decl);
	/* OpenMP offloading does not set this attribute.  */
	tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;

	fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
		 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));

	/* Append the launch dimensions from the attribute, if present,
	   as hex values after the function name.  */
	for (; dims; dims = TREE_CHAIN (dims))
	  {
	    int size = TREE_INT_CST_LOW (TREE_VALUE (dims));

	    /* Dimension list entries are expected to be value-only.  */
	    gcc_assert (!TREE_PURPOSE (dims));
	    fprintf (asm_out_file, ", %#x", size);
	  }

	fprintf (asm_out_file, "\n");
      }
      break;

    default:
      gcc_unreachable ();
    }
}
   5933  1.1  mrg 
   5934  1.1  mrg /* Implement TARGET_ASM_FILE_START.  Write the kinds of things ptxas expects
   5935  1.1  mrg    at the start of a file.  */
   5936  1.1  mrg 
   5937  1.1  mrg static void
   5938  1.1  mrg nvptx_file_start (void)
   5939  1.1  mrg {
   5940  1.1  mrg   fputs ("// BEGIN PREAMBLE\n", asm_out_file);
   5941  1.1  mrg 
   5942  1.1  mrg   fputs ("\t.version\t", asm_out_file);
   5943  1.1  mrg   fputs (ptx_version_to_string ((enum ptx_version)ptx_version_option),
   5944  1.1  mrg 	 asm_out_file);
   5945  1.1  mrg   fputs ("\n", asm_out_file);
   5946  1.1  mrg 
   5947  1.1  mrg   fputs ("\t.target\tsm_", asm_out_file);
   5948  1.1  mrg   fputs (sm_version_to_string ((enum ptx_isa)ptx_isa_option),
   5949  1.1  mrg 	 asm_out_file);
   5950  1.1  mrg   fputs ("\n", asm_out_file);
   5951  1.1  mrg 
   5952  1.1  mrg   fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
   5953  1.1  mrg 
   5954  1.1  mrg   fputs ("// END PREAMBLE\n", asm_out_file);
   5955  1.1  mrg }
   5956  1.1  mrg 
   5957  1.1  mrg /* Emit a declaration for a worker and vector-level buffer in .shared
   5958  1.1  mrg    memory.  */
   5959  1.1  mrg 
   5960  1.1  mrg static void
   5961  1.1  mrg write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
   5962  1.1  mrg {
   5963  1.1  mrg   const char *name = XSTR (sym, 0);
   5964  1.1  mrg 
   5965  1.1  mrg   write_var_marker (file, true, false, name);
   5966  1.1  mrg   fprintf (file, ".shared .align %d .u8 %s[%d];\n",
   5967  1.1  mrg 	   align, name, size);
   5968  1.1  mrg }
   5969  1.1  mrg 
/* Write out the function declarations we've collected and declare storage
   for the broadcast buffer.  */

static void
nvptx_file_end (void)
{
  /* Emit declarations for every function recorded as needed.  */
  hash_table<tree_hasher>::iterator iter;
  tree decl;
  FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
    nvptx_record_fndecl (decl);
  fputs (func_decls.str().c_str(), asm_out_file);

  /* Declare each .shared buffer that turned out to be needed; the sizes
     and alignments were accumulated while compiling the functions.  */
  if (oacc_bcast_size)
    write_shared_buffer (asm_out_file, oacc_bcast_sym,
			 oacc_bcast_align, oacc_bcast_size);

  if (worker_red_size)
    write_shared_buffer (asm_out_file, worker_red_sym,
			 worker_red_align, worker_red_size);

  if (vector_red_size)
    write_shared_buffer (asm_out_file, vector_red_sym,
			 vector_red_align, vector_red_size);

  if (gang_private_shared_size)
    write_shared_buffer (asm_out_file, gang_private_shared_sym,
			 gang_private_shared_align, gang_private_shared_size);

  if (need_softstack_decl)
    {
      write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
      /* 32 is the maximum number of warps in a block.  Even though it's an
         external declaration, emit the array size explicitly; otherwise, it
         may fail at PTX JIT time if the definition is later in link order.  */
      fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
	       POINTER_SIZE);
    }
  if (need_unisimt_decl)
    {
      write_var_marker (asm_out_file, false, true, "__nvptx_uni");
      fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
    }
}
   6013  1.1  mrg 
/* Expander for the shuffle builtins: generate a shuffle of the first
   argument by lane index (second argument) and shuffle kind (third
   argument), leaving the result in TARGET.  */

static rtx
nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
{
  /* Result unused: nothing to emit.  */
  if (ignore)
    return target;

  /* The value to shuffle must live in a register.  */
  rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
			 NULL_RTX, mode, EXPAND_NORMAL);
  if (!REG_P (src))
    src = copy_to_mode_reg (mode, src);

  rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
			 NULL_RTX, SImode, EXPAND_NORMAL);
  rtx op = expand_expr (CALL_EXPR_ARG  (exp, 2),
			NULL_RTX, SImode, EXPAND_NORMAL);

  /* The index may be a register or a constant; force anything else
     into a register.  */
  if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
    idx = copy_to_mode_reg (SImode, idx);

  /* The shuffle kind (OP) must be a compile-time constant here; INTVAL
     extracts it directly.  */
  rtx pat = nvptx_gen_shuffle (target, src, idx,
			       (nvptx_shuffle_kind) INTVAL (op));
  if (pat)
    emit_insn (pat);

  return target;
}
   6042  1.1  mrg 
   6043  1.1  mrg const char *
   6044  1.1  mrg nvptx_output_red_partition (rtx dst, rtx offset)
   6045  1.1  mrg {
   6046  1.1  mrg   const char *zero_offset = "\t\tmov.u64\t%%r%d, %%r%d; // vred buffer\n";
   6047  1.1  mrg   const char *with_offset = "\t\tadd.u64\t%%r%d, %%r%d, %d; // vred buffer\n";
   6048  1.1  mrg 
   6049  1.1  mrg   if (offset == const0_rtx)
   6050  1.1  mrg     fprintf (asm_out_file, zero_offset, REGNO (dst),
   6051  1.1  mrg 	     REGNO (cfun->machine->red_partition));
   6052  1.1  mrg   else
   6053  1.1  mrg     fprintf (asm_out_file, with_offset, REGNO (dst),
   6054  1.1  mrg 	     REGNO (cfun->machine->red_partition), UINTVAL (offset));
   6055  1.1  mrg 
   6056  1.1  mrg   return "";
   6057  1.1  mrg }
   6058  1.1  mrg 
/* Shared-memory reduction address expander.  Leaves in TARGET the
   address of this call's slice of the .shared reduction buffer —
   vector-level when VECTOR is nonzero, otherwise worker-level — and
   grows the recorded buffer size/alignment requirements as a side
   effect.  */

static rtx
nvptx_expand_shared_addr (tree exp, rtx target,
			  machine_mode ARG_UNUSED (mode), int ignore,
			  int vector)
{
  if (ignore)
    return target;

  /* The builtin's arguments (offset, size, align) are compile-time
     integer constants.  */
  unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
  unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
  unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
  rtx addr = worker_red_sym;

  if (vector)
    {
      offload_attrs oa;

      populate_offload_attrs (&oa);

      /* The vector buffer is partitioned with one slice per worker;
	 bump the recorded per-partition size, total size and alignment
	 to cover this use.  */
      unsigned int psize = ROUND_UP (size + offset, align);
      unsigned int pnum = nvptx_mach_max_workers ();
      vector_red_partition = MAX (vector_red_partition, psize);
      vector_red_size = MAX (vector_red_size, psize * pnum);
      vector_red_align = MAX (vector_red_align, align);

      /* Lazily create the per-function partition base register.  */
      if (cfun->machine->red_partition == NULL)
	cfun->machine->red_partition = gen_reg_rtx (Pmode);

      /* Compute partition base + OFFSET at runtime.  */
      addr = gen_reg_rtx (Pmode);
      emit_insn (gen_nvptx_red_partition (addr, GEN_INT (offset)));
    }
  else
    {
      worker_red_align = MAX (worker_red_align, align);
      worker_red_size = MAX (worker_red_size, size + offset);

      /* Worker buffer: a constant offset from the buffer symbol.  */
      if (offset)
	{
	  addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
	  addr = gen_rtx_CONST (Pmode, addr);
	}
    }

  emit_move_insn (target, addr);
  return target;
}
   6107  1.1  mrg 
/* Expand the CMP_SWAP PTX builtins.  We have our own versions that do
   not require taking the address of any object, other than the memory
   cell being operated on.  */

static rtx
nvptx_expand_cmp_swap (tree exp, rtx target,
		       machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
{
  /* The operation's width follows the builtin's result type.  */
  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));

  if (!target)
    target = gen_reg_rtx (mode);

  /* Arguments: memory address, expected value, replacement value.  */
  rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
			 NULL_RTX, Pmode, EXPAND_NORMAL);
  rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
			 NULL_RTX, mode, EXPAND_NORMAL);
  rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
			 NULL_RTX, mode, EXPAND_NORMAL);
  rtx pat;

  mem = gen_rtx_MEM (mode, mem);
  /* Both values must be in registers for the patterns below.  */
  if (!REG_P (cmp))
    cmp = copy_to_mode_reg (mode, cmp);
  if (!REG_P (src))
    src = copy_to_mode_reg (mode, src);

  /* Select the 32-bit or 64-bit compare-and-swap pattern.  */
  if (mode == SImode)
    pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
  else
    pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);

  emit_insn (pat);

  return target;
}
   6144  1.1  mrg 
   6145  1.1  mrg 
/* Codes for all the NVPTX builtins.  Order must match the DEF calls in
   nvptx_init_builtins, which register them under these indices.  */
enum nvptx_builtins
{
  NVPTX_BUILTIN_SHUFFLE,	/* __builtin_nvptx_shuffle  */
  NVPTX_BUILTIN_SHUFFLELL,	/* __builtin_nvptx_shufflell  */
  NVPTX_BUILTIN_WORKER_ADDR,	/* __builtin_nvptx_worker_addr  */
  NVPTX_BUILTIN_VECTOR_ADDR,	/* __builtin_nvptx_vector_addr  */
  NVPTX_BUILTIN_CMP_SWAP,	/* __builtin_nvptx_cmp_swap  */
  NVPTX_BUILTIN_CMP_SWAPLL,	/* __builtin_nvptx_cmp_swapll  */
  NVPTX_BUILTIN_MEMBAR_GL,	/* __builtin_nvptx_membar_gl  */
  NVPTX_BUILTIN_MEMBAR_CTA,	/* __builtin_nvptx_membar_cta  */
  NVPTX_BUILTIN_MAX
};

/* Decls for the above, filled in by nvptx_init_builtins.  */
static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
   6161  1.1  mrg 
   6162  1.1  mrg /* Return the NVPTX builtin for CODE.  */
   6163  1.1  mrg 
   6164  1.1  mrg static tree
   6165  1.1  mrg nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
   6166  1.1  mrg {
   6167  1.1  mrg   if (code >= NVPTX_BUILTIN_MAX)
   6168  1.1  mrg     return error_mark_node;
   6169  1.1  mrg 
   6170  1.1  mrg   return nvptx_builtin_decls[code];
   6171  1.1  mrg }
   6172  1.1  mrg 
   6173  1.1  mrg /* Set up all builtin functions for this target.  */
   6174  1.1  mrg 
   6175  1.1  mrg static void
   6176  1.1  mrg nvptx_init_builtins (void)
   6177  1.1  mrg {
   6178  1.1  mrg #define DEF(ID, NAME, T)						\
   6179  1.1  mrg   (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID]				\
   6180  1.1  mrg    = add_builtin_function ("__builtin_nvptx_" NAME,			\
   6181  1.1  mrg 			   build_function_type_list T,			\
   6182  1.1  mrg 			   NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
   6183  1.1  mrg #define ST sizetype
   6184  1.1  mrg #define UINT unsigned_type_node
   6185  1.1  mrg #define LLUINT long_long_unsigned_type_node
   6186  1.1  mrg #define PTRVOID ptr_type_node
   6187  1.1  mrg #define VOID void_type_node
   6188  1.1  mrg 
   6189  1.1  mrg   DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
   6190  1.1  mrg   DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
   6191  1.1  mrg   DEF (WORKER_ADDR, "worker_addr",
   6192  1.1  mrg        (PTRVOID, ST, UINT, UINT, NULL_TREE));
   6193  1.1  mrg   DEF (VECTOR_ADDR, "vector_addr",
   6194  1.1  mrg        (PTRVOID, ST, UINT, UINT, NULL_TREE));
   6195  1.1  mrg   DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
   6196  1.1  mrg   DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
   6197  1.1  mrg   DEF (MEMBAR_GL, "membar_gl", (VOID, VOID, NULL_TREE));
   6198  1.1  mrg   DEF (MEMBAR_CTA, "membar_cta", (VOID, VOID, NULL_TREE));
   6199  1.1  mrg 
   6200  1.1  mrg #undef DEF
   6201  1.1  mrg #undef ST
   6202  1.1  mrg #undef UINT
   6203  1.1  mrg #undef LLUINT
   6204  1.1  mrg #undef PTRVOID
   6205  1.1  mrg }
   6206  1.1  mrg 
   6207  1.1  mrg /* Expand an expression EXP that calls a built-in function,
   6208  1.1  mrg    with result going to TARGET if that's convenient
   6209  1.1  mrg    (and in mode MODE if that's convenient).
   6210  1.1  mrg    SUBTARGET may be used as the target for computing one of EXP's operands.
   6211  1.1  mrg    IGNORE is nonzero if the value is to be ignored.  */
   6212  1.1  mrg 
   6213  1.1  mrg static rtx
   6214  1.1  mrg nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
   6215  1.1  mrg 		      machine_mode mode, int ignore)
   6216  1.1  mrg {
   6217  1.1  mrg   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
   6218  1.1  mrg   switch (DECL_MD_FUNCTION_CODE (fndecl))
   6219  1.1  mrg     {
   6220  1.1  mrg     case NVPTX_BUILTIN_SHUFFLE:
   6221  1.1  mrg     case NVPTX_BUILTIN_SHUFFLELL:
   6222  1.1  mrg       return nvptx_expand_shuffle (exp, target, mode, ignore);
   6223  1.1  mrg 
   6224  1.1  mrg     case NVPTX_BUILTIN_WORKER_ADDR:
   6225  1.1  mrg       return nvptx_expand_shared_addr (exp, target, mode, ignore, false);
   6226  1.1  mrg 
   6227  1.1  mrg     case NVPTX_BUILTIN_VECTOR_ADDR:
   6228  1.1  mrg       return nvptx_expand_shared_addr (exp, target, mode, ignore, true);
   6229  1.1  mrg 
   6230  1.1  mrg     case NVPTX_BUILTIN_CMP_SWAP:
   6231  1.1  mrg     case NVPTX_BUILTIN_CMP_SWAPLL:
   6232  1.1  mrg       return nvptx_expand_cmp_swap (exp, target, mode, ignore);
   6233  1.1  mrg 
   6234  1.1  mrg     case NVPTX_BUILTIN_MEMBAR_GL:
   6235  1.1  mrg       emit_insn (gen_nvptx_membar_gl ());
   6236  1.1  mrg       return NULL_RTX;
   6237  1.1  mrg 
   6238  1.1  mrg     case NVPTX_BUILTIN_MEMBAR_CTA:
   6239  1.1  mrg       emit_insn (gen_nvptx_membar_cta ());
   6240  1.1  mrg       return NULL_RTX;
   6241  1.1  mrg 
   6242  1.1  mrg     default: gcc_unreachable ();
   6243  1.1  mrg     }
   6244  1.1  mrg }
   6245  1.1  mrg 
   6246  1.1  mrg /* Implement TARGET_SIMT_VF target hook: number of threads in a warp.  */
   6247  1.1  mrg 
   6248  1.1  mrg static int
   6249  1.1  mrg nvptx_simt_vf ()
   6250  1.1  mrg {
   6251  1.1  mrg   return PTX_WARP_SIZE;
   6252  1.1  mrg }
   6253  1.1  mrg 
/* Return 1 if TRAIT NAME is present in the OpenMP context's
   device trait set, return 0 if not present in any OpenMP context in the
   whole translation unit, or -1 if not present in the current OpenMP context
   but might be present in another OpenMP context in the same TU.  */

int
nvptx_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
				const char *name)
{
  switch (trait)
    {
    case omp_device_kind:
      return strcmp (name, "gpu") == 0;
    case omp_device_arch:
      return strcmp (name, "nvptx") == 0;
    case omp_device_isa:
      /* Expand one comparison per sm_XX listed in nvptx-sm.def: the
	 currently selected ISA yields 1, any other known name 0.  */
#define NVPTX_SM(XX, SEP)				\
      {							\
	if (strcmp (name, "sm_" #XX) == 0)		\
	  return ptx_isa_option == PTX_ISA_SM ## XX;	\
      }
#include "nvptx-sm.def"
#undef NVPTX_SM
      return 0;
    default:
      gcc_unreachable ();
    }
}
   6282  1.1  mrg 
   6283  1.1  mrg static bool
   6284  1.1  mrg nvptx_welformed_vector_length_p (int l)
   6285  1.1  mrg {
   6286  1.1  mrg   gcc_assert (l > 0);
   6287  1.1  mrg   return l % PTX_WARP_SIZE == 0;
   6288  1.1  mrg }
   6289  1.1  mrg 
   6290  1.1  mrg static void
   6291  1.1  mrg nvptx_apply_dim_limits (int dims[])
   6292  1.1  mrg {
   6293  1.1  mrg   /* Check that the vector_length is not too large.  */
   6294  1.1  mrg   if (dims[GOMP_DIM_VECTOR] > PTX_MAX_VECTOR_LENGTH)
   6295  1.1  mrg     dims[GOMP_DIM_VECTOR] = PTX_MAX_VECTOR_LENGTH;
   6296  1.1  mrg 
   6297  1.1  mrg   /* Check that the number of workers is not too large.  */
   6298  1.1  mrg   if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
   6299  1.1  mrg     dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
   6300  1.1  mrg 
   6301  1.1  mrg   /* Ensure that num_worker * vector_length <= cta size.  */
   6302  1.1  mrg   if (dims[GOMP_DIM_WORKER] > 0 &&  dims[GOMP_DIM_VECTOR] > 0
   6303  1.1  mrg       && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE)
   6304  1.1  mrg     dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
   6305  1.1  mrg 
   6306  1.1  mrg   /* If we need a per-worker barrier ... .  */
   6307  1.1  mrg   if (dims[GOMP_DIM_WORKER] > 0 &&  dims[GOMP_DIM_VECTOR] > 0
   6308  1.1  mrg       && dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
   6309  1.1  mrg     /* Don't use more barriers than available.  */
   6310  1.1  mrg     dims[GOMP_DIM_WORKER] = MIN (dims[GOMP_DIM_WORKER],
   6311  1.1  mrg 				 PTX_NUM_PER_WORKER_BARRIERS);
   6312  1.1  mrg }
   6313  1.1  mrg 
/* Return true if FNDECL contains calls to vector-partitionable routines.  */

static bool
has_vector_partitionable_routine_calls_p (tree fndecl)
{
  if (!fndecl)
    return false;

  basic_block bb;
  FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (fndecl))
    /* NOTE(review): the loop starts at gsi_start_bb but advances with
       gsi_next_nondebug, so only the increment skips debug stmts --
       presumably harmless since the body only acts on GIMPLE_CALLs;
       confirm if debug stmts could ever match.  */
    for (gimple_stmt_iterator i = gsi_start_bb (bb); !gsi_end_p (i);
	 gsi_next_nondebug (&i))
      {
	gimple *stmt = gsi_stmt (i);
	if (gimple_code (stmt) != GIMPLE_CALL)
	  continue;

	tree callee = gimple_call_fndecl (stmt);
	if (!callee)
	  continue;

	/* NOTE(review): a callee without an OpenACC function attribute
	   ends the whole scan with 'false' rather than being skipped --
	   looks intentional (matches upstream), but verify.  */
	tree attrs  = oacc_get_fn_attrib (callee);
	if (attrs == NULL_TREE)
	  return false;

	/* A routine whose partitioning level is GOMP_DIM_MAX is a 'seq'
	   routine; any lower level means it may be partitioned.  */
	int partition_level = oacc_fn_attrib_level (attrs);
	bool seq_routine_p = partition_level == GOMP_DIM_MAX;
	if (!seq_routine_p)
	  return true;
      }

  return false;
}
   6347  1.1  mrg 
/* As nvptx_goacc_validate_dims, but does not return bool to indicate whether
   DIMS has changed.

   DECL is the routine/offload-region decl (NULL_TREE when validating
   global dimensions), FN_LEVEL encodes the caller's situation (see the
   classification below), and USED is a mask of GOMP_DIM_* axes the
   region actually uses.  */

static void
nvptx_goacc_validate_dims_1 (tree decl, int dims[], int fn_level, unsigned used)
{
  /* Classify the request; exactly one of these flags ends up set.  */
  bool oacc_default_dims_p = false;
  bool oacc_min_dims_p = false;
  bool offload_region_p = false;
  bool routine_p = false;
  bool routine_seq_p = false;
  int default_vector_length = -1;

  if (decl == NULL_TREE)
    {
      if (fn_level == -1)
	oacc_default_dims_p = true;
      else if (fn_level == -2)
	oacc_min_dims_p = true;
      else
	gcc_unreachable ();
    }
  else if (fn_level == -1)
    offload_region_p = true;
  else if (0 <= fn_level && fn_level <= GOMP_DIM_MAX)
    {
      routine_p = true;
      /* Level GOMP_DIM_MAX marks a 'seq' routine.  */
      routine_seq_p = fn_level == GOMP_DIM_MAX;
    }
  else
    gcc_unreachable ();

  if (oacc_min_dims_p)
    {
      /* Minimum dimensions: all 1, except the vector dimension must
	 still be a full warp on this target.  */
      gcc_assert (dims[GOMP_DIM_VECTOR] == 1);
      gcc_assert (dims[GOMP_DIM_WORKER] == 1);
      gcc_assert (dims[GOMP_DIM_GANG] == 1);

      dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
      return;
    }

  if (routine_p)
    {
      /* Non-seq routines may be vector-partitioned, which requires a
	 warp-sized vector; other dimensions are left untouched.  */
      if (!routine_seq_p)
	dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;

      return;
    }

  if (oacc_default_dims_p)
    {
      /* -1  : not set
	  0  : set at runtime, f.i. -fopenacc-dims=-
         >= 1: set at compile time, f.i. -fopenacc-dims=1.  */
      gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
      gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
      gcc_assert (dims[GOMP_DIM_GANG] >= -1);

      /* But -fopenacc-dims=- is not yet supported on trunk.  */
      gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
      gcc_assert (dims[GOMP_DIM_WORKER] != 0);
      gcc_assert (dims[GOMP_DIM_GANG] != 0);
    }

  if (offload_region_p)
    {
      /* -1   : not set
	  0   : set using variable, f.i. num_gangs (n)
	  >= 1: set using constant, f.i. num_gangs (1).  */
      gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
      gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
      gcc_assert (dims[GOMP_DIM_GANG] >= -1);
    }

  /* Choose the fallback vector length used by the adjustments below.  */
  if (offload_region_p)
    default_vector_length = oacc_get_default_dim (GOMP_DIM_VECTOR);
  else
    /* oacc_default_dims_p.  */
    default_vector_length = PTX_DEFAULT_VECTOR_LENGTH;

  /* Snapshot the incoming dims so we can warn about any we override.  */
  int old_dims[GOMP_DIM_MAX];
  unsigned int i;
  for (i = 0; i < GOMP_DIM_MAX; ++i)
    old_dims[i] = dims[i];

  const char *vector_reason = NULL;
  if (offload_region_p && has_vector_partitionable_routine_calls_p (decl))
    {
      /* Calls to vector-partitionable routines constrain the caller to
	 a warp-sized vector.  */
      default_vector_length = PTX_WARP_SIZE;

      if (dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
	{
	  vector_reason = G_("using %<vector_length (%d)%> due to call to"
			     " vector-partitionable routine, ignoring %d");
	  dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
	}
    }

  /* A runtime-set vector length (0) is not supported; use the default.  */
  if (dims[GOMP_DIM_VECTOR] == 0)
    {
      vector_reason = G_("using %<vector_length (%d)%>, ignoring runtime setting");
      dims[GOMP_DIM_VECTOR] = default_vector_length;
    }

  /* Replace a vector length that is not a warp multiple.  */
  if (dims[GOMP_DIM_VECTOR] > 0
      && !nvptx_welformed_vector_length_p (dims[GOMP_DIM_VECTOR]))
    dims[GOMP_DIM_VECTOR] = default_vector_length;

  nvptx_apply_dim_limits (dims);

  /* Warn about each explicitly-requested dimension we had to change.  */
  if (dims[GOMP_DIM_VECTOR] != old_dims[GOMP_DIM_VECTOR])
    warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
		vector_reason != NULL
		? vector_reason
		: G_("using %<vector_length (%d)%>, ignoring %d"),
		dims[GOMP_DIM_VECTOR], old_dims[GOMP_DIM_VECTOR]);

  if (dims[GOMP_DIM_WORKER] != old_dims[GOMP_DIM_WORKER])
    warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
		G_("using %<num_workers (%d)%>, ignoring %d"),
		dims[GOMP_DIM_WORKER], old_dims[GOMP_DIM_WORKER]);

  if (oacc_default_dims_p)
    {
      /* Fill in any still-unset dimension with the target default, then
	 re-apply the limits to the completed set.  */
      if (dims[GOMP_DIM_VECTOR] < 0)
	dims[GOMP_DIM_VECTOR] = default_vector_length;
      if (dims[GOMP_DIM_WORKER] < 0)
	dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
      if (dims[GOMP_DIM_GANG] < 0)
	dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
      nvptx_apply_dim_limits (dims);
    }

  if (offload_region_p)
    {
      /* Fill in defaults only for dimensions the region actually uses;
	 unused unset ones are handled by the caller's caller.  */
      for (i = 0; i < GOMP_DIM_MAX; i++)
	{
	  if (!(dims[i] < 0))
	    continue;

	  if ((used & GOMP_DIM_MASK (i)) == 0)
	    /* Function oacc_validate_dims will apply the minimal dimension.  */
	    continue;

	  dims[i] = (i == GOMP_DIM_VECTOR
		     ? default_vector_length
		     : oacc_get_default_dim (i));
	}

      nvptx_apply_dim_limits (dims);
    }
}
   6501  1.1  mrg 
   6502  1.1  mrg /* Validate compute dimensions of an OpenACC offload or routine, fill
   6503  1.1  mrg    in non-unity defaults.  FN_LEVEL indicates the level at which a
   6504  1.1  mrg    routine might spawn a loop.  It is negative for non-routines.  If
   6505  1.1  mrg    DECL is null, we are validating the default dimensions.  */
   6506  1.1  mrg 
   6507  1.1  mrg static bool
   6508  1.1  mrg nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level, unsigned used)
   6509  1.1  mrg {
   6510  1.1  mrg   int old_dims[GOMP_DIM_MAX];
   6511  1.1  mrg   unsigned int i;
   6512  1.1  mrg 
   6513  1.1  mrg   for (i = 0; i < GOMP_DIM_MAX; ++i)
   6514  1.1  mrg     old_dims[i] = dims[i];
   6515  1.1  mrg 
   6516  1.1  mrg   nvptx_goacc_validate_dims_1 (decl, dims, fn_level, used);
   6517  1.1  mrg 
   6518  1.1  mrg   gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
   6519  1.1  mrg   if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0)
   6520  1.1  mrg     gcc_assert (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] <= PTX_CTA_SIZE);
   6521  1.1  mrg 
   6522  1.1  mrg   for (i = 0; i < GOMP_DIM_MAX; ++i)
   6523  1.1  mrg     if (old_dims[i] != dims[i])
   6524  1.1  mrg       return true;
   6525  1.1  mrg 
   6526  1.1  mrg   return false;
   6527  1.1  mrg }
   6528  1.1  mrg 
   6529  1.1  mrg /* Return maximum dimension size, or zero for unbounded.  */
   6530  1.1  mrg 
   6531  1.1  mrg static int
   6532  1.1  mrg nvptx_dim_limit (int axis)
   6533  1.1  mrg {
   6534  1.1  mrg   switch (axis)
   6535  1.1  mrg     {
   6536  1.1  mrg     case GOMP_DIM_VECTOR:
   6537  1.1  mrg       return PTX_MAX_VECTOR_LENGTH;
   6538  1.1  mrg 
   6539  1.1  mrg     default:
   6540  1.1  mrg       break;
   6541  1.1  mrg     }
   6542  1.1  mrg   return 0;
   6543  1.1  mrg }
   6544  1.1  mrg 
   6545  1.1  mrg /* Determine whether fork & joins are needed.  */
   6546  1.1  mrg 
   6547  1.1  mrg static bool
   6548  1.1  mrg nvptx_goacc_fork_join (gcall *call, const int dims[],
   6549  1.1  mrg 		       bool ARG_UNUSED (is_fork))
   6550  1.1  mrg {
   6551  1.1  mrg   tree arg = gimple_call_arg (call, 2);
   6552  1.1  mrg   unsigned axis = TREE_INT_CST_LOW (arg);
   6553  1.1  mrg 
   6554  1.1  mrg   /* We only care about worker and vector partitioning.  */
   6555  1.1  mrg   if (axis < GOMP_DIM_WORKER)
   6556  1.1  mrg     return false;
   6557  1.1  mrg 
   6558  1.1  mrg   /* If the size is 1, there's no partitioning.  */
   6559  1.1  mrg   if (dims[axis] == 1)
   6560  1.1  mrg     return false;
   6561  1.1  mrg 
   6562  1.1  mrg   return true;
   6563  1.1  mrg }
   6564  1.1  mrg 
   6565  1.1  mrg /* Generate a PTX builtin function call that returns the address in
   6566  1.1  mrg    the worker reduction buffer at OFFSET.  TYPE is the type of the
   6567  1.1  mrg    data at that location.  */
   6568  1.1  mrg 
   6569  1.1  mrg static tree
   6570  1.1  mrg nvptx_get_shared_red_addr (tree type, tree offset, bool vector)
   6571  1.1  mrg {
   6572  1.1  mrg   enum nvptx_builtins addr_dim = NVPTX_BUILTIN_WORKER_ADDR;
   6573  1.1  mrg   if (vector)
   6574  1.1  mrg     addr_dim = NVPTX_BUILTIN_VECTOR_ADDR;
   6575  1.1  mrg   machine_mode mode = TYPE_MODE (type);
   6576  1.1  mrg   tree fndecl = nvptx_builtin_decl (addr_dim, true);
   6577  1.1  mrg   tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
   6578  1.1  mrg   tree align = build_int_cst (unsigned_type_node,
   6579  1.1  mrg 			      GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
   6580  1.1  mrg   tree call = build_call_expr (fndecl, 3, offset, size, align);
   6581  1.1  mrg 
   6582  1.1  mrg   return fold_convert (build_pointer_type (type), call);
   6583  1.1  mrg }
   6584  1.1  mrg 
/* Emit a SHFL.DOWN using index SHFL of VAR into DEST_VAR.  This function
   will cast the variable if necessary.  */

static void
nvptx_generate_vector_shuffle (location_t loc,
			       tree dest_var, tree var, unsigned shift,
			       gimple_seq *seq)
{
  unsigned fn = NVPTX_BUILTIN_SHUFFLE;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);
  tree dest_type = var_type;

  /* For complex values operate on the component type; the two parts
     are shuffled separately below.  */
  if (TREE_CODE (var_type) == COMPLEX_TYPE)
    var_type = TREE_TYPE (var_type);

  /* Floating-point payloads are reinterpreted bit-for-bit as integers,
     not numerically converted.  */
  if (TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  /* 64-bit payloads go through the long-long variant of the builtin.  */
  if (TYPE_SIZE (var_type)
      == TYPE_SIZE (long_long_unsigned_type_node))
    {
      fn = NVPTX_BUILTIN_SHUFFLELL;
      arg_type = long_long_unsigned_type_node;
    }

  tree call = nvptx_builtin_decl (fn, true);
  tree bits = build_int_cst (unsigned_type_node, shift);
  tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
  tree expr;

  /* var_type != dest_type only in the complex case above.  */
  if (var_type != dest_type)
    {
      /* Do real and imaginary parts separately.  */
      tree real = fold_build1 (REALPART_EXPR, var_type, var);
      real = fold_build1 (code, arg_type, real);
      real = build_call_expr_loc (loc, call, 3, real, bits, kind);
      real = fold_build1 (code, var_type, real);

      tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
      imag = fold_build1 (code, arg_type, imag);
      imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
      imag = fold_build1 (code, var_type, imag);

      expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
    }
  else
    {
      /* Cast to the builtin's argument type, shuffle, cast back.  */
      expr = fold_build1 (code, arg_type, var);
      expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
      expr = fold_build1 (code, dest_type, expr);
    }

  gimplify_assign (dest_var, expr, seq);
}
   6641  1.1  mrg 
   6642  1.1  mrg /* Lazily generate the global lock var decl and return its address.  */
   6643  1.1  mrg 
   6644  1.1  mrg static tree
   6645  1.1  mrg nvptx_global_lock_addr ()
   6646  1.1  mrg {
   6647  1.1  mrg   tree v = global_lock_var;
   6648  1.1  mrg 
   6649  1.1  mrg   if (!v)
   6650  1.1  mrg     {
   6651  1.1  mrg       tree name = get_identifier ("__reduction_lock");
   6652  1.1  mrg       tree type = build_qualified_type (unsigned_type_node,
   6653  1.1  mrg 					TYPE_QUAL_VOLATILE);
   6654  1.1  mrg       v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
   6655  1.1  mrg       global_lock_var = v;
   6656  1.1  mrg       DECL_ARTIFICIAL (v) = 1;
   6657  1.1  mrg       DECL_EXTERNAL (v) = 1;
   6658  1.1  mrg       TREE_STATIC (v) = 1;
   6659  1.1  mrg       TREE_PUBLIC (v) = 1;
   6660  1.1  mrg       TREE_USED (v) = 1;
   6661  1.1  mrg       mark_addressable (v);
   6662  1.1  mrg       mark_decl_referenced (v);
   6663  1.1  mrg     }
   6664  1.1  mrg 
   6665  1.1  mrg   return build_fold_addr_expr (v);
   6666  1.1  mrg }
   6667  1.1  mrg 
/* Insert code to locklessly update *PTR with *PTR OP VAR just before
   GSI.  We use a lockless scheme for nearly all case, which looks
   like:
     actual = initval(OP);
     do {
       guess = actual;
       write = guess OP myval;
       actual = cmp&swap (ptr, guess, write)
     } while (actual bit-different-to guess);
   return write;

   This relies on a cmp&swap instruction, which is available for 32-
   and 64-bit types.  Larger types must use a locking scheme.  */

static tree
nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
		       tree ptr, tree var, tree_code op)
{
  unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);

  /* Float/complex payloads are moved through same-sized integers
     bit-for-bit; the cmp&swap compares raw bits.  */
  if (TREE_CODE (var_type) == COMPLEX_TYPE
      || TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  /* 64-bit payloads use the long-long cmp&swap builtin.  */
  if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
    {
      arg_type = long_long_unsigned_type_node;
      fn = NVPTX_BUILTIN_CMP_SWAPLL;
    }

  tree swap_fn = nvptx_builtin_decl (fn, true);

  /* Seed the loop with initval(OP), the operation's neutral element.  */
  gimple_seq init_seq = NULL;
  tree init_var = make_ssa_name (arg_type);
  tree init_expr = omp_reduction_init_op (loc, op, var_type);
  init_expr = fold_build1 (code, arg_type, init_expr);
  gimplify_assign (init_var, init_expr, &init_seq);
  gimple *init_end = gimple_seq_last (init_seq);

  gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);

  /* Split the block just after the init stmts.  */
  basic_block pre_bb = gsi_bb (*gsi);
  edge pre_edge = split_block (pre_bb, init_end);
  basic_block loop_bb = pre_edge->dest;
  pre_bb = pre_edge->src;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* EXPECT is the loop-carried guess (PHI below), ACTUAL the cmp&swap
     result, WRITE the value we try to store.  */
  tree expect_var = make_ssa_name (arg_type);
  tree actual_var = make_ssa_name (arg_type);
  tree write_var = make_ssa_name (arg_type);

  /* Build and insert the reduction calculation.  */
  gimple_seq red_seq = NULL;
  tree write_expr = fold_build1 (code, var_type, expect_var);
  write_expr = fold_build2 (op, var_type, write_expr, var);
  write_expr = fold_build1 (code, arg_type, write_expr);
  gimplify_assign (write_var, write_expr, &red_seq);

  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the cmp&swap sequence.  */
  gimple_seq latch_seq = NULL;
  tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
					ptr, expect_var, write_var);
  gimplify_assign (actual_var, swap_expr, &latch_seq);

  /* Loop exits when the swap saw the value we expected.  */
  gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
				   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&latch_seq, cond);

  gimple *latch_end = gimple_seq_last (latch_seq);
  gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);

  /* Split the block just after the latch stmts.  */
  edge post_edge = split_block (loop_bb, latch_end);
  basic_block post_bb = post_edge->dest;
  loop_bb = post_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Turn the fallthru into the loop-exit (true) edge and add the
     backedge (false) for the retry path.  */
  post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  post_edge->probability = profile_probability::even ();
  edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
  loop_edge->probability = profile_probability::even ();
  set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
  set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);

  /* EXPECT = PHI <init on entry, actual on retry>.  */
  gphi *phi = create_phi_node (expect_var, loop_bb);
  add_phi_arg (phi, init_var, pre_edge, loc);
  add_phi_arg (phi, actual_var, loop_edge, loc);

  /* Register the new single-block loop with the loop tree.  */
  loop *loop = alloc_loop ();
  loop->header = loop_bb;
  loop->latch = loop_bb;
  add_loop (loop, loop_bb->loop_father);

  /* Return the successfully-written value, cast back to VAR's type.  */
  return fold_build1 (code, var_type, write_var);
}
   6770  1.1  mrg 
/* Insert code to lockfully update *PTR with *PTR OP VAR just before
   GSI.  This is necessary for types larger than 64 bits, where there
   is no cmp&swap instruction to implement a lockless scheme.  We use
   a lock variable in global memory.

   while (cmp&swap (&lock_var, 0, 1))
     continue;
   T accum = *ptr;
   accum = accum OP var;
   *ptr = accum;
   cmp&swap (&lock_var, 1, 0);
   return accum;

   A lock in global memory is necessary to force execution engine
   descheduling and avoid resource starvation that can occur if the
   lock is in .shared memory.  */

static tree
nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
		       tree ptr, tree var, tree_code op, int level)
{
  tree var_type = TREE_TYPE (var);
  tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
  tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
  tree uns_locked = build_int_cst (unsigned_type_node, 1);

  /* Split the block just before the gsi.  Insert a gimple nop to make
     this easier.  */
  gimple *nop = gimple_build_nop ();
  gsi_insert_before (gsi, nop, GSI_SAME_STMT);
  basic_block entry_bb = gsi_bb (*gsi);
  edge entry_edge = split_block (entry_bb, nop);
  basic_block lock_bb = entry_edge->dest;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Build and insert the locking sequence: spin until the 0->1
     cmp&swap on the global lock succeeds.  */
  gimple_seq lock_seq = NULL;
  tree lock_var = make_ssa_name (unsigned_type_node);
  tree lock_expr = nvptx_global_lock_addr ();
  lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
				   uns_unlocked, uns_locked);
  gimplify_assign (lock_var, lock_expr, &lock_seq);
  gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
				   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&lock_seq, cond);
  gimple *lock_end = gimple_seq_last (lock_seq);
  gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);

  /* Split the block just after the lock sequence.  */
  edge locked_edge = split_block (lock_bb, lock_end);
  basic_block update_bb = locked_edge->dest;
  lock_bb = locked_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Create the lock loop ... */
  locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  locked_edge->probability = profile_probability::even ();
  edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
  loop_edge->probability = profile_probability::even ();
  set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
  set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);

  /* ... and the loop structure.  */
  loop *lock_loop = alloc_loop ();
  lock_loop->header = lock_bb;
  lock_loop->latch = lock_bb;
  lock_loop->nb_iterations_estimate = 1;
  lock_loop->any_estimate = true;
  add_loop (lock_loop, entry_bb->loop_father);

  /* Build the pre-barrier: gang-level reductions need a global
     barrier, lower levels only a CTA-wide one.  */
  gimple_seq red_seq = NULL;
  enum nvptx_builtins barrier_builtin
    = (level == GOMP_DIM_GANG
       ? NVPTX_BUILTIN_MEMBAR_GL
       : NVPTX_BUILTIN_MEMBAR_CTA);
  tree barrier_fn = nvptx_builtin_decl (barrier_builtin, true);
  tree barrier_expr = build_call_expr_loc (loc, barrier_fn, 0);
  gimplify_stmt (&barrier_expr, &red_seq);

  /* Build the reduction calculation.  */
  tree acc_in = make_ssa_name (var_type);
  tree ref_in = build_simple_mem_ref (ptr);
  TREE_THIS_VOLATILE (ref_in) = 1;
  gimplify_assign (acc_in, ref_in, &red_seq);

  tree acc_out = make_ssa_name (var_type);
  /* NOTE(review): the update uses REF_IN (a fresh volatile read of
     *PTR) rather than ACC_IN -- this matches upstream, but confirm
     the second read is intended.  */
  tree update_expr = fold_build2 (op, var_type, ref_in, var);
  gimplify_assign (acc_out, update_expr, &red_seq);

  /* Store the updated value back through a volatile ref.  */
  tree ref_out = build_simple_mem_ref (ptr);
  TREE_THIS_VOLATILE (ref_out) = 1;
  gimplify_assign (ref_out, acc_out, &red_seq);

  /* Build the post-barrier.  */
  barrier_expr = build_call_expr_loc (loc, barrier_fn, 0);
  gimplify_stmt (&barrier_expr, &red_seq);

  /* Insert the reduction calculation.  */
  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the unlock sequence: 1->0 cmp&swap releases the
     lock; its result is discarded.  */
  gimple_seq unlock_seq = NULL;
  tree unlock_expr = nvptx_global_lock_addr ();
  unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
				     uns_locked, uns_unlocked);
  gimplify_and_add (unlock_expr, &unlock_seq);
  gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);

  /* Return the updated value.  */
  return acc_out;
}
   6883  1.1  mrg 
   6884  1.1  mrg /* Emit a sequence to update a reduction accumlator at *PTR with the
   6885  1.1  mrg    value held in VAR using operator OP.  Return the updated value.
   6886  1.1  mrg 
   6887  1.1  mrg    TODO: optimize for atomic ops and indepedent complex ops.  */
   6888  1.1  mrg 
   6889  1.1  mrg static tree
   6890  1.1  mrg nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
   6891  1.1  mrg 			tree ptr, tree var, tree_code op, int level)
   6892  1.1  mrg {
   6893  1.1  mrg   tree type = TREE_TYPE (var);
   6894  1.1  mrg   tree size = TYPE_SIZE (type);
   6895  1.1  mrg 
   6896  1.1  mrg   if (size == TYPE_SIZE (unsigned_type_node)
   6897  1.1  mrg       || size == TYPE_SIZE (long_long_unsigned_type_node))
   6898  1.1  mrg     return nvptx_lockless_update (loc, gsi, ptr, var, op);
   6899  1.1  mrg   else
   6900  1.1  mrg     return nvptx_lockfull_update (loc, gsi, ptr, var, op, level);
   6901  1.1  mrg }
   6902  1.1  mrg 
/* NVPTX implementation of GOACC_REDUCTION_SETUP.

   Replace the internal-fn CALL with gimple that stores the incoming
   reduction value into the shared reduction buffer (for worker, and
   for vector when the vector is wider than a warp) and copies it to
   the call's LHS.  */

static void
nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);	/* Incoming value.  */
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level != GOMP_DIM_GANG)
    {
      /* Copy the receiver object.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      /* A zero ref_to_res means there is no receiver object.  */
      if (!integer_zerop (ref_to_res))
	var = build_simple_mem_ref (ref_to_res);
    }

  if (level == GOMP_DIM_WORKER
      || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
    {
      /* Store incoming value to worker reduction buffer.  */
      tree offset = gimple_call_arg (call, 5);
      tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
					     level == GOMP_DIM_VECTOR);
      tree ptr = make_ssa_name (TREE_TYPE (call));

      gimplify_assign (ptr, call, &seq);
      /* The buffer slot is accessed through a volatile ref.  */
      tree ref = build_simple_mem_ref (ptr);
      TREE_THIS_VOLATILE (ref) = 1;
      gimplify_assign (ref, var, &seq);
    }

  if (lhs)
    gimplify_assign (lhs, var, &seq);

  pop_gimplify_context (NULL);
  /* Replace the GOACC_REDUCTION_SETUP call with the built sequence.  */
  gsi_replace_with_seq (&gsi, seq, true);
}
   6946  1.1  mrg 
/* NVPTX implementation of GOACC_REDUCTION_INIT.  Replace CALL with code
   assigning the reduction's neutral initializer value.  For warp-sized
   vector partitioning, build control flow so that only lanes with
   non-zero position take the neutral value while lane zero keeps the
   incoming VAR (merged through a phi node); otherwise emit a plain
   assignment of the initializer (or of VAR, at gang level with no
   receiver object).  OA describes the launch geometry.  */

static void
nvptx_goacc_reduction_init (gcall *call, offload_attrs *oa)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  /* Argument 4 carries the reduction operation's tree code.  */
  enum tree_code rcode
    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
  tree init = omp_reduction_init_op (gimple_location (call), rcode,
				     TREE_TYPE (var));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
    {
      /* Initialize vector-non-zeroes to INIT_VAL (OP).  */
      tree tid = make_ssa_name (integer_type_node);
      tree dim_vector = gimple_call_arg (call, 3);
      gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
						     dim_vector);
      /* Branch on TID != 0; the true path performs the neutral
	 initialization, the false path keeps VAR.  */
      gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
					     NULL_TREE, NULL_TREE);

      gimple_call_set_lhs (tid_call, tid);
      gimple_seq_add_stmt (&seq, tid_call);
      gimple_seq_add_stmt (&seq, cond_stmt);

      /* Split the block just after the call.  */
      edge init_edge = split_block (gsi_bb (gsi), call);
      basic_block init_bb = init_edge->dest;
      basic_block call_bb = init_edge->src;

      /* Fixup flags from call_bb to init_bb.  */
      init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
      init_edge->probability = profile_probability::even ();

      /* Set the initialization stmts.  */
      gimple_seq init_seq = NULL;
      tree init_var = make_ssa_name (TREE_TYPE (var));
      gimplify_assign (init_var, init, &init_seq);
      gsi = gsi_start_bb (init_bb);
      gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);

      /* Split block just after the init stmt.  */
      gsi_prev (&gsi);
      edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
      basic_block dst_bb = inited_edge->dest;

      /* Create false edge from call_bb to dst_bb.  */
      edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
      nop_edge->probability = profile_probability::even ();

      /* Create phi node in dst block, merging the initialized value
	 with the incoming VAR from the bypass (false) edge.  */
      gphi *phi = create_phi_node (lhs, dst_bb);
      add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
      add_phi_arg (phi, var, nop_edge, gimple_location (call));

      /* Reset dominator of dst bb.  */
      set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);

      /* Reset the gsi.  */
      gsi = gsi_for_stmt (call);
    }
  else
    {
      if (level == GOMP_DIM_GANG)
	{
	  /* If there's no receiver object, propagate the incoming VAR.  */
	  tree ref_to_res = gimple_call_arg (call, 1);
	  if (integer_zerop (ref_to_res))
	    init = var;
	}

      if (lhs != NULL_TREE)
	gimplify_assign (lhs, init, &seq);
    }

  pop_gimplify_context (NULL);
  gsi_replace_with_seq (&gsi, seq, true);
}
   7031  1.1  mrg 
   7032  1.1  mrg /* NVPTX implementation of GOACC_REDUCTION_FINI.  */
   7033  1.1  mrg 
   7034  1.1  mrg static void
   7035  1.1  mrg nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa)
   7036  1.1  mrg {
   7037  1.1  mrg   gimple_stmt_iterator gsi = gsi_for_stmt (call);
   7038  1.1  mrg   tree lhs = gimple_call_lhs (call);
   7039  1.1  mrg   tree ref_to_res = gimple_call_arg (call, 1);
   7040  1.1  mrg   tree var = gimple_call_arg (call, 2);
   7041  1.1  mrg   int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
   7042  1.1  mrg   enum tree_code op
   7043  1.1  mrg     = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
   7044  1.1  mrg   gimple_seq seq = NULL;
   7045  1.1  mrg   tree r = NULL_TREE;;
   7046  1.1  mrg 
   7047  1.1  mrg   push_gimplify_context (true);
   7048  1.1  mrg 
   7049  1.1  mrg   if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
   7050  1.1  mrg     {
   7051  1.1  mrg       /* Emit binary shuffle tree.  TODO. Emit this as an actual loop,
   7052  1.1  mrg 	 but that requires a method of emitting a unified jump at the
   7053  1.1  mrg 	 gimple level.  */
   7054  1.1  mrg       for (int shfl = PTX_WARP_SIZE / 2; shfl > 0; shfl = shfl >> 1)
   7055  1.1  mrg 	{
   7056  1.1  mrg 	  tree other_var = make_ssa_name (TREE_TYPE (var));
   7057  1.1  mrg 	  nvptx_generate_vector_shuffle (gimple_location (call),
   7058  1.1  mrg 					 other_var, var, shfl, &seq);
   7059  1.1  mrg 
   7060  1.1  mrg 	  r = make_ssa_name (TREE_TYPE (var));
   7061  1.1  mrg 	  gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
   7062  1.1  mrg 					   var, other_var), &seq);
   7063  1.1  mrg 	  var = r;
   7064  1.1  mrg 	}
   7065  1.1  mrg     }
   7066  1.1  mrg   else
   7067  1.1  mrg     {
   7068  1.1  mrg       tree accum = NULL_TREE;
   7069  1.1  mrg 
   7070  1.1  mrg       if (level == GOMP_DIM_WORKER || level == GOMP_DIM_VECTOR)
   7071  1.1  mrg 	{
   7072  1.1  mrg 	  /* Get reduction buffer address.  */
   7073  1.1  mrg 	  tree offset = gimple_call_arg (call, 5);
   7074  1.1  mrg 	  tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
   7075  1.1  mrg 						 level == GOMP_DIM_VECTOR);
   7076  1.1  mrg 	  tree ptr = make_ssa_name (TREE_TYPE (call));
   7077  1.1  mrg 
   7078  1.1  mrg 	  gimplify_assign (ptr, call, &seq);
   7079  1.1  mrg 	  accum = ptr;
   7080  1.1  mrg 	}
   7081  1.1  mrg       else if (integer_zerop (ref_to_res))
   7082  1.1  mrg 	r = var;
   7083  1.1  mrg       else
   7084  1.1  mrg 	accum = ref_to_res;
   7085  1.1  mrg 
   7086  1.1  mrg       if (accum)
   7087  1.1  mrg 	{
   7088  1.1  mrg 	  /* UPDATE the accumulator.  */
   7089  1.1  mrg 	  gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
   7090  1.1  mrg 	  seq = NULL;
   7091  1.1  mrg 	  r = nvptx_reduction_update (gimple_location (call), &gsi,
   7092  1.1  mrg 				      accum, var, op, level);
   7093  1.1  mrg 	}
   7094  1.1  mrg     }
   7095  1.1  mrg 
   7096  1.1  mrg   if (lhs)
   7097  1.1  mrg     gimplify_assign (lhs, r, &seq);
   7098  1.1  mrg   pop_gimplify_context (NULL);
   7099  1.1  mrg 
   7100  1.1  mrg   gsi_replace_with_seq (&gsi, seq, true);
   7101  1.1  mrg }
   7102  1.1  mrg 
/* NVPTX implementation of GOACC_REDUCTION_TEARDOWN.  Replace CALL with
   code publishing the final reduction value: for worker / large-vector
   levels read it back from the shared reduction buffer, and for all
   non-gang levels store it to the receiver object (argument 1, if any).
   OA describes the launch geometry.  */

static void
nvptx_goacc_reduction_teardown (gcall *call, offload_attrs *oa)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  gimple_seq seq = NULL;

  push_gimplify_context (true);
  if (level == GOMP_DIM_WORKER
      || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
    {
      /* Read the worker reduction buffer.  */
      tree offset = gimple_call_arg (call, 5);
      /* NOTE: this local shadows the CALL parameter; it holds the
	 address-computation call tree.  */
      tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
					     level == GOMP_DIM_VECTOR);
      tree ptr = make_ssa_name (TREE_TYPE (call));

      gimplify_assign (ptr, call, &seq);
      var = build_simple_mem_ref (ptr);
      /* Volatile load: the buffer contents were presumably written by
	 other threads.  */
      TREE_THIS_VOLATILE (var) = 1;
    }

  if (level != GOMP_DIM_GANG)
    {
      /* Write to the receiver object.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
	gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
    }

  if (lhs)
    gimplify_assign (lhs, var, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}
   7145  1.1  mrg 
   7146  1.1  mrg /* NVPTX reduction expander.  */
   7147  1.1  mrg 
   7148  1.1  mrg static void
   7149  1.1  mrg nvptx_goacc_reduction (gcall *call)
   7150  1.1  mrg {
   7151  1.1  mrg   unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
   7152  1.1  mrg   offload_attrs oa;
   7153  1.1  mrg 
   7154  1.1  mrg   populate_offload_attrs (&oa);
   7155  1.1  mrg 
   7156  1.1  mrg   switch (code)
   7157  1.1  mrg     {
   7158  1.1  mrg     case IFN_GOACC_REDUCTION_SETUP:
   7159  1.1  mrg       nvptx_goacc_reduction_setup (call, &oa);
   7160  1.1  mrg       break;
   7161  1.1  mrg 
   7162  1.1  mrg     case IFN_GOACC_REDUCTION_INIT:
   7163  1.1  mrg       nvptx_goacc_reduction_init (call, &oa);
   7164  1.1  mrg       break;
   7165  1.1  mrg 
   7166  1.1  mrg     case IFN_GOACC_REDUCTION_FINI:
   7167  1.1  mrg       nvptx_goacc_reduction_fini (call, &oa);
   7168  1.1  mrg       break;
   7169  1.1  mrg 
   7170  1.1  mrg     case IFN_GOACC_REDUCTION_TEARDOWN:
   7171  1.1  mrg       nvptx_goacc_reduction_teardown (call, &oa);
   7172  1.1  mrg       break;
   7173  1.1  mrg 
   7174  1.1  mrg     default:
   7175  1.1  mrg       gcc_unreachable ();
   7176  1.1  mrg     }
   7177  1.1  mrg }
   7178  1.1  mrg 
   7179  1.1  mrg static bool
   7180  1.1  mrg nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
   7181  1.1  mrg 			      rtx x ATTRIBUTE_UNUSED)
   7182  1.1  mrg {
   7183  1.1  mrg   return true;
   7184  1.1  mrg }
   7185  1.1  mrg 
   7186  1.1  mrg static bool
   7187  1.1  mrg nvptx_scalar_mode_supported_p (scalar_mode mode)
   7188  1.1  mrg {
   7189  1.1  mrg   if (nvptx_experimental && mode == HFmode && TARGET_SM53)
   7190  1.1  mrg     return true;
   7191  1.1  mrg 
   7192  1.1  mrg   return default_scalar_mode_supported_p (mode);
   7193  1.1  mrg }
   7194  1.1  mrg 
   7195  1.1  mrg static bool
   7196  1.1  mrg nvptx_libgcc_floating_mode_supported_p (scalar_float_mode mode)
   7197  1.1  mrg {
   7198  1.1  mrg   if (nvptx_experimental && mode == HFmode && TARGET_SM53)
   7199  1.1  mrg     return true;
   7200  1.1  mrg 
   7201  1.1  mrg   return default_libgcc_floating_mode_supported_p (mode);
   7202  1.1  mrg }
   7203  1.1  mrg 
   7204  1.1  mrg static bool
   7205  1.1  mrg nvptx_vector_mode_supported (machine_mode mode)
   7206  1.1  mrg {
   7207  1.1  mrg   return (mode == V2SImode
   7208  1.1  mrg 	  || mode == V2DImode);
   7209  1.1  mrg }
   7210  1.1  mrg 
   7211  1.1  mrg /* Return the preferred mode for vectorizing scalar MODE.  */
   7212  1.1  mrg 
   7213  1.1  mrg static machine_mode
   7214  1.1  mrg nvptx_preferred_simd_mode (scalar_mode mode)
   7215  1.1  mrg {
   7216  1.1  mrg   switch (mode)
   7217  1.1  mrg     {
   7218  1.1  mrg     case E_DImode:
   7219  1.1  mrg       return V2DImode;
   7220  1.1  mrg     case E_SImode:
   7221  1.1  mrg       return V2SImode;
   7222  1.1  mrg 
   7223  1.1  mrg     default:
   7224  1.1  mrg       return default_preferred_simd_mode (mode);
   7225  1.1  mrg     }
   7226  1.1  mrg }
   7227  1.1  mrg 
   7228  1.1  mrg unsigned int
   7229  1.1  mrg nvptx_data_alignment (const_tree type, unsigned int basic_align)
   7230  1.1  mrg {
   7231  1.1  mrg   if (TREE_CODE (type) == INTEGER_TYPE)
   7232  1.1  mrg     {
   7233  1.1  mrg       unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
   7234  1.1  mrg       if (size == GET_MODE_SIZE (TImode))
   7235  1.1  mrg 	return GET_MODE_BITSIZE (maybe_split_mode (TImode));
   7236  1.1  mrg     }
   7237  1.1  mrg 
   7238  1.1  mrg   return basic_align;
   7239  1.1  mrg }
   7240  1.1  mrg 
/* Implement TARGET_MODES_TIEABLE_P.  No pair of modes is tieable on
   this target.  */

static bool
nvptx_modes_tieable_p (machine_mode, machine_mode)
{
  return false;
}
   7248  1.1  mrg 
/* Implement TARGET_HARD_REGNO_NREGS.  Every value occupies exactly one
   register, regardless of mode.  */

static unsigned int
nvptx_hard_regno_nregs (unsigned int, machine_mode)
{
  return 1;
}
   7256  1.1  mrg 
/* Implement TARGET_CAN_CHANGE_MODE_CLASS.  Never allow a register's
   mode to be changed in place.  */

static bool
nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
{
  return false;
}
   7264  1.1  mrg 
/* Implement TARGET_TRULY_NOOP_TRUNCATION.  Truncation always requires
   an explicit operation on this target.  */

static bool
nvptx_truly_noop_truncation (poly_uint64, poly_uint64)
{
  return false;
}
   7272  1.1  mrg 
   7273  1.1  mrg /* Implement TARGET_GOACC_ADJUST_PRIVATE_DECL.  */
   7274  1.1  mrg 
   7275  1.1  mrg static tree
   7276  1.1  mrg nvptx_goacc_adjust_private_decl (location_t loc, tree decl, int level)
   7277  1.1  mrg {
   7278  1.1  mrg   gcc_checking_assert (!lookup_attribute ("oacc gang-private",
   7279  1.1  mrg 					  DECL_ATTRIBUTES (decl)));
   7280  1.1  mrg 
   7281  1.1  mrg   /* Set "oacc gang-private" attribute for gang-private variable
   7282  1.1  mrg      declarations.  */
   7283  1.1  mrg   if (level == GOMP_DIM_GANG)
   7284  1.1  mrg     {
   7285  1.1  mrg       tree id = get_identifier ("oacc gang-private");
   7286  1.1  mrg       /* For later diagnostic purposes, pass LOC as VALUE (wrapped as a
   7287  1.1  mrg 	 TREE).  */
   7288  1.1  mrg       tree loc_tree = build_empty_stmt (loc);
   7289  1.1  mrg       DECL_ATTRIBUTES (decl)
   7290  1.1  mrg 	= tree_cons (id, loc_tree, DECL_ATTRIBUTES (decl));
   7291  1.1  mrg     }
   7292  1.1  mrg 
   7293  1.1  mrg   return decl;
   7294  1.1  mrg }
   7295  1.1  mrg 
/* Implement TARGET_GOACC_EXPAND_VAR_DECL.  Expand VAR to an RTX
   location.  Variables tagged "oacc gang-private" (see
   nvptx_goacc_adjust_private_decl) are placed at an offset from
   gang_private_shared_sym, with offsets assigned lazily and cached in
   gang_private_shared_hmap; all other variables return NULL_RTX to get
   the default expansion.  */

static rtx
nvptx_goacc_expand_var_decl (tree var)
{
  /* Place "oacc gang-private" variables in shared memory.  */
  if (tree attr = lookup_attribute ("oacc gang-private",
				    DECL_ATTRIBUTES (var)))
    {
      gcc_checking_assert (VAR_P (var));

      unsigned int offset, *poffset;
      poffset = gang_private_shared_hmap.get (var);
      if (poffset)
	offset = *poffset;
      else
	{
	  /* First use of VAR: round the running buffer size up to VAR's
	     alignment, grow the buffer alignment if needed, then carve
	     out VAR's slot and remember its offset.  */
	  unsigned HOST_WIDE_INT align = DECL_ALIGN (var);
	  gang_private_shared_size
	    = (gang_private_shared_size + align - 1) & ~(align - 1);
	  if (gang_private_shared_align < align)
	    gang_private_shared_align = align;

	  offset = gang_private_shared_size;
	  bool existed = gang_private_shared_hmap.put (var, offset);
	  gcc_checking_assert (!existed);
	  gang_private_shared_size += tree_to_uhwi (DECL_SIZE_UNIT (var));

	  /* The attribute value carries the source location recorded by
	     nvptx_goacc_adjust_private_decl.  */
	  location_t loc = EXPR_LOCATION (TREE_VALUE (attr));
#if 0 /* For some reason, this doesn't work.  */
	  if (dump_enabled_p ())
	    {
	      dump_flags_t l_dump_flags
		= get_openacc_privatization_dump_flags ();

	      const dump_user_location_t d_u_loc
		= dump_user_location_t::from_location_t (loc);
/* PR100695 "Format decoder, quoting in 'dump_printf' etc." */
#if __GNUC__ >= 10
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wformat"
#endif
	      dump_printf_loc (l_dump_flags, d_u_loc,
			       "variable %<%T%> adjusted for OpenACC"
			       " privatization level: %qs\n",
			       var, "gang");
#if __GNUC__ >= 10
# pragma GCC diagnostic pop
#endif
	    }
#else /* ..., thus emulate that, good enough for testsuite usage.  */
	  if (param_openacc_privatization != OPENACC_PRIVATIZATION_QUIET)
	    inform (loc,
		    "variable %qD adjusted for OpenACC privatization level:"
		    " %qs",
		    var, "gang");
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    {
	      /* 'dumpfile.cc:dump_loc' */
	      fprintf (dump_file, "%s:%d:%d: ", LOCATION_FILE (loc),
		       LOCATION_LINE (loc), LOCATION_COLUMN (loc));
	      fprintf (dump_file, "%s: ", "note");

	      fprintf (dump_file,
		       "variable '");
	      print_generic_expr (dump_file, var, TDF_SLIM);
	      fprintf (dump_file,
		       "' adjusted for OpenACC privatization level: '%s'\n",
		       "gang");
	    }
#endif
	}
      /* Address VAR as gang_private_shared_sym + its assigned offset.  */
      rtx addr = plus_constant (Pmode, gang_private_shared_sym, offset);
      return gen_rtx_MEM (TYPE_MODE (TREE_TYPE (var)), addr);
    }

  return NULL_RTX;
}
   7374  1.1  mrg 
/* The function we most recently switched to; used to avoid repeating
   the per-function reset below.  GC-rooted via GTY.  */
static GTY(()) tree nvptx_previous_fndecl;

/* Implement TARGET_SET_CURRENT_FUNCTION.  Reset per-function state
   (the gang-private offset map and the reduction buffer partitions)
   when switching to a different FNDECL.  */

static void
nvptx_set_current_function (tree fndecl)
{
  if (!fndecl || fndecl == nvptx_previous_fndecl)
    return;

  gang_private_shared_hmap.empty ();
  nvptx_previous_fndecl = fndecl;
  vector_red_partition = 0;
  oacc_bcast_partition = 0;
}
   7388  1.1  mrg 
   7389  1.1  mrg /* Implement TARGET_LIBC_HAS_FUNCTION.  */
   7390  1.1  mrg 
   7391  1.1  mrg bool
   7392  1.1  mrg nvptx_libc_has_function (enum function_class fn_class, tree type)
   7393  1.1  mrg {
   7394  1.1  mrg   if (fn_class == function_sincos)
   7395  1.1  mrg     {
   7396  1.1  mrg       if (type != NULL_TREE)
   7397  1.1  mrg 	/* Currently, newlib does not support sincosl.  */
   7398  1.1  mrg 	return type == float_type_node || type == double_type_node;
   7399  1.1  mrg       else
   7400  1.1  mrg 	return true;
   7401  1.1  mrg     }
   7402  1.1  mrg 
   7403  1.1  mrg   return default_libc_has_function (fn_class, type);
   7404  1.1  mrg }
   7405  1.1  mrg 
   7406  1.1  mrg bool
   7407  1.1  mrg nvptx_mem_local_p (rtx mem)
   7408  1.1  mrg {
   7409  1.1  mrg   gcc_assert (GET_CODE (mem) == MEM);
   7410  1.1  mrg 
   7411  1.1  mrg   struct address_info info;
   7412  1.1  mrg   decompose_mem_address (&info, mem);
   7413  1.1  mrg 
   7414  1.1  mrg   if (info.base != NULL && REG_P (*info.base)
   7415  1.1  mrg       && REGNO_PTR_FRAME_P (REGNO (*info.base)))
   7416  1.1  mrg     {
   7417  1.1  mrg       if (TARGET_SOFT_STACK)
   7418  1.1  mrg 	{
   7419  1.1  mrg 	  /* Frame-related doesn't mean local.  */
   7420  1.1  mrg 	}
   7421  1.1  mrg       else
   7422  1.1  mrg 	return true;
   7423  1.1  mrg     }
   7424  1.1  mrg 
   7425  1.1  mrg   return false;
   7426  1.1  mrg }
   7427  1.1  mrg 
/* Define locally, for use in NVPTX_ASM_OUTPUT_DEF.  */
#define SET_ASM_OP ".alias "

/* Define locally, for use in nvptx_asm_output_def_from_decls.  Add NVPTX_
   prefix to avoid clash with ASM_OUTPUT_DEF from nvptx.h.
   Copy of ASM_OUTPUT_DEF from defaults.h, with added terminating
   semicolon.  */
#define NVPTX_ASM_OUTPUT_DEF(FILE,LABEL1,LABEL2)	\
  do							\
    {							\
      fprintf ((FILE), "%s", SET_ASM_OP);		\
      assemble_name (FILE, LABEL1);			\
      fprintf (FILE, ",");				\
      assemble_name (FILE, LABEL2);			\
      fprintf (FILE, ";\n");				\
    }							\
  while (0)

/* Output an alias definition making NAME an alias of VALUE, as a PTX
   ".alias" directive preceded by NAME's function prototype.  Only
   function aliases are supported, and only when -malias and ptx ISA
   6.3 or later are in effect; otherwise an error is emitted.  */

void
nvptx_asm_output_def_from_decls (FILE *stream, tree name, tree value)
{
  if (nvptx_alias == 0 || !TARGET_PTX_6_3)
    {
      /* Copied from assemble_alias.  */
      error_at (DECL_SOURCE_LOCATION (name),
		"alias definitions not supported in this configuration");
      TREE_ASM_WRITTEN (name) = 1;
      return;
    }

  if (lookup_attribute ("weak", DECL_ATTRIBUTES (name)))
    {
      /* Prevent execution FAILs for gcc.dg/globalalias.c and
	 gcc.dg/pr77587.c.  */
      error_at (DECL_SOURCE_LOCATION (name),
		"weak alias definitions not supported in this configuration");
      TREE_ASM_WRITTEN (name) = 1;
      return;
    }

  /* Ptx also doesn't support value having weak linkage, but we can't detect
     that here, so we'll end up with:
     "error: Function test with .weak scope cannot be aliased".
     See gcc.dg/localalias.c.  */

  if (TREE_CODE (name) != FUNCTION_DECL)
    {
      error_at (DECL_SOURCE_LOCATION (name),
		"non-function alias definitions not supported"
		" in this configuration");
      TREE_ASM_WRITTEN (name) = 1;
      return;
    }

  if (!cgraph_node::get (name)->referred_to_p ())
    /* Prevent "Internal error: reference to deleted section".  */
    return;

  /* Emit NAME's prototype, then the ".alias name,value;" directive.  */
  std::stringstream s;
  write_fn_proto (s, false, get_fnname_from_decl (name), name);
  fputs (s.str ().c_str (), stream);

  tree id = DECL_ASSEMBLER_NAME (name);
  NVPTX_ASM_OUTPUT_DEF (stream, IDENTIFIER_POINTER (id),
			IDENTIFIER_POINTER (value));
}

/* The two macros above are only meant for the function above.  */
#undef NVPTX_ASM_OUTPUT_DEF
#undef SET_ASM_OP
   7497  1.1  mrg 
/* Target hook table: install the nvptx implementations defined in this
   file (each hook is documented at its definition), then instantiate
   targetm from the resulting TARGET_INITIALIZER.  */

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE nvptx_option_override

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table

#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_false

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p

#undef  TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode

/* Calling-convention hooks.  */
#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG nvptx_function_arg
#undef TARGET_FUNCTION_INCOMING_ARG
#define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE nvptx_function_value
#undef TARGET_LIBCALL_VALUE
#define TARGET_LIBCALL_VALUE nvptx_libcall_value
#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
#undef TARGET_SPLIT_COMPLEX_ARG
#define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
#undef TARGET_CALL_ARGS
#define TARGET_CALL_ARGS nvptx_call_args
#undef TARGET_END_CALL_ARGS
#define TARGET_END_CALL_ARGS nvptx_end_call_args

/* Assembly-output hooks.  */
#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START nvptx_file_start
#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END nvptx_file_end
#undef TARGET_ASM_GLOBALIZE_LABEL
#define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
#undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
#define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
#undef  TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND nvptx_print_operand
#undef  TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
#undef  TARGET_PRINT_OPERAND_PUNCT_VALID_P
#define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
#undef TARGET_ASM_INTEGER
#define TARGET_ASM_INTEGER nvptx_assemble_integer
#undef TARGET_ASM_DECL_END
#define TARGET_ASM_DECL_END nvptx_assemble_decl_end
#undef TARGET_ASM_DECLARE_CONSTANT_NAME
#define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
#undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
#define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
#undef TARGET_NO_REGISTER_ALLOCATION
#define TARGET_NO_REGISTER_ALLOCATION true

#undef TARGET_ENCODE_SECTION_INFO
#define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
#undef TARGET_RECORD_OFFLOAD_SYMBOL
#define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment

#undef TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p

#undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
#define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS nvptx_init_builtins
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
#undef  TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL nvptx_builtin_decl

#undef TARGET_SIMT_VF
#define TARGET_SIMT_VF nvptx_simt_vf

#undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
#define TARGET_OMP_DEVICE_KIND_ARCH_ISA nvptx_omp_device_kind_arch_isa

/* OpenACC offloading hooks.  */
#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims

#undef TARGET_GOACC_DIM_LIMIT
#define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit

#undef TARGET_GOACC_FORK_JOIN
#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join

#undef TARGET_GOACC_REDUCTION
#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P nvptx_scalar_mode_supported_p

#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
  nvptx_libgcc_floating_mode_supported_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
    nvptx_preferred_simd_mode

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class

#undef TARGET_TRULY_NOOP_TRUNCATION
#define TARGET_TRULY_NOOP_TRUNCATION nvptx_truly_noop_truncation

#undef TARGET_HAVE_SPECULATION_SAFE_VALUE
#define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed

#undef TARGET_GOACC_ADJUST_PRIVATE_DECL
#define TARGET_GOACC_ADJUST_PRIVATE_DECL nvptx_goacc_adjust_private_decl

#undef TARGET_GOACC_EXPAND_VAR_DECL
#define TARGET_GOACC_EXPAND_VAR_DECL nvptx_goacc_expand_var_decl

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function

#undef TARGET_LIBC_HAS_FUNCTION
#define TARGET_LIBC_HAS_FUNCTION nvptx_libc_has_function

struct gcc_target targetm = TARGET_INITIALIZER;

/* Garbage-collector roots generated for this file (GTY markers).  */
#include "gt-nvptx.h"
   7662