1 1.1 mrg /* Target code for NVPTX. 2 1.1 mrg Copyright (C) 2014-2022 Free Software Foundation, Inc. 3 1.1 mrg Contributed by Bernd Schmidt <bernds (at) codesourcery.com> 4 1.1 mrg 5 1.1 mrg This file is part of GCC. 6 1.1 mrg 7 1.1 mrg GCC is free software; you can redistribute it and/or modify it 8 1.1 mrg under the terms of the GNU General Public License as published 9 1.1 mrg by the Free Software Foundation; either version 3, or (at your 10 1.1 mrg option) any later version. 11 1.1 mrg 12 1.1 mrg GCC is distributed in the hope that it will be useful, but WITHOUT 13 1.1 mrg ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 14 1.1 mrg or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public 15 1.1 mrg License for more details. 16 1.1 mrg 17 1.1 mrg You should have received a copy of the GNU General Public License 18 1.1 mrg along with GCC; see the file COPYING3. If not see 19 1.1 mrg <http://www.gnu.org/licenses/>. */ 20 1.1 mrg 21 1.1 mrg #define IN_TARGET_CODE 1 22 1.1 mrg 23 1.1 mrg #include "config.h" 24 1.1 mrg #include <sstream> 25 1.1 mrg #include "system.h" 26 1.1 mrg #include "coretypes.h" 27 1.1 mrg #include "backend.h" 28 1.1 mrg #include "target.h" 29 1.1 mrg #include "rtl.h" 30 1.1 mrg #include "tree.h" 31 1.1 mrg #include "cfghooks.h" 32 1.1 mrg #include "df.h" 33 1.1 mrg #include "memmodel.h" 34 1.1 mrg #include "tm_p.h" 35 1.1 mrg #include "expmed.h" 36 1.1 mrg #include "optabs.h" 37 1.1 mrg #include "regs.h" 38 1.1 mrg #include "emit-rtl.h" 39 1.1 mrg #include "recog.h" 40 1.1 mrg #include "diagnostic.h" 41 1.1 mrg #include "alias.h" 42 1.1 mrg #include "insn-flags.h" 43 1.1 mrg #include "output.h" 44 1.1 mrg #include "insn-attr.h" 45 1.1 mrg #include "flags.h" 46 1.1 mrg #include "dojump.h" 47 1.1 mrg #include "explow.h" 48 1.1 mrg #include "calls.h" 49 1.1 mrg #include "varasm.h" 50 1.1 mrg #include "stmt.h" 51 1.1 mrg #include "expr.h" 52 1.1 mrg #include "tm-preds.h" 53 1.1 mrg #include "tm-constrs.h" 54 1.1 mrg 
#include "langhooks.h"
#include "dbxout.h"
#include "cfgrtl.h"
#include "gimple.h"
#include "stor-layout.h"
#include "builtins.h"
#include "omp-general.h"
#include "omp-low.h"
#include "omp-offload.h"
#include "gomp-constants.h"
#include "dumpfile.h"
#include "internal-fn.h"
#include "gimple-iterator.h"
#include "stringpool.h"
#include "attribs.h"
#include "tree-vrp.h"
#include "tree-ssa-operands.h"
#include "tree-ssanames.h"
#include "gimplify.h"
#include "tree-phinodes.h"
#include "cfgloop.h"
#include "fold-const.h"
#include "intl.h"
#include "opts.h"
#include "tree-pretty-print.h"
#include "rtl-iter.h"
#include "cgraph.h"

/* This file should be included last.  */
#include "target-def.h"

#define WORKAROUND_PTXJIT_BUG 1
#define WORKAROUND_PTXJIT_BUG_2 1
#define WORKAROUND_PTXJIT_BUG_3 1

/* The PTX concept CTA (Concurrent Thread Array) maps on the CUDA concept
   thread block, which has had a maximum number of threads of 1024 since
   CUDA version 2.x.  */
#define PTX_CTA_SIZE 1024

#define PTX_CTA_NUM_BARRIERS 16
#define PTX_WARP_SIZE 32

#define PTX_PER_CTA_BARRIER 0
#define PTX_NUM_PER_CTA_BARRIERS 1
#define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS)
#define PTX_NUM_PER_WORKER_BARRIERS \
  (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS)

#define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
#define PTX_MAX_VECTOR_LENGTH PTX_CTA_SIZE
#define PTX_WORKER_LENGTH 32
#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime.  */

/* The various PTX memory areas an object might reside in.  */
enum nvptx_data_area
{
  DATA_AREA_GENERIC,
  DATA_AREA_GLOBAL,
  DATA_AREA_SHARED,
  DATA_AREA_LOCAL,
  DATA_AREA_CONST,
  DATA_AREA_PARAM,
  DATA_AREA_MAX
};

/* We record the data area in the target symbol flags.  */
#define SYMBOL_DATA_AREA(SYM) \
  (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
		    & 7)
#define SET_SYMBOL_DATA_AREA(SYM,AREA) \
  (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)

/* Record the function decls we've written, and the libfuncs and function
   decls corresponding to them.  */
static std::stringstream func_decls;

struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
{
  static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
  static bool equal (rtx a, rtx b) { return a == b; }
};

static GTY((cache))
  hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;

struct tree_hasher : ggc_cache_ptr_hash<tree_node>
{
  static hashval_t hash (tree t) { return htab_hash_pointer (t); }
  static bool equal (tree a, tree b) { return a == b; }
};

static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;

/* Buffer needed to broadcast across workers and vectors.  This is
   used for both worker-neutering and worker broadcasting, and
   vector-neutering and broadcasting when vector_length > 32.  It is
   shared by all functions emitted.  The buffer is placed in shared
   memory.  It'd be nice if PTX supported common blocks, because then
   this could be shared across TUs (taking the largest size).  */
static unsigned oacc_bcast_size;
static unsigned oacc_bcast_partition;
static unsigned oacc_bcast_align;
static GTY(()) rtx oacc_bcast_sym;

/* Buffer needed for worker reductions.  This has to be distinct from
   the worker broadcast array, as both may be live concurrently.  */
static unsigned worker_red_size;
static unsigned worker_red_align;
static GTY(()) rtx worker_red_sym;

/* Buffer needed for vector reductions, when vector_length >
   PTX_WARP_SIZE.  This has to be distinct from the worker broadcast
   array, as both may be live concurrently.  */
static unsigned vector_red_size;
static unsigned vector_red_align;
static unsigned vector_red_partition;
static GTY(()) rtx vector_red_sym;

/* Shared memory block for gang-private variables.  */
static unsigned gang_private_shared_size;
static unsigned gang_private_shared_align;
static GTY(()) rtx gang_private_shared_sym;
static hash_map<tree_decl_hash, unsigned int> gang_private_shared_hmap;

/* Global lock variable, needed for 128bit worker & gang reductions.  */
static GTY(()) tree global_lock_var;

/* True if any function references __nvptx_stacks.  */
static bool need_softstack_decl;

/* True if any function references __nvptx_uni.  */
static bool need_unisimt_decl;

static int nvptx_mach_max_workers ();

/* Allocate a new, cleared machine_function structure.  */

static struct machine_function *
nvptx_init_machine_status (void)
{
  struct machine_function *p = ggc_cleared_alloc<machine_function> ();
  p->return_mode = VOIDmode;
  return p;
}

/* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
   and -fopenacc is also enabled.  */
*/ 202 1.1 mrg 203 1.1 mrg static void 204 1.1 mrg diagnose_openacc_conflict (bool optval, const char *optname) 205 1.1 mrg { 206 1.1 mrg if (flag_openacc && optval) 207 1.1 mrg error ("option %s is not supported together with %<-fopenacc%>", optname); 208 1.1 mrg } 209 1.1 mrg 210 1.1 mrg static enum ptx_version 211 1.1 mrg first_ptx_version_supporting_sm (enum ptx_isa sm) 212 1.1 mrg { 213 1.1 mrg switch (sm) 214 1.1 mrg { 215 1.1 mrg case PTX_ISA_SM30: 216 1.1 mrg return PTX_VERSION_3_0; 217 1.1 mrg case PTX_ISA_SM35: 218 1.1 mrg return PTX_VERSION_3_1; 219 1.1 mrg case PTX_ISA_SM53: 220 1.1 mrg return PTX_VERSION_4_2; 221 1.1 mrg case PTX_ISA_SM70: 222 1.1 mrg return PTX_VERSION_6_0; 223 1.1 mrg case PTX_ISA_SM75: 224 1.1 mrg return PTX_VERSION_6_3; 225 1.1 mrg case PTX_ISA_SM80: 226 1.1 mrg return PTX_VERSION_7_0; 227 1.1 mrg default: 228 1.1 mrg gcc_unreachable (); 229 1.1 mrg } 230 1.1 mrg } 231 1.1 mrg 232 1.1 mrg static enum ptx_version 233 1.1 mrg default_ptx_version_option (void) 234 1.1 mrg { 235 1.1 mrg enum ptx_version first 236 1.1 mrg = first_ptx_version_supporting_sm ((enum ptx_isa) ptx_isa_option); 237 1.1 mrg 238 1.1 mrg /* Pick a version that supports the sm. */ 239 1.1 mrg enum ptx_version res = first; 240 1.1 mrg 241 1.1 mrg /* Pick at least 3.1. This has been the smallest version historically. */ 242 1.1 mrg res = MAX (res, PTX_VERSION_3_1); 243 1.1 mrg 244 1.1 mrg /* Pick at least 6.0, to enable using bar.warp.sync to have a way to force 245 1.1 mrg warp convergence. */ 246 1.1 mrg res = MAX (res, PTX_VERSION_6_0); 247 1.1 mrg 248 1.1 mrg /* Verify that we pick a version that supports the sm. 
*/ 249 1.1 mrg gcc_assert (first <= res); 250 1.1 mrg return res; 251 1.1 mrg } 252 1.1 mrg 253 1.1 mrg static const char * 254 1.1 mrg ptx_version_to_string (enum ptx_version v) 255 1.1 mrg { 256 1.1 mrg switch (v) 257 1.1 mrg { 258 1.1 mrg case PTX_VERSION_3_0: 259 1.1 mrg return "3.0"; 260 1.1 mrg case PTX_VERSION_3_1: 261 1.1 mrg return "3.1"; 262 1.1 mrg case PTX_VERSION_4_2: 263 1.1 mrg return "4.2"; 264 1.1 mrg case PTX_VERSION_6_0: 265 1.1 mrg return "6.0"; 266 1.1 mrg case PTX_VERSION_6_3: 267 1.1 mrg return "6.3"; 268 1.1 mrg case PTX_VERSION_7_0: 269 1.1 mrg return "7.0"; 270 1.1 mrg default: 271 1.1 mrg gcc_unreachable (); 272 1.1 mrg } 273 1.1 mrg } 274 1.1 mrg 275 1.1 mrg unsigned int 276 1.1 mrg ptx_version_to_number (enum ptx_version v, bool major_p) 277 1.1 mrg { 278 1.1 mrg switch (v) 279 1.1 mrg { 280 1.1 mrg case PTX_VERSION_3_0: 281 1.1 mrg return major_p ? 3 : 0; 282 1.1 mrg case PTX_VERSION_3_1: 283 1.1 mrg return major_p ? 3 : 1; 284 1.1 mrg case PTX_VERSION_4_2: 285 1.1 mrg return major_p ? 4 : 2; 286 1.1 mrg case PTX_VERSION_6_0: 287 1.1 mrg return major_p ? 6 : 0; 288 1.1 mrg case PTX_VERSION_6_3: 289 1.1 mrg return major_p ? 6 : 3; 290 1.1 mrg case PTX_VERSION_7_0: 291 1.1 mrg return major_p ? 
7 : 0; 292 1.1 mrg default: 293 1.1 mrg gcc_unreachable (); 294 1.1 mrg } 295 1.1 mrg } 296 1.1 mrg 297 1.1 mrg static const char * 298 1.1 mrg sm_version_to_string (enum ptx_isa sm) 299 1.1 mrg { 300 1.1 mrg switch (sm) 301 1.1 mrg { 302 1.1 mrg #define NVPTX_SM(XX, SEP) \ 303 1.1 mrg case PTX_ISA_SM ## XX: \ 304 1.1 mrg return #XX; 305 1.1 mrg #include "nvptx-sm.def" 306 1.1 mrg #undef NVPTX_SM 307 1.1 mrg default: 308 1.1 mrg gcc_unreachable (); 309 1.1 mrg } 310 1.1 mrg } 311 1.1 mrg 312 1.1 mrg static void 313 1.1 mrg handle_ptx_version_option (void) 314 1.1 mrg { 315 1.1 mrg if (!OPTION_SET_P (ptx_version_option) 316 1.1 mrg || ptx_version_option == PTX_VERSION_default) 317 1.1 mrg { 318 1.1 mrg ptx_version_option = default_ptx_version_option (); 319 1.1 mrg return; 320 1.1 mrg } 321 1.1 mrg 322 1.1 mrg enum ptx_version first 323 1.1 mrg = first_ptx_version_supporting_sm ((enum ptx_isa) ptx_isa_option); 324 1.1 mrg 325 1.1 mrg if (ptx_version_option < first) 326 1.1 mrg error ("PTX version (%<-mptx%>) needs to be at least %s to support selected" 327 1.1 mrg " %<-misa%> (sm_%s)", ptx_version_to_string (first), 328 1.1 mrg sm_version_to_string ((enum ptx_isa)ptx_isa_option)); 329 1.1 mrg } 330 1.1 mrg 331 1.1 mrg /* Implement TARGET_OPTION_OVERRIDE. */ 332 1.1 mrg 333 1.1 mrg static void 334 1.1 mrg nvptx_option_override (void) 335 1.1 mrg { 336 1.1 mrg init_machine_status = nvptx_init_machine_status; 337 1.1 mrg 338 1.1 mrg handle_ptx_version_option (); 339 1.1 mrg 340 1.1 mrg /* Set toplevel_reorder, unless explicitly disabled. We need 341 1.1 mrg reordering so that we emit necessary assembler decls of 342 1.1 mrg undeclared variables. */ 343 1.1 mrg if (!OPTION_SET_P (flag_toplevel_reorder)) 344 1.1 mrg flag_toplevel_reorder = 1; 345 1.1 mrg 346 1.1 mrg debug_nonbind_markers_p = 0; 347 1.1 mrg 348 1.1 mrg /* Set flag_no_common, unless explicitly disabled. 
We fake common 349 1.1 mrg using .weak, and that's not entirely accurate, so avoid it 350 1.1 mrg unless forced. */ 351 1.1 mrg if (!OPTION_SET_P (flag_no_common)) 352 1.1 mrg flag_no_common = 1; 353 1.1 mrg 354 1.1 mrg /* The patch area requires nops, which we don't have. */ 355 1.1 mrg HOST_WIDE_INT patch_area_size, patch_area_entry; 356 1.1 mrg parse_and_check_patch_area (flag_patchable_function_entry, false, 357 1.1 mrg &patch_area_size, &patch_area_entry); 358 1.1 mrg if (patch_area_size > 0) 359 1.1 mrg sorry ("not generating patch area, nops not supported"); 360 1.1 mrg 361 1.1 mrg /* Assumes that it will see only hard registers. */ 362 1.1 mrg flag_var_tracking = 0; 363 1.1 mrg 364 1.1 mrg if (nvptx_optimize < 0) 365 1.1 mrg nvptx_optimize = optimize > 0; 366 1.1 mrg 367 1.1 mrg declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17); 368 1.1 mrg needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17); 369 1.1 mrg declared_libfuncs_htab 370 1.1 mrg = hash_table<declared_libfunc_hasher>::create_ggc (17); 371 1.1 mrg 372 1.1 mrg oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast"); 373 1.1 mrg SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED); 374 1.1 mrg oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; 375 1.1 mrg oacc_bcast_partition = 0; 376 1.1 mrg 377 1.1 mrg worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red"); 378 1.1 mrg SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED); 379 1.1 mrg worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; 380 1.1 mrg 381 1.1 mrg vector_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__vector_red"); 382 1.1 mrg SET_SYMBOL_DATA_AREA (vector_red_sym, DATA_AREA_SHARED); 383 1.1 mrg vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; 384 1.1 mrg vector_red_partition = 0; 385 1.1 mrg 386 1.1 mrg gang_private_shared_sym = gen_rtx_SYMBOL_REF (Pmode, "__gang_private_shared"); 387 1.1 mrg SET_SYMBOL_DATA_AREA (gang_private_shared_sym, DATA_AREA_SHARED); 388 1.1 
mrg gang_private_shared_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; 389 1.1 mrg 390 1.1 mrg diagnose_openacc_conflict (TARGET_GOMP, "-mgomp"); 391 1.1 mrg diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack"); 392 1.1 mrg diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt"); 393 1.1 mrg 394 1.1 mrg if (TARGET_GOMP) 395 1.1 mrg target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT; 396 1.1 mrg } 397 1.1 mrg 398 1.1 mrg /* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to 399 1.1 mrg deal with ptx ideosyncracies. */ 400 1.1 mrg 401 1.1 mrg const char * 402 1.1 mrg nvptx_ptx_type_from_mode (machine_mode mode, bool promote) 403 1.1 mrg { 404 1.1 mrg switch (mode) 405 1.1 mrg { 406 1.1 mrg case E_BLKmode: 407 1.1 mrg return ".b8"; 408 1.1 mrg case E_BImode: 409 1.1 mrg return ".pred"; 410 1.1 mrg case E_QImode: 411 1.1 mrg if (promote) 412 1.1 mrg return ".u32"; 413 1.1 mrg else 414 1.1 mrg return ".u8"; 415 1.1 mrg case E_HImode: 416 1.1 mrg return ".u16"; 417 1.1 mrg case E_SImode: 418 1.1 mrg return ".u32"; 419 1.1 mrg case E_DImode: 420 1.1 mrg return ".u64"; 421 1.1 mrg 422 1.1 mrg case E_HFmode: 423 1.1 mrg return ".f16"; 424 1.1 mrg case E_SFmode: 425 1.1 mrg return ".f32"; 426 1.1 mrg case E_DFmode: 427 1.1 mrg return ".f64"; 428 1.1 mrg 429 1.1 mrg case E_V2SImode: 430 1.1 mrg return ".v2.u32"; 431 1.1 mrg case E_V2DImode: 432 1.1 mrg return ".v2.u64"; 433 1.1 mrg 434 1.1 mrg default: 435 1.1 mrg gcc_unreachable (); 436 1.1 mrg } 437 1.1 mrg } 438 1.1 mrg 439 1.1 mrg /* Encode the PTX data area that DECL (which might not actually be a 440 1.1 mrg _DECL) should reside in. 
*/ 441 1.1 mrg 442 1.1 mrg static void 443 1.1 mrg nvptx_encode_section_info (tree decl, rtx rtl, int first) 444 1.1 mrg { 445 1.1 mrg default_encode_section_info (decl, rtl, first); 446 1.1 mrg if (first && MEM_P (rtl)) 447 1.1 mrg { 448 1.1 mrg nvptx_data_area area = DATA_AREA_GENERIC; 449 1.1 mrg 450 1.1 mrg if (TREE_CONSTANT (decl)) 451 1.1 mrg area = DATA_AREA_CONST; 452 1.1 mrg else if (TREE_CODE (decl) == VAR_DECL) 453 1.1 mrg { 454 1.1 mrg if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl))) 455 1.1 mrg { 456 1.1 mrg area = DATA_AREA_SHARED; 457 1.1 mrg if (DECL_INITIAL (decl)) 458 1.1 mrg error ("static initialization of variable %q+D in %<.shared%>" 459 1.1 mrg " memory is not supported", decl); 460 1.1 mrg } 461 1.1 mrg else 462 1.1 mrg area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL; 463 1.1 mrg } 464 1.1 mrg 465 1.1 mrg SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area); 466 1.1 mrg } 467 1.1 mrg } 468 1.1 mrg 469 1.1 mrg /* Return the PTX name of the data area in which SYM should be 470 1.1 mrg placed. The symbol must have already been processed by 471 1.1 mrg nvptx_encode_seciton_info, or equivalent. */ 472 1.1 mrg 473 1.1 mrg static const char * 474 1.1 mrg section_for_sym (rtx sym) 475 1.1 mrg { 476 1.1 mrg nvptx_data_area area = SYMBOL_DATA_AREA (sym); 477 1.1 mrg /* Same order as nvptx_data_area enum. */ 478 1.1 mrg static char const *const areas[] = 479 1.1 mrg {"", ".global", ".shared", ".local", ".const", ".param"}; 480 1.1 mrg 481 1.1 mrg return areas[area]; 482 1.1 mrg } 483 1.1 mrg 484 1.1 mrg /* Similarly for a decl. */ 485 1.1 mrg 486 1.1 mrg static const char * 487 1.1 mrg section_for_decl (const_tree decl) 488 1.1 mrg { 489 1.1 mrg return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0)); 490 1.1 mrg } 491 1.1 mrg 492 1.1 mrg /* Check NAME for special function names and redirect them by returning a 493 1.1 mrg replacement. 
This applies to malloc, free and realloc, for which we 494 1.1 mrg want to use libgcc wrappers, and call, which triggers a bug in 495 1.1 mrg ptxas. We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's 496 1.1 mrg not active in an offload compiler -- the names are all set by the 497 1.1 mrg host-side compiler. */ 498 1.1 mrg 499 1.1 mrg static const char * 500 1.1 mrg nvptx_name_replacement (const char *name) 501 1.1 mrg { 502 1.1 mrg if (strcmp (name, "call") == 0) 503 1.1 mrg return "__nvptx_call"; 504 1.1 mrg if (strcmp (name, "malloc") == 0) 505 1.1 mrg return "__nvptx_malloc"; 506 1.1 mrg if (strcmp (name, "free") == 0) 507 1.1 mrg return "__nvptx_free"; 508 1.1 mrg if (strcmp (name, "realloc") == 0) 509 1.1 mrg return "__nvptx_realloc"; 510 1.1 mrg return name; 511 1.1 mrg } 512 1.1 mrg 513 1.1 mrg /* Return NULL if NAME contains no dot. Otherwise return a copy of NAME 514 1.1 mrg with the dots replaced with dollar signs. */ 515 1.1 mrg 516 1.1 mrg static char * 517 1.1 mrg nvptx_replace_dot (const char *name) 518 1.1 mrg { 519 1.1 mrg if (strchr (name, '.') == NULL) 520 1.1 mrg return NULL; 521 1.1 mrg 522 1.1 mrg char *p = xstrdup (name); 523 1.1 mrg for (size_t i = 0; i < strlen (p); ++i) 524 1.1 mrg if (p[i] == '.') 525 1.1 mrg p[i] = '$'; 526 1.1 mrg return p; 527 1.1 mrg } 528 1.1 mrg 529 1.1 mrg /* If MODE should be treated as two registers of an inner mode, return 530 1.1 mrg that inner mode. Otherwise return VOIDmode. */ 531 1.1 mrg 532 1.1 mrg static machine_mode 533 1.1 mrg maybe_split_mode (machine_mode mode) 534 1.1 mrg { 535 1.1 mrg if (COMPLEX_MODE_P (mode)) 536 1.1 mrg return GET_MODE_INNER (mode); 537 1.1 mrg 538 1.1 mrg if (mode == TImode) 539 1.1 mrg return DImode; 540 1.1 mrg 541 1.1 mrg return VOIDmode; 542 1.1 mrg } 543 1.1 mrg 544 1.1 mrg /* Return true if mode should be treated as two registers. 
*/ 545 1.1 mrg 546 1.1 mrg static bool 547 1.1 mrg split_mode_p (machine_mode mode) 548 1.1 mrg { 549 1.1 mrg return maybe_split_mode (mode) != VOIDmode; 550 1.1 mrg } 551 1.1 mrg 552 1.1 mrg /* Output a register, subreg, or register pair (with optional 553 1.1 mrg enclosing braces). */ 554 1.1 mrg 555 1.1 mrg static void 556 1.1 mrg output_reg (FILE *file, unsigned regno, machine_mode inner_mode, 557 1.1 mrg int subreg_offset = -1) 558 1.1 mrg { 559 1.1 mrg if (inner_mode == VOIDmode) 560 1.1 mrg { 561 1.1 mrg if (HARD_REGISTER_NUM_P (regno)) 562 1.1 mrg fprintf (file, "%s", reg_names[regno]); 563 1.1 mrg else 564 1.1 mrg fprintf (file, "%%r%d", regno); 565 1.1 mrg } 566 1.1 mrg else if (subreg_offset >= 0) 567 1.1 mrg { 568 1.1 mrg output_reg (file, regno, VOIDmode); 569 1.1 mrg fprintf (file, "$%d", subreg_offset); 570 1.1 mrg } 571 1.1 mrg else 572 1.1 mrg { 573 1.1 mrg if (subreg_offset == -1) 574 1.1 mrg fprintf (file, "{"); 575 1.1 mrg output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode)); 576 1.1 mrg fprintf (file, ","); 577 1.1 mrg output_reg (file, regno, inner_mode, 0); 578 1.1 mrg if (subreg_offset == -1) 579 1.1 mrg fprintf (file, "}"); 580 1.1 mrg } 581 1.1 mrg } 582 1.1 mrg 583 1.1 mrg /* Emit forking instructions for MASK. */ 584 1.1 mrg 585 1.1 mrg static void 586 1.1 mrg nvptx_emit_forking (unsigned mask, bool is_call) 587 1.1 mrg { 588 1.1 mrg mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER) 589 1.1 mrg | GOMP_DIM_MASK (GOMP_DIM_VECTOR)); 590 1.1 mrg if (mask) 591 1.1 mrg { 592 1.1 mrg rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX)); 593 1.1 mrg 594 1.1 mrg /* Emit fork at all levels. This helps form SESE regions, as 595 1.1 mrg it creates a block with a single successor before entering a 596 1.1 mrg partitooned region. That is a good candidate for the end of 597 1.1 mrg an SESE region. 
*/ 598 1.1 mrg emit_insn (gen_nvptx_fork (op)); 599 1.1 mrg emit_insn (gen_nvptx_forked (op)); 600 1.1 mrg } 601 1.1 mrg } 602 1.1 mrg 603 1.1 mrg /* Emit joining instructions for MASK. */ 604 1.1 mrg 605 1.1 mrg static void 606 1.1 mrg nvptx_emit_joining (unsigned mask, bool is_call) 607 1.1 mrg { 608 1.1 mrg mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER) 609 1.1 mrg | GOMP_DIM_MASK (GOMP_DIM_VECTOR)); 610 1.1 mrg if (mask) 611 1.1 mrg { 612 1.1 mrg rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX)); 613 1.1 mrg 614 1.1 mrg /* Emit joining for all non-call pars to ensure there's a single 615 1.1 mrg predecessor for the block the join insn ends up in. This is 616 1.1 mrg needed for skipping entire loops. */ 617 1.1 mrg emit_insn (gen_nvptx_joining (op)); 618 1.1 mrg emit_insn (gen_nvptx_join (op)); 619 1.1 mrg } 620 1.1 mrg } 621 1.1 mrg 622 1.1 mrg 623 1.1 mrg /* Determine whether MODE and TYPE (possibly NULL) should be passed or 625 1.1 mrg returned in memory. Integer and floating types supported by the 626 1.1 mrg machine are passed in registers, everything else is passed in 627 1.1 mrg memory. Complex types are split. */ 628 1.1 mrg 629 1.1 mrg static bool 630 1.1 mrg pass_in_memory (machine_mode mode, const_tree type, bool for_return) 631 1.1 mrg { 632 1.1 mrg if (type) 633 1.1 mrg { 634 1.1 mrg if (AGGREGATE_TYPE_P (type)) 635 1.1 mrg return true; 636 1.1 mrg if (TREE_CODE (type) == VECTOR_TYPE) 637 1.1 mrg return true; 638 1.1 mrg } 639 1.1 mrg 640 1.1 mrg if (!for_return && COMPLEX_MODE_P (mode)) 641 1.1 mrg /* Complex types are passed as two underlying args. 
*/ 642 1.1 mrg mode = GET_MODE_INNER (mode); 643 1.1 mrg 644 1.1 mrg if (GET_MODE_CLASS (mode) != MODE_INT 645 1.1 mrg && GET_MODE_CLASS (mode) != MODE_FLOAT) 646 1.1 mrg return true; 647 1.1 mrg 648 1.1 mrg if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) 649 1.1 mrg return true; 650 1.1 mrg 651 1.1 mrg return false; 652 1.1 mrg } 653 1.1 mrg 654 1.1 mrg /* A non-memory argument of mode MODE is being passed, determine the mode it 655 1.1 mrg should be promoted to. This is also used for determining return 656 1.1 mrg type promotion. */ 657 1.1 mrg 658 1.1 mrg static machine_mode 659 1.1 mrg promote_arg (machine_mode mode, bool prototyped) 660 1.1 mrg { 661 1.1 mrg if (!prototyped && mode == SFmode) 662 1.1 mrg /* K&R float promotion for unprototyped functions. */ 663 1.1 mrg mode = DFmode; 664 1.1 mrg else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode)) 665 1.1 mrg mode = SImode; 666 1.1 mrg 667 1.1 mrg return mode; 668 1.1 mrg } 669 1.1 mrg 670 1.1 mrg /* A non-memory return type of MODE is being returned. Determine the 671 1.1 mrg mode it should be promoted to. */ 672 1.1 mrg 673 1.1 mrg static machine_mode 674 1.1 mrg promote_return (machine_mode mode) 675 1.1 mrg { 676 1.1 mrg return promote_arg (mode, true); 677 1.1 mrg } 678 1.1 mrg 679 1.1 mrg /* Implement TARGET_FUNCTION_ARG. */ 680 1.1 mrg 681 1.1 mrg static rtx 682 1.1 mrg nvptx_function_arg (cumulative_args_t, const function_arg_info &arg) 683 1.1 mrg { 684 1.1 mrg if (arg.end_marker_p () || !arg.named) 685 1.1 mrg return NULL_RTX; 686 1.1 mrg 687 1.1 mrg return gen_reg_rtx (arg.mode); 688 1.1 mrg } 689 1.1 mrg 690 1.1 mrg /* Implement TARGET_FUNCTION_INCOMING_ARG. 
*/ 691 1.1 mrg 692 1.1 mrg static rtx 693 1.1 mrg nvptx_function_incoming_arg (cumulative_args_t cum_v, 694 1.1 mrg const function_arg_info &arg) 695 1.1 mrg { 696 1.1 mrg CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); 697 1.1 mrg 698 1.1 mrg if (arg.end_marker_p () || !arg.named) 699 1.1 mrg return NULL_RTX; 700 1.1 mrg 701 1.1 mrg /* No need to deal with split modes here, the only case that can 702 1.1 mrg happen is complex modes and those are dealt with by 703 1.1 mrg TARGET_SPLIT_COMPLEX_ARG. */ 704 1.1 mrg return gen_rtx_UNSPEC (arg.mode, 705 1.1 mrg gen_rtvec (1, GEN_INT (cum->count)), 706 1.1 mrg UNSPEC_ARG_REG); 707 1.1 mrg } 708 1.1 mrg 709 1.1 mrg /* Implement TARGET_FUNCTION_ARG_ADVANCE. */ 710 1.1 mrg 711 1.1 mrg static void 712 1.1 mrg nvptx_function_arg_advance (cumulative_args_t cum_v, const function_arg_info &) 713 1.1 mrg { 714 1.1 mrg CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); 715 1.1 mrg 716 1.1 mrg cum->count++; 717 1.1 mrg } 718 1.1 mrg 719 1.1 mrg /* Implement TARGET_FUNCTION_ARG_BOUNDARY. 720 1.1 mrg 721 1.1 mrg For nvptx This is only used for varadic args. The type has already 722 1.1 mrg been promoted and/or converted to invisible reference. */ 723 1.1 mrg 724 1.1 mrg static unsigned 725 1.1 mrg nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type)) 726 1.1 mrg { 727 1.1 mrg return GET_MODE_ALIGNMENT (mode); 728 1.1 mrg } 729 1.1 mrg 730 1.1 mrg /* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook. 731 1.1 mrg 732 1.1 mrg For nvptx, we know how to handle functions declared as stdarg: by 733 1.1 mrg passing an extra pointer to the unnamed arguments. However, the 734 1.1 mrg Fortran frontend can produce a different situation, where a 735 1.1 mrg function pointer is declared with no arguments, but the actual 736 1.1 mrg function and calls to it take more arguments. In that case, we 737 1.1 mrg want to ensure the call matches the definition of the function. 
*/ 738 1.1 mrg 739 1.1 mrg static bool 740 1.1 mrg nvptx_strict_argument_naming (cumulative_args_t cum_v) 741 1.1 mrg { 742 1.1 mrg CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); 743 1.1 mrg 744 1.1 mrg return cum->fntype == NULL_TREE || stdarg_p (cum->fntype); 745 1.1 mrg } 746 1.1 mrg 747 1.1 mrg /* Implement TARGET_LIBCALL_VALUE. */ 748 1.1 mrg 749 1.1 mrg static rtx 750 1.1 mrg nvptx_libcall_value (machine_mode mode, const_rtx) 751 1.1 mrg { 752 1.1 mrg if (!cfun || !cfun->machine->doing_call) 753 1.1 mrg /* Pretend to return in a hard reg for early uses before pseudos can be 754 1.1 mrg generated. */ 755 1.1 mrg return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM); 756 1.1 mrg 757 1.1 mrg return gen_reg_rtx (mode); 758 1.1 mrg } 759 1.1 mrg 760 1.1 mrg /* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place 761 1.1 mrg where function FUNC returns or receives a value of data type TYPE. */ 762 1.1 mrg 763 1.1 mrg static rtx 764 1.1 mrg nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func), 765 1.1 mrg bool outgoing) 766 1.1 mrg { 767 1.1 mrg machine_mode mode = promote_return (TYPE_MODE (type)); 768 1.1 mrg 769 1.1 mrg if (outgoing) 770 1.1 mrg { 771 1.1 mrg gcc_assert (cfun); 772 1.1 mrg cfun->machine->return_mode = mode; 773 1.1 mrg return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM); 774 1.1 mrg } 775 1.1 mrg 776 1.1 mrg return nvptx_libcall_value (mode, NULL_RTX); 777 1.1 mrg } 778 1.1 mrg 779 1.1 mrg /* Implement TARGET_FUNCTION_VALUE_REGNO_P. */ 780 1.1 mrg 781 1.1 mrg static bool 782 1.1 mrg nvptx_function_value_regno_p (const unsigned int regno) 783 1.1 mrg { 784 1.1 mrg return regno == NVPTX_RETURN_REGNUM; 785 1.1 mrg } 786 1.1 mrg 787 1.1 mrg /* Types with a mode other than those supported by the machine are passed by 788 1.1 mrg reference in memory. 
*/ 789 1.1 mrg 790 1.1 mrg static bool 791 1.1 mrg nvptx_pass_by_reference (cumulative_args_t, const function_arg_info &arg) 792 1.1 mrg { 793 1.1 mrg return pass_in_memory (arg.mode, arg.type, false); 794 1.1 mrg } 795 1.1 mrg 796 1.1 mrg /* Implement TARGET_RETURN_IN_MEMORY. */ 797 1.1 mrg 798 1.1 mrg static bool 799 1.1 mrg nvptx_return_in_memory (const_tree type, const_tree) 800 1.1 mrg { 801 1.1 mrg return pass_in_memory (TYPE_MODE (type), type, true); 802 1.1 mrg } 803 1.1 mrg 804 1.1 mrg /* Implement TARGET_PROMOTE_FUNCTION_MODE. */ 805 1.1 mrg 806 1.1 mrg static machine_mode 807 1.1 mrg nvptx_promote_function_mode (const_tree type, machine_mode mode, 808 1.1 mrg int *ARG_UNUSED (punsignedp), 809 1.1 mrg const_tree funtype, int for_return) 810 1.1 mrg { 811 1.1 mrg return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype)); 812 1.1 mrg } 813 1.1 mrg 814 1.1 mrg /* Helper for write_arg. Emit a single PTX argument of MODE, either 815 1.1 mrg in a prototype, or as copy in a function prologue. ARGNO is the 816 1.1 mrg index of this argument in the PTX function. FOR_REG is negative, 817 1.1 mrg if we're emitting the PTX prototype. It is zero if we're copying 818 1.1 mrg to an argument register and it is greater than zero if we're 819 1.1 mrg copying to a specific hard register. */ 820 1.1 mrg 821 1.1 mrg static int 822 1.1 mrg write_arg_mode (std::stringstream &s, int for_reg, int argno, 823 1.1 mrg machine_mode mode) 824 1.1 mrg { 825 1.1 mrg const char *ptx_type = nvptx_ptx_type_from_mode (mode, false); 826 1.1 mrg 827 1.1 mrg if (for_reg < 0) 828 1.1 mrg { 829 1.1 mrg /* Writing PTX prototype. */ 830 1.1 mrg s << (argno ? 
", " : " ("); 831 1.1 mrg s << ".param" << ptx_type << " %in_ar" << argno; 832 1.1 mrg } 833 1.1 mrg else 834 1.1 mrg { 835 1.1 mrg s << "\t.reg" << ptx_type << " "; 836 1.1 mrg if (for_reg) 837 1.1 mrg s << reg_names[for_reg]; 838 1.1 mrg else 839 1.1 mrg s << "%ar" << argno; 840 1.1 mrg s << ";\n"; 841 1.1 mrg if (argno >= 0) 842 1.1 mrg { 843 1.1 mrg s << "\tld.param" << ptx_type << " "; 844 1.1 mrg if (for_reg) 845 1.1 mrg s << reg_names[for_reg]; 846 1.1 mrg else 847 1.1 mrg s << "%ar" << argno; 848 1.1 mrg s << ", [%in_ar" << argno << "];\n"; 849 1.1 mrg } 850 1.1 mrg } 851 1.1 mrg return argno + 1; 852 1.1 mrg } 853 1.1 mrg 854 1.1 mrg /* Process function parameter TYPE to emit one or more PTX 855 1.1 mrg arguments. S, FOR_REG and ARGNO as for write_arg_mode. PROTOTYPED 856 1.1 mrg is true, if this is a prototyped function, rather than an old-style 857 1.1 mrg C declaration. Returns the next argument number to use. 858 1.1 mrg 859 1.1 mrg The promotion behavior here must match the regular GCC function 860 1.1 mrg parameter marshalling machinery. */ 861 1.1 mrg 862 1.1 mrg static int 863 1.1 mrg write_arg_type (std::stringstream &s, int for_reg, int argno, 864 1.1 mrg tree type, bool prototyped) 865 1.1 mrg { 866 1.1 mrg machine_mode mode = TYPE_MODE (type); 867 1.1 mrg 868 1.1 mrg if (mode == VOIDmode) 869 1.1 mrg return argno; 870 1.1 mrg 871 1.1 mrg if (pass_in_memory (mode, type, false)) 872 1.1 mrg mode = Pmode; 873 1.1 mrg else 874 1.1 mrg { 875 1.1 mrg bool split = TREE_CODE (type) == COMPLEX_TYPE; 876 1.1 mrg 877 1.1 mrg if (split) 878 1.1 mrg { 879 1.1 mrg /* Complex types are sent as two separate args. 
*/ 880 1.1 mrg type = TREE_TYPE (type); 881 1.1 mrg mode = TYPE_MODE (type); 882 1.1 mrg prototyped = true; 883 1.1 mrg } 884 1.1 mrg 885 1.1 mrg mode = promote_arg (mode, prototyped); 886 1.1 mrg if (split) 887 1.1 mrg argno = write_arg_mode (s, for_reg, argno, mode); 888 1.1 mrg } 889 1.1 mrg 890 1.1 mrg return write_arg_mode (s, for_reg, argno, mode); 891 1.1 mrg } 892 1.1 mrg 893 1.1 mrg /* Emit a PTX return as a prototype or function prologue declaration 894 1.1 mrg for MODE. */ 895 1.1 mrg 896 1.1 mrg static void 897 1.1 mrg write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode) 898 1.1 mrg { 899 1.1 mrg const char *ptx_type = nvptx_ptx_type_from_mode (mode, false); 900 1.1 mrg const char *pfx = "\t.reg"; 901 1.1 mrg const char *sfx = ";\n"; 902 1.1 mrg 903 1.1 mrg if (for_proto) 904 1.1 mrg pfx = "(.param", sfx = "_out) "; 905 1.1 mrg 906 1.1 mrg s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx; 907 1.1 mrg } 908 1.1 mrg 909 1.1 mrg /* Process a function return TYPE to emit a PTX return as a prototype 910 1.1 mrg or function prologue declaration. Returns true if return is via an 911 1.1 mrg additional pointer parameter. The promotion behavior here must 912 1.1 mrg match the regular GCC function return mashalling. */ 913 1.1 mrg 914 1.1 mrg static bool 915 1.1 mrg write_return_type (std::stringstream &s, bool for_proto, tree type) 916 1.1 mrg { 917 1.1 mrg machine_mode mode = TYPE_MODE (type); 918 1.1 mrg 919 1.1 mrg if (mode == VOIDmode) 920 1.1 mrg return false; 921 1.1 mrg 922 1.1 mrg bool return_in_mem = pass_in_memory (mode, type, true); 923 1.1 mrg 924 1.1 mrg if (return_in_mem) 925 1.1 mrg { 926 1.1 mrg if (for_proto) 927 1.1 mrg return return_in_mem; 928 1.1 mrg 929 1.1 mrg /* Named return values can cause us to return a pointer as well 930 1.1 mrg as expect an argument for the return location. 
This is 931 1.1 mrg optimization-level specific, so no caller can make use of 932 1.1 mrg this data, but more importantly for us, we must ensure it 933 1.1 mrg doesn't change the PTX prototype. */ 934 1.1 mrg mode = (machine_mode) cfun->machine->return_mode; 935 1.1 mrg 936 1.1 mrg if (mode == VOIDmode) 937 1.1 mrg return return_in_mem; 938 1.1 mrg 939 1.1 mrg /* Clear return_mode to inhibit copy of retval to non-existent 940 1.1 mrg retval parameter. */ 941 1.1 mrg cfun->machine->return_mode = VOIDmode; 942 1.1 mrg } 943 1.1 mrg else 944 1.1 mrg mode = promote_return (mode); 945 1.1 mrg 946 1.1 mrg write_return_mode (s, for_proto, mode); 947 1.1 mrg 948 1.1 mrg return return_in_mem; 949 1.1 mrg } 950 1.1 mrg 951 1.1 mrg /* Look for attributes in ATTRS that would indicate we must write a function 952 1.1 mrg as a .entry kernel rather than a .func. Return true if one is found. */ 953 1.1 mrg 954 1.1 mrg static bool 955 1.1 mrg write_as_kernel (tree attrs) 956 1.1 mrg { 957 1.1 mrg return (lookup_attribute ("kernel", attrs) != NULL_TREE 958 1.1 mrg || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE 959 1.1 mrg && lookup_attribute ("oacc function", attrs) != NULL_TREE)); 960 1.1 mrg /* For OpenMP target regions, the corresponding kernel entry is emitted from 961 1.1 mrg write_omp_entry as a separate function. */ 962 1.1 mrg } 963 1.1 mrg 964 1.1 mrg /* Emit a linker marker for a function decl or defn. */ 965 1.1 mrg 966 1.1 mrg static void 967 1.1 mrg write_fn_marker (std::stringstream &s, bool is_defn, bool globalize, 968 1.1 mrg const char *name) 969 1.1 mrg { 970 1.1 mrg s << "\n// BEGIN"; 971 1.1 mrg if (globalize) 972 1.1 mrg s << " GLOBAL"; 973 1.1 mrg s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: "); 974 1.1 mrg s << name << "\n"; 975 1.1 mrg } 976 1.1 mrg 977 1.1 mrg /* Emit a linker marker for a variable decl or defn. 
*/ 978 1.1 mrg 979 1.1 mrg static void 980 1.1 mrg write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name) 981 1.1 mrg { 982 1.1 mrg fprintf (file, "\n// BEGIN%s VAR %s: ", 983 1.1 mrg globalize ? " GLOBAL" : "", 984 1.1 mrg is_defn ? "DEF" : "DECL"); 985 1.1 mrg assemble_name_raw (file, name); 986 1.1 mrg fputs ("\n", file); 987 1.1 mrg } 988 1.1 mrg 989 1.1 mrg /* Helper function for write_fn_proto. */ 990 1.1 mrg 991 1.1 mrg static void 992 1.1 mrg write_fn_proto_1 (std::stringstream &s, bool is_defn, 993 1.1 mrg const char *name, const_tree decl) 994 1.1 mrg { 995 1.1 mrg if (lookup_attribute ("alias", DECL_ATTRIBUTES (decl)) == NULL) 996 1.1 mrg write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name); 997 1.1 mrg 998 1.1 mrg /* PTX declaration. */ 999 1.1 mrg if (DECL_EXTERNAL (decl)) 1000 1.1 mrg s << ".extern "; 1001 1.1 mrg else if (TREE_PUBLIC (decl)) 1002 1.1 mrg s << (DECL_WEAK (decl) ? ".weak " : ".visible "); 1003 1.1 mrg s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func "); 1004 1.1 mrg 1005 1.1 mrg tree fntype = TREE_TYPE (decl); 1006 1.1 mrg tree result_type = TREE_TYPE (fntype); 1007 1.1 mrg 1008 1.1 mrg /* atomic_compare_exchange_$n builtins have an exceptional calling 1009 1.1 mrg convention. */ 1010 1.1 mrg int not_atomic_weak_arg = -1; 1011 1.1 mrg if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL) 1012 1.1 mrg switch (DECL_FUNCTION_CODE (decl)) 1013 1.1 mrg { 1014 1.1 mrg case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1: 1015 1.1 mrg case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2: 1016 1.1 mrg case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4: 1017 1.1 mrg case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8: 1018 1.1 mrg case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16: 1019 1.1 mrg /* These atomics skip the 'weak' parm in an actual library 1020 1.1 mrg call. We must skip it in the prototype too. 
*/ 1021 1.1 mrg not_atomic_weak_arg = 3; 1022 1.1 mrg break; 1023 1.1 mrg 1024 1.1 mrg default: 1025 1.1 mrg break; 1026 1.1 mrg } 1027 1.1 mrg 1028 1.1 mrg /* Declare the result. */ 1029 1.1 mrg bool return_in_mem = write_return_type (s, true, result_type); 1030 1.1 mrg 1031 1.1 mrg s << name; 1032 1.1 mrg 1033 1.1 mrg int argno = 0; 1034 1.1 mrg 1035 1.1 mrg /* Emit argument list. */ 1036 1.1 mrg if (return_in_mem) 1037 1.1 mrg argno = write_arg_type (s, -1, argno, ptr_type_node, true); 1038 1.1 mrg 1039 1.1 mrg /* We get: 1040 1.1 mrg NULL in TYPE_ARG_TYPES, for old-style functions 1041 1.1 mrg NULL in DECL_ARGUMENTS, for builtin functions without another 1042 1.1 mrg declaration. 1043 1.1 mrg So we have to pick the best one we have. */ 1044 1.1 mrg tree args = TYPE_ARG_TYPES (fntype); 1045 1.1 mrg bool prototyped = true; 1046 1.1 mrg if (!args) 1047 1.1 mrg { 1048 1.1 mrg args = DECL_ARGUMENTS (decl); 1049 1.1 mrg prototyped = false; 1050 1.1 mrg } 1051 1.1 mrg 1052 1.1 mrg for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--) 1053 1.1 mrg { 1054 1.1 mrg tree type = prototyped ? 
TREE_VALUE (args) : TREE_TYPE (args); 1055 1.1 mrg 1056 1.1 mrg if (not_atomic_weak_arg) 1057 1.1 mrg argno = write_arg_type (s, -1, argno, type, prototyped); 1058 1.1 mrg else 1059 1.1 mrg gcc_assert (TREE_CODE (type) == BOOLEAN_TYPE); 1060 1.1 mrg } 1061 1.1 mrg 1062 1.1 mrg if (stdarg_p (fntype)) 1063 1.1 mrg argno = write_arg_type (s, -1, argno, ptr_type_node, true); 1064 1.1 mrg 1065 1.1 mrg if (DECL_STATIC_CHAIN (decl)) 1066 1.1 mrg argno = write_arg_type (s, -1, argno, ptr_type_node, true); 1067 1.1 mrg 1068 1.1 mrg if (argno < 2 && strcmp (name, "main") == 0) 1069 1.1 mrg { 1070 1.1 mrg if (argno == 0) 1071 1.1 mrg argno = write_arg_type (s, -1, argno, integer_type_node, true); 1072 1.1 mrg 1073 1.1 mrg if (argno == 1) 1074 1.1 mrg argno = write_arg_type (s, -1, argno, ptr_type_node, true); 1075 1.1 mrg } 1076 1.1 mrg 1077 1.1 mrg if (argno) 1078 1.1 mrg s << ")"; 1079 1.1 mrg 1080 1.1 mrg s << (is_defn ? "\n" : ";\n"); 1081 1.1 mrg } 1082 1.1 mrg 1083 1.1 mrg /* Write a .func or .kernel declaration or definition along with 1084 1.1 mrg a helper comment for use by ld. S is the stream to write to, DECL 1085 1.1 mrg the decl for the function with name NAME. For definitions, emit 1086 1.1 mrg a declaration too. */ 1087 1.1 mrg 1088 1.1 mrg static void 1089 1.1 mrg write_fn_proto (std::stringstream &s, bool is_defn, 1090 1.1 mrg const char *name, const_tree decl) 1091 1.1 mrg { 1092 1.1 mrg const char *replacement = nvptx_name_replacement (name); 1093 1.1 mrg char *replaced_dots = NULL; 1094 1.1 mrg if (replacement != name) 1095 1.1 mrg name = replacement; 1096 1.1 mrg else 1097 1.1 mrg { 1098 1.1 mrg replaced_dots = nvptx_replace_dot (name); 1099 1.1 mrg if (replaced_dots) 1100 1.1 mrg name = replaced_dots; 1101 1.1 mrg } 1102 1.1 mrg if (name[0] == '*') 1103 1.1 mrg name++; 1104 1.1 mrg 1105 1.1 mrg if (is_defn) 1106 1.1 mrg /* Emit a declaration. The PTX assembler gets upset without it. 
*/ 1107 1.1 mrg write_fn_proto_1 (s, false, name, decl); 1108 1.1 mrg 1109 1.1 mrg write_fn_proto_1 (s, is_defn, name, decl); 1110 1.1 mrg 1111 1.1 mrg if (replaced_dots) 1112 1.1 mrg XDELETE (replaced_dots); 1113 1.1 mrg } 1114 1.1 mrg 1115 1.1 mrg /* Construct a function declaration from a call insn. This can be 1116 1.1 mrg necessary for two reasons - either we have an indirect call which 1117 1.1 mrg requires a .callprototype declaration, or we have a libcall 1118 1.1 mrg generated by emit_library_call for which no decl exists. */ 1119 1.1 mrg 1120 1.1 mrg static void 1121 1.1 mrg write_fn_proto_from_insn (std::stringstream &s, const char *name, 1122 1.1 mrg rtx result, rtx pat) 1123 1.1 mrg { 1124 1.1 mrg char *replaced_dots = NULL; 1125 1.1 mrg 1126 1.1 mrg if (!name) 1127 1.1 mrg { 1128 1.1 mrg s << "\t.callprototype "; 1129 1.1 mrg name = "_"; 1130 1.1 mrg } 1131 1.1 mrg else 1132 1.1 mrg { 1133 1.1 mrg const char *replacement = nvptx_name_replacement (name); 1134 1.1 mrg if (replacement != name) 1135 1.1 mrg name = replacement; 1136 1.1 mrg else 1137 1.1 mrg { 1138 1.1 mrg replaced_dots = nvptx_replace_dot (name); 1139 1.1 mrg if (replaced_dots) 1140 1.1 mrg name = replaced_dots; 1141 1.1 mrg } 1142 1.1 mrg write_fn_marker (s, false, true, name); 1143 1.1 mrg s << "\t.extern .func "; 1144 1.1 mrg } 1145 1.1 mrg 1146 1.1 mrg if (result != NULL_RTX) 1147 1.1 mrg write_return_mode (s, true, GET_MODE (result)); 1148 1.1 mrg 1149 1.1 mrg s << name; 1150 1.1 mrg if (replaced_dots) 1151 1.1 mrg XDELETE (replaced_dots); 1152 1.1 mrg 1153 1.1 mrg int arg_end = XVECLEN (pat, 0); 1154 1.1 mrg for (int i = 1; i < arg_end; i++) 1155 1.1 mrg { 1156 1.1 mrg /* We don't have to deal with mode splitting & promotion here, 1157 1.1 mrg as that was already done when generating the call 1158 1.1 mrg sequence. 
*/ 1159 1.1 mrg machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0)); 1160 1.1 mrg 1161 1.1 mrg write_arg_mode (s, -1, i - 1, mode); 1162 1.1 mrg } 1163 1.1 mrg if (arg_end != 1) 1164 1.1 mrg s << ")"; 1165 1.1 mrg s << ";\n"; 1166 1.1 mrg } 1167 1.1 mrg 1168 1.1 mrg /* DECL is an external FUNCTION_DECL, make sure its in the fndecl hash 1169 1.1 mrg table and write a ptx prototype. These are emitted at end of 1170 1.1 mrg compilation. */ 1171 1.1 mrg 1172 1.1 mrg static void 1173 1.1 mrg nvptx_record_fndecl (tree decl) 1174 1.1 mrg { 1175 1.1 mrg tree *slot = declared_fndecls_htab->find_slot (decl, INSERT); 1176 1.1 mrg if (*slot == NULL) 1177 1.1 mrg { 1178 1.1 mrg *slot = decl; 1179 1.1 mrg const char *name = get_fnname_from_decl (decl); 1180 1.1 mrg write_fn_proto (func_decls, false, name, decl); 1181 1.1 mrg } 1182 1.1 mrg } 1183 1.1 mrg 1184 1.1 mrg /* Record a libcall or unprototyped external function. CALLEE is the 1185 1.1 mrg SYMBOL_REF. Insert into the libfunc hash table and emit a ptx 1186 1.1 mrg declaration for it. */ 1187 1.1 mrg 1188 1.1 mrg static void 1189 1.1 mrg nvptx_record_libfunc (rtx callee, rtx retval, rtx pat) 1190 1.1 mrg { 1191 1.1 mrg rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT); 1192 1.1 mrg if (*slot == NULL) 1193 1.1 mrg { 1194 1.1 mrg *slot = callee; 1195 1.1 mrg 1196 1.1 mrg const char *name = XSTR (callee, 0); 1197 1.1 mrg write_fn_proto_from_insn (func_decls, name, retval, pat); 1198 1.1 mrg } 1199 1.1 mrg } 1200 1.1 mrg 1201 1.1 mrg /* DECL is an external FUNCTION_DECL, that we're referencing. If it 1202 1.1 mrg is prototyped, record it now. Otherwise record it as needed at end 1203 1.1 mrg of compilation, when we might have more information about it. 
*/ 1204 1.1 mrg 1205 1.1 mrg void 1206 1.1 mrg nvptx_record_needed_fndecl (tree decl) 1207 1.1 mrg { 1208 1.1 mrg if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE) 1209 1.1 mrg { 1210 1.1 mrg tree *slot = needed_fndecls_htab->find_slot (decl, INSERT); 1211 1.1 mrg if (*slot == NULL) 1212 1.1 mrg *slot = decl; 1213 1.1 mrg } 1214 1.1 mrg else 1215 1.1 mrg nvptx_record_fndecl (decl); 1216 1.1 mrg } 1217 1.1 mrg 1218 1.1 mrg /* SYM is a SYMBOL_REF. If it refers to an external function, record 1219 1.1 mrg it as needed. */ 1220 1.1 mrg 1221 1.1 mrg static void 1222 1.1 mrg nvptx_maybe_record_fnsym (rtx sym) 1223 1.1 mrg { 1224 1.1 mrg tree decl = SYMBOL_REF_DECL (sym); 1225 1.1 mrg 1226 1.1 mrg if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl)) 1227 1.1 mrg nvptx_record_needed_fndecl (decl); 1228 1.1 mrg } 1229 1.1 mrg 1230 1.1 mrg /* Emit a local array to hold some part of a conventional stack frame 1231 1.1 mrg and initialize REGNO to point to it. If the size is zero, it'll 1232 1.1 mrg never be valid to dereference, so we can simply initialize to 1233 1.1 mrg zero. */ 1234 1.1 mrg 1235 1.1 mrg static void 1236 1.1 mrg init_frame (FILE *file, int regno, unsigned align, unsigned size) 1237 1.1 mrg { 1238 1.1 mrg if (size) 1239 1.1 mrg fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n", 1240 1.1 mrg align, reg_names[regno], size); 1241 1.1 mrg fprintf (file, "\t.reg.u%d %s;\n", 1242 1.1 mrg POINTER_SIZE, reg_names[regno]); 1243 1.1 mrg fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n" 1244 1.1 mrg : "\tmov.u%d %s, 0;\n"), 1245 1.1 mrg POINTER_SIZE, reg_names[regno], reg_names[regno]); 1246 1.1 mrg } 1247 1.1 mrg 1248 1.1 mrg /* Emit soft stack frame setup sequence. */ 1249 1.1 mrg 1250 1.1 mrg static void 1251 1.1 mrg init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size) 1252 1.1 mrg { 1253 1.1 mrg /* Maintain 64-bit stack alignment. 
*/ 1254 1.1 mrg unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT; 1255 1.1 mrg size = ROUND_UP (size, keep_align); 1256 1.1 mrg int bits = POINTER_SIZE; 1257 1.1 mrg const char *reg_stack = reg_names[STACK_POINTER_REGNUM]; 1258 1.1 mrg const char *reg_frame = reg_names[FRAME_POINTER_REGNUM]; 1259 1.1 mrg const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM]; 1260 1.1 mrg const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM]; 1261 1.1 mrg fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack); 1262 1.1 mrg fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame); 1263 1.1 mrg fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot); 1264 1.1 mrg fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev); 1265 1.1 mrg fprintf (file, "\t{\n"); 1266 1.1 mrg fprintf (file, "\t\t.reg.u32 %%fstmp0;\n"); 1267 1.1 mrg fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits); 1268 1.1 mrg fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits); 1269 1.1 mrg fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n"); 1270 1.1 mrg fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n", 1271 1.1 mrg bits == 64 ? ".wide" : ".lo", bits / 8); 1272 1.1 mrg fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits); 1273 1.1 mrg 1274 1.1 mrg /* Initialize %sspslot = &__nvptx_stacks[tid.y]. */ 1275 1.1 mrg fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot); 1276 1.1 mrg 1277 1.1 mrg /* Initialize %sspprev = __nvptx_stacks[tid.y]. */ 1278 1.1 mrg fprintf (file, "\t\tld.shared.u%d %s, [%s];\n", 1279 1.1 mrg bits, reg_sspprev, reg_sspslot); 1280 1.1 mrg 1281 1.1 mrg /* Initialize %frame = %sspprev - size. */ 1282 1.1 mrg fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n", 1283 1.1 mrg bits, reg_frame, reg_sspprev, size); 1284 1.1 mrg 1285 1.1 mrg /* Apply alignment, if larger than 64. 
*/ 1286 1.1 mrg if (alignment > keep_align) 1287 1.1 mrg fprintf (file, "\t\tand.b%d %s, %s, %d;\n", 1288 1.1 mrg bits, reg_frame, reg_frame, -alignment); 1289 1.1 mrg 1290 1.1 mrg size = crtl->outgoing_args_size; 1291 1.1 mrg gcc_assert (size % keep_align == 0); 1292 1.1 mrg 1293 1.1 mrg /* Initialize %stack. */ 1294 1.1 mrg fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n", 1295 1.1 mrg bits, reg_stack, reg_frame, size); 1296 1.1 mrg 1297 1.1 mrg if (!crtl->is_leaf) 1298 1.1 mrg fprintf (file, "\t\tst.shared.u%d [%s], %s;\n", 1299 1.1 mrg bits, reg_sspslot, reg_stack); 1300 1.1 mrg fprintf (file, "\t}\n"); 1301 1.1 mrg cfun->machine->has_softstack = true; 1302 1.1 mrg need_softstack_decl = true; 1303 1.1 mrg } 1304 1.1 mrg 1305 1.1 mrg /* Emit code to initialize the REGNO predicate register to indicate 1306 1.1 mrg whether we are not lane zero on the NAME axis. */ 1307 1.1 mrg 1308 1.1 mrg static void 1309 1.1 mrg nvptx_init_axis_predicate (FILE *file, int regno, const char *name) 1310 1.1 mrg { 1311 1.1 mrg fprintf (file, "\t{\n"); 1312 1.1 mrg fprintf (file, "\t\t.reg.u32\t%%%s;\n", name); 1313 1.1 mrg if (strcmp (name, "x") == 0 && cfun->machine->red_partition) 1314 1.1 mrg { 1315 1.1 mrg fprintf (file, "\t\t.reg.u64\t%%t_red;\n"); 1316 1.1 mrg fprintf (file, "\t\t.reg.u64\t%%y64;\n"); 1317 1.1 mrg } 1318 1.1 mrg fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name); 1319 1.1 mrg fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name); 1320 1.1 mrg if (strcmp (name, "x") == 0 && cfun->machine->red_partition) 1321 1.1 mrg { 1322 1.1 mrg fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n"); 1323 1.1 mrg fprintf (file, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n"); 1324 1.1 mrg fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; " 1325 1.1 mrg "// vector reduction buffer\n", 1326 1.1 mrg REGNO (cfun->machine->red_partition), 1327 1.1 mrg vector_red_partition); 1328 1.1 mrg } 1329 1.1 mrg /* Verify vector_red_size. 
*/ 1330 1.1 mrg gcc_assert (vector_red_partition * nvptx_mach_max_workers () 1331 1.1 mrg <= vector_red_size); 1332 1.1 mrg fprintf (file, "\t}\n"); 1333 1.1 mrg } 1334 1.1 mrg 1335 1.1 mrg /* Emit code to initialize OpenACC worker broadcast and synchronization 1336 1.1 mrg registers. */ 1337 1.1 mrg 1338 1.1 mrg static void 1339 1.1 mrg nvptx_init_oacc_workers (FILE *file) 1340 1.1 mrg { 1341 1.1 mrg fprintf (file, "\t{\n"); 1342 1.1 mrg fprintf (file, "\t\t.reg.u32\t%%tidy;\n"); 1343 1.1 mrg if (cfun->machine->bcast_partition) 1344 1.1 mrg { 1345 1.1 mrg fprintf (file, "\t\t.reg.u64\t%%t_bcast;\n"); 1346 1.1 mrg fprintf (file, "\t\t.reg.u64\t%%y64;\n"); 1347 1.1 mrg } 1348 1.1 mrg fprintf (file, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n"); 1349 1.1 mrg if (cfun->machine->bcast_partition) 1350 1.1 mrg { 1351 1.1 mrg fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n"); 1352 1.1 mrg fprintf (file, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n"); 1353 1.1 mrg fprintf (file, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n"); 1354 1.1 mrg fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; " 1355 1.1 mrg "// vector broadcast offset\n", 1356 1.1 mrg REGNO (cfun->machine->bcast_partition), 1357 1.1 mrg oacc_bcast_partition); 1358 1.1 mrg } 1359 1.1 mrg /* Verify oacc_bcast_size. */ 1360 1.1 mrg gcc_assert (oacc_bcast_partition * (nvptx_mach_max_workers () + 1) 1361 1.1 mrg <= oacc_bcast_size); 1362 1.1 mrg if (cfun->machine->sync_bar) 1363 1.1 mrg fprintf (file, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; " 1364 1.1 mrg "// vector synchronization barrier\n", 1365 1.1 mrg REGNO (cfun->machine->sync_bar)); 1366 1.1 mrg fprintf (file, "\t}\n"); 1367 1.1 mrg } 1368 1.1 mrg 1369 1.1 mrg /* Emit code to initialize predicate and master lane index registers for 1370 1.1 mrg -muniform-simt code generation variant. 
*/ 1371 1.1 mrg 1372 1.1 mrg static void 1373 1.1 mrg nvptx_init_unisimt_predicate (FILE *file) 1374 1.1 mrg { 1375 1.1 mrg cfun->machine->unisimt_location = gen_reg_rtx (Pmode); 1376 1.1 mrg int loc = REGNO (cfun->machine->unisimt_location); 1377 1.1 mrg int bits = POINTER_SIZE; 1378 1.1 mrg fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc); 1379 1.1 mrg fprintf (file, "\t{\n"); 1380 1.1 mrg fprintf (file, "\t\t.reg.u32 %%ustmp0;\n"); 1381 1.1 mrg fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits); 1382 1.1 mrg fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n"); 1383 1.1 mrg fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n", 1384 1.1 mrg bits == 64 ? ".wide" : ".lo"); 1385 1.1 mrg fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc); 1386 1.1 mrg fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc); 1387 1.1 mrg if (cfun->machine->unisimt_predicate) 1388 1.1 mrg { 1389 1.1 mrg int master = REGNO (cfun->machine->unisimt_master); 1390 1.1 mrg int pred = REGNO (cfun->machine->unisimt_predicate); 1391 1.1 mrg fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc); 1392 1.1 mrg if (cfun->machine->unisimt_outside_simt_predicate) 1393 1.1 mrg { 1394 1.1 mrg int pred_outside_simt 1395 1.1 mrg = REGNO (cfun->machine->unisimt_outside_simt_predicate); 1396 1.1 mrg fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, 0;\n", 1397 1.1 mrg pred_outside_simt, master); 1398 1.1 mrg } 1399 1.1 mrg fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n"); 1400 1.1 mrg /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'. */ 1401 1.1 mrg fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master); 1402 1.1 mrg /* Compute predicate as 'tid.x == master'. 
*/ 1403 1.1 mrg fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master); 1404 1.1 mrg } 1405 1.1 mrg fprintf (file, "\t}\n"); 1406 1.1 mrg need_unisimt_decl = true; 1407 1.1 mrg } 1408 1.1 mrg 1409 1.1 mrg /* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region: 1410 1.1 mrg 1411 1.1 mrg extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg); 1412 1.1 mrg void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize) 1413 1.1 mrg { 1414 1.1 mrg __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1); 1415 1.1 mrg __nvptx_uni[tid.y] = 0; 1416 1.1 mrg gomp_nvptx_main (ORIG, arg); 1417 1.1 mrg } 1418 1.1 mrg ORIG itself should not be emitted as a PTX .entry function. */ 1419 1.1 mrg 1420 1.1 mrg static void 1421 1.1 mrg write_omp_entry (FILE *file, const char *name, const char *orig) 1422 1.1 mrg { 1423 1.1 mrg static bool gomp_nvptx_main_declared; 1424 1.1 mrg if (!gomp_nvptx_main_declared) 1425 1.1 mrg { 1426 1.1 mrg gomp_nvptx_main_declared = true; 1427 1.1 mrg write_fn_marker (func_decls, false, true, "gomp_nvptx_main"); 1428 1.1 mrg func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE 1429 1.1 mrg << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n"; 1430 1.1 mrg } 1431 1.1 mrg /* PR79332. Single out this string; it confuses gcc.pot generation. 
*/ 1432 1.1 mrg #define NTID_Y "%ntid.y" 1433 1.1 mrg #define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\ 1434 1.1 mrg (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\ 1435 1.1 mrg {\n\ 1436 1.1 mrg .reg.u32 %r<3>;\n\ 1437 1.1 mrg .reg.u" PS " %R<4>;\n\ 1438 1.1 mrg mov.u32 %r0, %tid.y;\n\ 1439 1.1 mrg mov.u32 %r1, " NTID_Y ";\n\ 1440 1.1 mrg mov.u32 %r2, %ctaid.x;\n\ 1441 1.1 mrg cvt.u" PS ".u32 %R1, %r0;\n\ 1442 1.1 mrg " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\ 1443 1.1 mrg mov.u" PS " %R0, __nvptx_stacks;\n\ 1444 1.1 mrg " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\ 1445 1.1 mrg ld.param.u" PS " %R2, [%stack];\n\ 1446 1.1 mrg ld.param.u" PS " %R3, [%sz];\n\ 1447 1.1 mrg add.u" PS " %R2, %R2, %R3;\n\ 1448 1.1 mrg mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\ 1449 1.1 mrg st.shared.u" PS " [%R0], %R2;\n\ 1450 1.1 mrg mov.u" PS " %R0, __nvptx_uni;\n\ 1451 1.1 mrg " MAD_PS_32 " %R0, %r0, 4, %R0;\n\ 1452 1.1 mrg mov.u32 %r0, 0;\n\ 1453 1.1 mrg st.shared.u32 [%R0], %r0;\n\ 1454 1.1 mrg mov.u" PS " %R0, \0;\n\ 1455 1.1 mrg ld.param.u" PS " %R1, [%arg];\n\ 1456 1.1 mrg {\n\ 1457 1.1 mrg .param.u" PS " %P<2>;\n\ 1458 1.1 mrg st.param.u" PS " [%P0], %R0;\n\ 1459 1.1 mrg st.param.u" PS " [%P1], %R1;\n\ 1460 1.1 mrg call.uni gomp_nvptx_main, (%P0, %P1);\n\ 1461 1.1 mrg }\n\ 1462 1.1 mrg ret.uni;\n\ 1463 1.1 mrg }\n" 1464 1.1 mrg static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32"); 1465 1.1 mrg static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 "); 1466 1.1 mrg #undef ENTRY_TEMPLATE 1467 1.1 mrg #undef NTID_Y 1468 1.1 mrg const char *entry_1 = TARGET_ABI64 ? entry64 : entry32; 1469 1.1 mrg /* Position ENTRY_2 after the embedded nul using strlen of the prefix. 
*/ 1470 1.1 mrg const char *entry_2 = entry_1 + strlen (entry64) + 1; 1471 1.1 mrg fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2); 1472 1.1 mrg need_softstack_decl = need_unisimt_decl = true; 1473 1.1 mrg } 1474 1.1 mrg 1475 1.1 mrg /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx 1476 1.1 mrg function, including local var decls and copies from the arguments to 1477 1.1 mrg local regs. */ 1478 1.1 mrg 1479 1.1 mrg void 1480 1.1 mrg nvptx_declare_function_name (FILE *file, const char *name, const_tree decl) 1481 1.1 mrg { 1482 1.1 mrg tree fntype = TREE_TYPE (decl); 1483 1.1 mrg tree result_type = TREE_TYPE (fntype); 1484 1.1 mrg int argno = 0; 1485 1.1 mrg 1486 1.1 mrg if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl)) 1487 1.1 mrg && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl))) 1488 1.1 mrg { 1489 1.1 mrg char *buf = (char *) alloca (strlen (name) + sizeof ("$impl")); 1490 1.1 mrg sprintf (buf, "%s$impl", name); 1491 1.1 mrg write_omp_entry (file, name, buf); 1492 1.1 mrg name = buf; 1493 1.1 mrg } 1494 1.1 mrg /* We construct the initial part of the function into a string 1495 1.1 mrg stream, in order to share the prototype writing code. */ 1496 1.1 mrg std::stringstream s; 1497 1.1 mrg write_fn_proto (s, true, name, decl); 1498 1.1 mrg s << "{\n"; 1499 1.1 mrg 1500 1.1 mrg bool return_in_mem = write_return_type (s, false, result_type); 1501 1.1 mrg if (return_in_mem) 1502 1.1 mrg argno = write_arg_type (s, 0, argno, ptr_type_node, true); 1503 1.1 mrg 1504 1.1 mrg /* Declare and initialize incoming arguments. */ 1505 1.1 mrg tree args = TYPE_ARG_TYPES (fntype); 1506 1.1 mrg bool prototyped = true; 1507 1.1 mrg if (!args) 1508 1.1 mrg { 1509 1.1 mrg args = DECL_ARGUMENTS (decl); 1510 1.1 mrg prototyped = false; 1511 1.1 mrg } 1512 1.1 mrg 1513 1.1 mrg for (; args != NULL_TREE; args = TREE_CHAIN (args)) 1514 1.1 mrg { 1515 1.1 mrg tree type = prototyped ? 
TREE_VALUE (args) : TREE_TYPE (args); 1516 1.1 mrg 1517 1.1 mrg argno = write_arg_type (s, 0, argno, type, prototyped); 1518 1.1 mrg } 1519 1.1 mrg 1520 1.1 mrg if (stdarg_p (fntype)) 1521 1.1 mrg argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node, 1522 1.1 mrg true); 1523 1.1 mrg 1524 1.1 mrg if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain) 1525 1.1 mrg write_arg_type (s, STATIC_CHAIN_REGNUM, 1526 1.1 mrg DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node, 1527 1.1 mrg true); 1528 1.1 mrg 1529 1.1 mrg fprintf (file, "%s", s.str().c_str()); 1530 1.1 mrg 1531 1.1 mrg /* Usually 'crtl->is_leaf' is computed during register allocator 1532 1.1 mrg initialization (which is not done on NVPTX) or for pressure-sensitive 1533 1.1 mrg optimizations. Initialize it here, except if already set. */ 1534 1.1 mrg if (!crtl->is_leaf) 1535 1.1 mrg crtl->is_leaf = leaf_function_p (); 1536 1.1 mrg 1537 1.1 mrg HOST_WIDE_INT sz = get_frame_size (); 1538 1.1 mrg bool need_frameptr = sz || cfun->machine->has_chain; 1539 1.1 mrg int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT; 1540 1.1 mrg if (!TARGET_SOFT_STACK) 1541 1.1 mrg { 1542 1.1 mrg /* Declare a local var for outgoing varargs. */ 1543 1.1 mrg if (cfun->machine->has_varadic) 1544 1.1 mrg init_frame (file, STACK_POINTER_REGNUM, 1545 1.1 mrg UNITS_PER_WORD, crtl->outgoing_args_size); 1546 1.1 mrg 1547 1.1 mrg /* Declare a local variable for the frame. Force its size to be 1548 1.1 mrg DImode-compatible. 
*/ 1549 1.1 mrg if (need_frameptr) 1550 1.1 mrg init_frame (file, FRAME_POINTER_REGNUM, alignment, 1551 1.1 mrg ROUND_UP (sz, GET_MODE_SIZE (DImode))); 1552 1.1 mrg } 1553 1.1 mrg else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca 1554 1.1 mrg || (cfun->machine->has_simtreg && !crtl->is_leaf)) 1555 1.1 mrg init_softstack_frame (file, alignment, sz); 1556 1.1 mrg 1557 1.1 mrg if (cfun->machine->has_simtreg) 1558 1.1 mrg { 1559 1.1 mrg unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size; 1560 1.1 mrg unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align; 1561 1.1 mrg align = MAX (align, GET_MODE_SIZE (DImode)); 1562 1.1 mrg if (!crtl->is_leaf || cfun->calls_alloca) 1563 1.1 mrg simtsz = HOST_WIDE_INT_M1U; 1564 1.1 mrg if (simtsz == HOST_WIDE_INT_M1U) 1565 1.1 mrg simtsz = nvptx_softstack_size; 1566 1.1 mrg if (cfun->machine->has_softstack) 1567 1.1 mrg simtsz += POINTER_SIZE / 8; 1568 1.1 mrg simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode)); 1569 1.1 mrg if (align > GET_MODE_SIZE (DImode)) 1570 1.1 mrg simtsz += align - GET_MODE_SIZE (DImode); 1571 1.1 mrg if (simtsz) 1572 1.1 mrg fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar[" 1573 1.1 mrg HOST_WIDE_INT_PRINT_DEC "];\n", simtsz); 1574 1.1 mrg } 1575 1.1 mrg 1576 1.1 mrg /* Restore the vector reduction partition register, if necessary. 1577 1.1 mrg FIXME: Find out when and why this is necessary, and fix it. */ 1578 1.1 mrg if (cfun->machine->red_partition) 1579 1.1 mrg regno_reg_rtx[REGNO (cfun->machine->red_partition)] 1580 1.1 mrg = cfun->machine->red_partition; 1581 1.1 mrg 1582 1.1 mrg /* Declare the pseudos we have as ptx registers. 
*/ 1583 1.1 mrg int maxregs = max_reg_num (); 1584 1.1 mrg for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++) 1585 1.1 mrg { 1586 1.1 mrg if (regno_reg_rtx[i] != const0_rtx) 1587 1.1 mrg { 1588 1.1 mrg machine_mode mode = PSEUDO_REGNO_MODE (i); 1589 1.1 mrg machine_mode split = maybe_split_mode (mode); 1590 1.1 mrg 1591 1.1 mrg if (split_mode_p (mode)) 1592 1.1 mrg mode = split; 1593 1.1 mrg fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true)); 1594 1.1 mrg output_reg (file, i, split, -2); 1595 1.1 mrg fprintf (file, ";\n"); 1596 1.1 mrg } 1597 1.1 mrg } 1598 1.1 mrg 1599 1.1 mrg /* Emit axis predicates. */ 1600 1.1 mrg if (cfun->machine->axis_predicate[0]) 1601 1.1 mrg nvptx_init_axis_predicate (file, 1602 1.1 mrg REGNO (cfun->machine->axis_predicate[0]), "y"); 1603 1.1 mrg if (cfun->machine->axis_predicate[1]) 1604 1.1 mrg nvptx_init_axis_predicate (file, 1605 1.1 mrg REGNO (cfun->machine->axis_predicate[1]), "x"); 1606 1.1 mrg if (cfun->machine->unisimt_predicate 1607 1.1 mrg || (cfun->machine->has_simtreg && !crtl->is_leaf)) 1608 1.1 mrg nvptx_init_unisimt_predicate (file); 1609 1.1 mrg if (cfun->machine->bcast_partition || cfun->machine->sync_bar) 1610 1.1 mrg nvptx_init_oacc_workers (file); 1611 1.1 mrg } 1612 1.1 mrg 1613 1.1 mrg /* Output code for switching uniform-simt state. ENTERING indicates whether 1614 1.1 mrg we are entering or leaving non-uniform execution region. */ 1615 1.1 mrg 1616 1.1 mrg static void 1617 1.1 mrg nvptx_output_unisimt_switch (FILE *file, bool entering) 1618 1.1 mrg { 1619 1.1 mrg if (crtl->is_leaf && !cfun->machine->unisimt_predicate) 1620 1.1 mrg return; 1621 1.1 mrg fprintf (file, "\t{\n"); 1622 1.1 mrg fprintf (file, "\t\t.reg.u32 %%ustmp2;\n"); 1623 1.1 mrg fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? 
-1 : 0); 1624 1.1 mrg if (cfun->machine->unisimt_outside_simt_predicate) 1625 1.1 mrg { 1626 1.1 mrg int pred_outside_simt 1627 1.1 mrg = REGNO (cfun->machine->unisimt_outside_simt_predicate); 1628 1.1 mrg fprintf (file, "\t\tmov.pred %%r%d, %d;\n", pred_outside_simt, 1629 1.1 mrg entering ? 0 : 1); 1630 1.1 mrg } 1631 1.1 mrg if (!crtl->is_leaf) 1632 1.1 mrg { 1633 1.1 mrg int loc = REGNO (cfun->machine->unisimt_location); 1634 1.1 mrg fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc); 1635 1.1 mrg } 1636 1.1 mrg if (cfun->machine->unisimt_predicate) 1637 1.1 mrg { 1638 1.1 mrg int master = REGNO (cfun->machine->unisimt_master); 1639 1.1 mrg int pred = REGNO (cfun->machine->unisimt_predicate); 1640 1.1 mrg fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n"); 1641 1.1 mrg fprintf (file, "\t\tmov.u32 %%r%d, %s;\n", 1642 1.1 mrg master, entering ? "%ustmp2" : "0"); 1643 1.1 mrg fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master); 1644 1.1 mrg } 1645 1.1 mrg fprintf (file, "\t}\n"); 1646 1.1 mrg } 1647 1.1 mrg 1648 1.1 mrg /* Output code for allocating per-lane storage and switching soft-stack pointer. 1649 1.1 mrg ENTERING indicates whether we are entering or leaving non-uniform execution. 1650 1.1 mrg PTR is the register pointing to allocated storage, it is assigned to on 1651 1.1 mrg entering and used to restore state on leaving. SIZE and ALIGN are used only 1652 1.1 mrg on entering. 
*/ 1653 1.1 mrg 1654 1.1 mrg static void 1655 1.1 mrg nvptx_output_softstack_switch (FILE *file, bool entering, 1656 1.1 mrg rtx ptr, rtx size, rtx align) 1657 1.1 mrg { 1658 1.1 mrg gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr)); 1659 1.1 mrg if (crtl->is_leaf && !cfun->machine->simt_stack_size) 1660 1.1 mrg return; 1661 1.1 mrg int bits = POINTER_SIZE, regno = REGNO (ptr); 1662 1.1 mrg fprintf (file, "\t{\n"); 1663 1.1 mrg if (entering) 1664 1.1 mrg { 1665 1.1 mrg fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + " 1666 1.1 mrg HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno, 1667 1.1 mrg cfun->machine->simt_stack_size); 1668 1.1 mrg fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno); 1669 1.1 mrg if (CONST_INT_P (size)) 1670 1.1 mrg fprintf (file, HOST_WIDE_INT_PRINT_DEC, 1671 1.1 mrg ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode))); 1672 1.1 mrg else 1673 1.1 mrg output_reg (file, REGNO (size), VOIDmode); 1674 1.1 mrg fputs (";\n", file); 1675 1.1 mrg if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode)) 1676 1.1 mrg fprintf (file, 1677 1.1 mrg "\t\tand.b%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n", 1678 1.1 mrg bits, regno, regno, UINTVAL (align)); 1679 1.1 mrg } 1680 1.1 mrg if (cfun->machine->has_softstack) 1681 1.1 mrg { 1682 1.1 mrg const char *reg_stack = reg_names[STACK_POINTER_REGNUM]; 1683 1.1 mrg if (entering) 1684 1.1 mrg { 1685 1.1 mrg fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n", 1686 1.1 mrg bits, regno, bits / 8, reg_stack); 1687 1.1 mrg fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n", 1688 1.1 mrg bits, reg_stack, regno, bits / 8); 1689 1.1 mrg } 1690 1.1 mrg else 1691 1.1 mrg { 1692 1.1 mrg fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n", 1693 1.1 mrg bits, reg_stack, regno, bits / 8); 1694 1.1 mrg } 1695 1.1 mrg nvptx_output_set_softstack (REGNO (stack_pointer_rtx)); 1696 1.1 mrg } 1697 1.1 mrg fprintf (file, "\t}\n"); 1698 1.1 mrg } 1699 1.1 mrg 1700 1.1 mrg /* Output code to enter non-uniform 
execution region. DEST is a register 1701 1.1 mrg to hold a per-lane allocation given by SIZE and ALIGN. */ 1702 1.1 mrg 1703 1.1 mrg const char * 1704 1.1 mrg nvptx_output_simt_enter (rtx dest, rtx size, rtx align) 1705 1.1 mrg { 1706 1.1 mrg nvptx_output_unisimt_switch (asm_out_file, true); 1707 1.1 mrg nvptx_output_softstack_switch (asm_out_file, true, dest, size, align); 1708 1.1 mrg return ""; 1709 1.1 mrg } 1710 1.1 mrg 1711 1.1 mrg /* Output code to leave non-uniform execution region. SRC is the register 1712 1.1 mrg holding per-lane storage previously allocated by omp_simt_enter insn. */ 1713 1.1 mrg 1714 1.1 mrg const char * 1715 1.1 mrg nvptx_output_simt_exit (rtx src) 1716 1.1 mrg { 1717 1.1 mrg nvptx_output_unisimt_switch (asm_out_file, false); 1718 1.1 mrg nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX); 1719 1.1 mrg return ""; 1720 1.1 mrg } 1721 1.1 mrg 1722 1.1 mrg /* Output instruction that sets soft stack pointer in shared memory to the 1723 1.1 mrg value in register given by SRC_REGNO. */ 1724 1.1 mrg 1725 1.1 mrg const char * 1726 1.1 mrg nvptx_output_set_softstack (unsigned src_regno) 1727 1.1 mrg { 1728 1.1 mrg if (cfun->machine->has_softstack && !crtl->is_leaf) 1729 1.1 mrg { 1730 1.1 mrg fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ", 1731 1.1 mrg POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]); 1732 1.1 mrg output_reg (asm_out_file, src_regno, VOIDmode); 1733 1.1 mrg fprintf (asm_out_file, ";\n"); 1734 1.1 mrg } 1735 1.1 mrg return ""; 1736 1.1 mrg } 1737 1.1 mrg /* Output a return instruction. Also copy the return value to its outgoing 1738 1.1 mrg location. 
*/ 1739 1.1 mrg 1740 1.1 mrg const char * 1741 1.1 mrg nvptx_output_return (void) 1742 1.1 mrg { 1743 1.1 mrg machine_mode mode = (machine_mode)cfun->machine->return_mode; 1744 1.1 mrg 1745 1.1 mrg if (mode != VOIDmode) 1746 1.1 mrg fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n", 1747 1.1 mrg nvptx_ptx_type_from_mode (mode, false), 1748 1.1 mrg reg_names[NVPTX_RETURN_REGNUM], 1749 1.1 mrg reg_names[NVPTX_RETURN_REGNUM]); 1750 1.1 mrg 1751 1.1 mrg return "ret;"; 1752 1.1 mrg } 1753 1.1 mrg 1754 1.1 mrg /* Terminate a function by writing a closing brace to FILE. */ 1755 1.1 mrg 1756 1.1 mrg void 1757 1.1 mrg nvptx_function_end (FILE *file) 1758 1.1 mrg { 1759 1.1 mrg fprintf (file, "}\n"); 1760 1.1 mrg } 1761 1.1 mrg 1762 1.1 mrg /* Decide whether we can make a sibling call to a function. For ptx, we 1764 1.1 mrg can't. */ 1765 1.1 mrg 1766 1.1 mrg static bool 1767 1.1 mrg nvptx_function_ok_for_sibcall (tree, tree) 1768 1.1 mrg { 1769 1.1 mrg return false; 1770 1.1 mrg } 1771 1.1 mrg 1772 1.1 mrg /* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */ 1773 1.1 mrg 1774 1.1 mrg static rtx 1775 1.1 mrg nvptx_get_drap_rtx (void) 1776 1.1 mrg { 1777 1.1 mrg if (TARGET_SOFT_STACK && stack_realign_drap) 1778 1.1 mrg return arg_pointer_rtx; 1779 1.1 mrg return NULL_RTX; 1780 1.1 mrg } 1781 1.1 mrg 1782 1.1 mrg /* Implement the TARGET_CALL_ARGS hook. Record information about one 1783 1.1 mrg argument to the next call. 
*/ 1784 1.1 mrg 1785 1.1 mrg static void 1786 1.1 mrg nvptx_call_args (rtx arg, tree fntype) 1787 1.1 mrg { 1788 1.1 mrg if (!cfun->machine->doing_call) 1789 1.1 mrg { 1790 1.1 mrg cfun->machine->doing_call = true; 1791 1.1 mrg cfun->machine->is_varadic = false; 1792 1.1 mrg cfun->machine->num_args = 0; 1793 1.1 mrg 1794 1.1 mrg if (fntype && stdarg_p (fntype)) 1795 1.1 mrg { 1796 1.1 mrg cfun->machine->is_varadic = true; 1797 1.1 mrg cfun->machine->has_varadic = true; 1798 1.1 mrg cfun->machine->num_args++; 1799 1.1 mrg } 1800 1.1 mrg } 1801 1.1 mrg 1802 1.1 mrg if (REG_P (arg) && arg != pc_rtx) 1803 1.1 mrg { 1804 1.1 mrg cfun->machine->num_args++; 1805 1.1 mrg cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg, 1806 1.1 mrg cfun->machine->call_args); 1807 1.1 mrg } 1808 1.1 mrg } 1809 1.1 mrg 1810 1.1 mrg /* Implement the corresponding END_CALL_ARGS hook. Clear and free the 1811 1.1 mrg information we recorded. */ 1812 1.1 mrg 1813 1.1 mrg static void 1814 1.1 mrg nvptx_end_call_args (void) 1815 1.1 mrg { 1816 1.1 mrg cfun->machine->doing_call = false; 1817 1.1 mrg free_EXPR_LIST_list (&cfun->machine->call_args); 1818 1.1 mrg } 1819 1.1 mrg 1820 1.1 mrg /* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep 1821 1.1 mrg track of whether calls involving static chains or varargs were seen 1822 1.1 mrg in the current function. 1823 1.1 mrg For libcalls, maintain a hash table of decls we have seen, and 1824 1.1 mrg record a function decl for later when encountering a new one. 
*/ 1825 1.1 mrg 1826 1.1 mrg void 1827 1.1 mrg nvptx_expand_call (rtx retval, rtx address) 1828 1.1 mrg { 1829 1.1 mrg rtx callee = XEXP (address, 0); 1830 1.1 mrg rtx varargs = NULL_RTX; 1831 1.1 mrg unsigned parallel = 0; 1832 1.1 mrg 1833 1.1 mrg if (!call_insn_operand (callee, Pmode)) 1834 1.1 mrg { 1835 1.1 mrg callee = force_reg (Pmode, callee); 1836 1.1 mrg address = change_address (address, QImode, callee); 1837 1.1 mrg } 1838 1.1 mrg 1839 1.1 mrg if (GET_CODE (callee) == SYMBOL_REF) 1840 1.1 mrg { 1841 1.1 mrg tree decl = SYMBOL_REF_DECL (callee); 1842 1.1 mrg if (decl != NULL_TREE) 1843 1.1 mrg { 1844 1.1 mrg if (DECL_STATIC_CHAIN (decl)) 1845 1.1 mrg cfun->machine->has_chain = true; 1846 1.1 mrg 1847 1.1 mrg tree attr = oacc_get_fn_attrib (decl); 1848 1.1 mrg if (attr) 1849 1.1 mrg { 1850 1.1 mrg tree dims = TREE_VALUE (attr); 1851 1.1 mrg 1852 1.1 mrg parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1; 1853 1.1 mrg for (int ix = 0; ix != GOMP_DIM_MAX; ix++) 1854 1.1 mrg { 1855 1.1 mrg if (TREE_PURPOSE (dims) 1856 1.1 mrg && !integer_zerop (TREE_PURPOSE (dims))) 1857 1.1 mrg break; 1858 1.1 mrg /* Not on this axis. 
*/ 1859 1.1 mrg parallel ^= GOMP_DIM_MASK (ix); 1860 1.1 mrg dims = TREE_CHAIN (dims); 1861 1.1 mrg } 1862 1.1 mrg } 1863 1.1 mrg } 1864 1.1 mrg } 1865 1.1 mrg 1866 1.1 mrg unsigned nargs = cfun->machine->num_args; 1867 1.1 mrg if (cfun->machine->is_varadic) 1868 1.1 mrg { 1869 1.1 mrg varargs = gen_reg_rtx (Pmode); 1870 1.1 mrg emit_move_insn (varargs, stack_pointer_rtx); 1871 1.1 mrg } 1872 1.1 mrg 1873 1.1 mrg rtvec vec = rtvec_alloc (nargs + 1); 1874 1.1 mrg rtx pat = gen_rtx_PARALLEL (VOIDmode, vec); 1875 1.1 mrg int vec_pos = 0; 1876 1.1 mrg 1877 1.1 mrg rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx); 1878 1.1 mrg rtx tmp_retval = retval; 1879 1.1 mrg if (retval) 1880 1.1 mrg { 1881 1.1 mrg if (!nvptx_register_operand (retval, GET_MODE (retval))) 1882 1.1 mrg tmp_retval = gen_reg_rtx (GET_MODE (retval)); 1883 1.1 mrg call = gen_rtx_SET (tmp_retval, call); 1884 1.1 mrg } 1885 1.1 mrg XVECEXP (pat, 0, vec_pos++) = call; 1886 1.1 mrg 1887 1.1 mrg /* Construct the call insn, including a USE for each argument pseudo 1888 1.1 mrg register. These will be used when printing the insn. */ 1889 1.1 mrg for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1)) 1890 1.1 mrg XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0)); 1891 1.1 mrg 1892 1.1 mrg if (varargs) 1893 1.1 mrg XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs); 1894 1.1 mrg 1895 1.1 mrg gcc_assert (vec_pos == XVECLEN (pat, 0)); 1896 1.1 mrg 1897 1.1 mrg nvptx_emit_forking (parallel, true); 1898 1.1 mrg emit_call_insn (pat); 1899 1.1 mrg nvptx_emit_joining (parallel, true); 1900 1.1 mrg 1901 1.1 mrg if (tmp_retval != retval) 1902 1.1 mrg emit_move_insn (retval, tmp_retval); 1903 1.1 mrg } 1904 1.1 mrg 1905 1.1 mrg /* Emit a comparison COMPARE, and return the new test to be used in the 1906 1.1 mrg jump. 
*/ 1907 1.1 mrg 1908 1.1 mrg rtx 1909 1.1 mrg nvptx_expand_compare (rtx compare) 1910 1.1 mrg { 1911 1.1 mrg rtx pred = gen_reg_rtx (BImode); 1912 1.1 mrg rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode, 1913 1.1 mrg XEXP (compare, 0), XEXP (compare, 1)); 1914 1.1 mrg emit_insn (gen_rtx_SET (pred, cmp)); 1915 1.1 mrg return gen_rtx_NE (BImode, pred, const0_rtx); 1916 1.1 mrg } 1917 1.1 mrg 1918 1.1 mrg /* Expand the oacc fork & join primitive into ptx-required unspecs. */ 1919 1.1 mrg 1920 1.1 mrg void 1921 1.1 mrg nvptx_expand_oacc_fork (unsigned mode) 1922 1.1 mrg { 1923 1.1 mrg nvptx_emit_forking (GOMP_DIM_MASK (mode), false); 1924 1.1 mrg } 1925 1.1 mrg 1926 1.1 mrg void 1927 1.1 mrg nvptx_expand_oacc_join (unsigned mode) 1928 1.1 mrg { 1929 1.1 mrg nvptx_emit_joining (GOMP_DIM_MASK (mode), false); 1930 1.1 mrg } 1931 1.1 mrg 1932 1.1 mrg /* Generate instruction(s) to unpack a 64 bit object into 2 32 bit 1933 1.1 mrg objects. */ 1934 1.1 mrg 1935 1.1 mrg static rtx 1936 1.1 mrg nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src) 1937 1.1 mrg { 1938 1.1 mrg rtx res; 1939 1.1 mrg 1940 1.1 mrg switch (GET_MODE (src)) 1941 1.1 mrg { 1942 1.1 mrg case E_DImode: 1943 1.1 mrg res = gen_unpackdisi2 (dst0, dst1, src); 1944 1.1 mrg break; 1945 1.1 mrg case E_DFmode: 1946 1.1 mrg res = gen_unpackdfsi2 (dst0, dst1, src); 1947 1.1 mrg break; 1948 1.1 mrg default: gcc_unreachable (); 1949 1.1 mrg } 1950 1.1 mrg return res; 1951 1.1 mrg } 1952 1.1 mrg 1953 1.1 mrg /* Generate instruction(s) to pack 2 32 bit objects into a 64 bit 1954 1.1 mrg object. 
*/ 1955 1.1 mrg 1956 1.1 mrg static rtx 1957 1.1 mrg nvptx_gen_pack (rtx dst, rtx src0, rtx src1) 1958 1.1 mrg { 1959 1.1 mrg rtx res; 1960 1.1 mrg 1961 1.1 mrg switch (GET_MODE (dst)) 1962 1.1 mrg { 1963 1.1 mrg case E_DImode: 1964 1.1 mrg res = gen_packsidi2 (dst, src0, src1); 1965 1.1 mrg break; 1966 1.1 mrg case E_DFmode: 1967 1.1 mrg res = gen_packsidf2 (dst, src0, src1); 1968 1.1 mrg break; 1969 1.1 mrg default: gcc_unreachable (); 1970 1.1 mrg } 1971 1.1 mrg return res; 1972 1.1 mrg } 1973 1.1 mrg 1974 1.1 mrg /* Generate an instruction or sequence to broadcast register REG 1975 1.1 mrg across the vectors of a single warp. */ 1976 1.1 mrg 1977 1.1 mrg rtx 1978 1.1 mrg nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind) 1979 1.1 mrg { 1980 1.1 mrg rtx res; 1981 1.1 mrg 1982 1.1 mrg switch (GET_MODE (dst)) 1983 1.1 mrg { 1984 1.1 mrg case E_DCmode: 1985 1.1 mrg case E_CDImode: 1986 1.1 mrg { 1987 1.1 mrg gcc_assert (GET_CODE (dst) == CONCAT); 1988 1.1 mrg gcc_assert (GET_CODE (src) == CONCAT); 1989 1.1 mrg rtx dst_real = XEXP (dst, 0); 1990 1.1 mrg rtx dst_imag = XEXP (dst, 1); 1991 1.1 mrg rtx src_real = XEXP (src, 0); 1992 1.1 mrg rtx src_imag = XEXP (src, 1); 1993 1.1 mrg 1994 1.1 mrg start_sequence (); 1995 1.1 mrg emit_insn (nvptx_gen_shuffle (dst_real, src_real, idx, kind)); 1996 1.1 mrg emit_insn (nvptx_gen_shuffle (dst_imag, src_imag, idx, kind)); 1997 1.1 mrg res = get_insns (); 1998 1.1 mrg end_sequence (); 1999 1.1 mrg } 2000 1.1 mrg break; 2001 1.1 mrg case E_SImode: 2002 1.1 mrg res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind)); 2003 1.1 mrg break; 2004 1.1 mrg case E_SFmode: 2005 1.1 mrg res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind)); 2006 1.1 mrg break; 2007 1.1 mrg case E_DImode: 2008 1.1 mrg case E_DFmode: 2009 1.1 mrg { 2010 1.1 mrg rtx tmp0 = gen_reg_rtx (SImode); 2011 1.1 mrg rtx tmp1 = gen_reg_rtx (SImode); 2012 1.1 mrg 2013 1.1 mrg start_sequence (); 2014 1.1 mrg emit_insn (nvptx_gen_unpack (tmp0, 
tmp1, src)); 2015 1.1 mrg emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind)); 2016 1.1 mrg emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind)); 2017 1.1 mrg emit_insn (nvptx_gen_pack (dst, tmp0, tmp1)); 2018 1.1 mrg res = get_insns (); 2019 1.1 mrg end_sequence (); 2020 1.1 mrg } 2021 1.1 mrg break; 2022 1.1 mrg case E_V2SImode: 2023 1.1 mrg { 2024 1.1 mrg rtx src0 = gen_rtx_SUBREG (SImode, src, 0); 2025 1.1 mrg rtx src1 = gen_rtx_SUBREG (SImode, src, 4); 2026 1.1 mrg rtx dst0 = gen_rtx_SUBREG (SImode, dst, 0); 2027 1.1 mrg rtx dst1 = gen_rtx_SUBREG (SImode, dst, 4); 2028 1.1 mrg rtx tmp0 = gen_reg_rtx (SImode); 2029 1.1 mrg rtx tmp1 = gen_reg_rtx (SImode); 2030 1.1 mrg start_sequence (); 2031 1.1 mrg emit_insn (gen_movsi (tmp0, src0)); 2032 1.1 mrg emit_insn (gen_movsi (tmp1, src1)); 2033 1.1 mrg emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind)); 2034 1.1 mrg emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind)); 2035 1.1 mrg emit_insn (gen_movsi (dst0, tmp0)); 2036 1.1 mrg emit_insn (gen_movsi (dst1, tmp1)); 2037 1.1 mrg res = get_insns (); 2038 1.1 mrg end_sequence (); 2039 1.1 mrg } 2040 1.1 mrg break; 2041 1.1 mrg case E_V2DImode: 2042 1.1 mrg { 2043 1.1 mrg rtx src0 = gen_rtx_SUBREG (DImode, src, 0); 2044 1.1 mrg rtx src1 = gen_rtx_SUBREG (DImode, src, 8); 2045 1.1 mrg rtx dst0 = gen_rtx_SUBREG (DImode, dst, 0); 2046 1.1 mrg rtx dst1 = gen_rtx_SUBREG (DImode, dst, 8); 2047 1.1 mrg rtx tmp0 = gen_reg_rtx (DImode); 2048 1.1 mrg rtx tmp1 = gen_reg_rtx (DImode); 2049 1.1 mrg start_sequence (); 2050 1.1 mrg emit_insn (gen_movdi (tmp0, src0)); 2051 1.1 mrg emit_insn (gen_movdi (tmp1, src1)); 2052 1.1 mrg emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind)); 2053 1.1 mrg emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind)); 2054 1.1 mrg emit_insn (gen_movdi (dst0, tmp0)); 2055 1.1 mrg emit_insn (gen_movdi (dst1, tmp1)); 2056 1.1 mrg res = get_insns (); 2057 1.1 mrg end_sequence (); 2058 1.1 mrg } 2059 1.1 mrg break; 2060 1.1 mrg case E_BImode: 2061 1.1 
mrg { 2062 1.1 mrg rtx tmp = gen_reg_rtx (SImode); 2063 1.1 mrg 2064 1.1 mrg start_sequence (); 2065 1.1 mrg emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx)); 2066 1.1 mrg emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind)); 2067 1.1 mrg emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx))); 2068 1.1 mrg res = get_insns (); 2069 1.1 mrg end_sequence (); 2070 1.1 mrg } 2071 1.1 mrg break; 2072 1.1 mrg case E_QImode: 2073 1.1 mrg case E_HImode: 2074 1.1 mrg { 2075 1.1 mrg rtx tmp = gen_reg_rtx (SImode); 2076 1.1 mrg 2077 1.1 mrg start_sequence (); 2078 1.1 mrg emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src))); 2079 1.1 mrg emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind)); 2080 1.1 mrg emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst), 2081 1.1 mrg tmp))); 2082 1.1 mrg res = get_insns (); 2083 1.1 mrg end_sequence (); 2084 1.1 mrg } 2085 1.1 mrg break; 2086 1.1 mrg 2087 1.1 mrg default: 2088 1.1 mrg gcc_unreachable (); 2089 1.1 mrg } 2090 1.1 mrg return res; 2091 1.1 mrg } 2092 1.1 mrg 2093 1.1 mrg /* Generate an instruction or sequence to broadcast register REG 2094 1.1 mrg across the vectors of a single warp. */ 2095 1.1 mrg 2096 1.1 mrg static rtx 2097 1.1 mrg nvptx_gen_warp_bcast (rtx reg) 2098 1.1 mrg { 2099 1.1 mrg return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX); 2100 1.1 mrg } 2101 1.1 mrg 2102 1.1 mrg /* Structure used when generating a worker-level spill or fill. */ 2103 1.1 mrg 2104 1.1 mrg struct broadcast_data_t 2105 1.1 mrg { 2106 1.1 mrg rtx base; /* Register holding base addr of buffer. */ 2107 1.1 mrg rtx ptr; /* Iteration var, if needed. */ 2108 1.1 mrg unsigned offset; /* Offset into worker buffer. */ 2109 1.1 mrg }; 2110 1.1 mrg 2111 1.1 mrg /* Direction of the spill/fill and looping setup/teardown indicator. 
*/ 2112 1.1 mrg 2113 1.1 mrg enum propagate_mask 2114 1.1 mrg { 2115 1.1 mrg PM_read = 1 << 0, 2116 1.1 mrg PM_write = 1 << 1, 2117 1.1 mrg PM_loop_begin = 1 << 2, 2118 1.1 mrg PM_loop_end = 1 << 3, 2119 1.1 mrg 2120 1.1 mrg PM_read_write = PM_read | PM_write 2121 1.1 mrg }; 2122 1.1 mrg 2123 1.1 mrg /* Generate instruction(s) to spill or fill register REG to/from the 2124 1.1 mrg worker broadcast array. PM indicates what is to be done, REP 2125 1.1 mrg how many loop iterations will be executed (0 for not a loop). */ 2126 1.1 mrg 2127 1.1 mrg static rtx 2128 1.1 mrg nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep, 2129 1.1 mrg broadcast_data_t *data, bool vector) 2130 1.1 mrg { 2131 1.1 mrg rtx res; 2132 1.1 mrg machine_mode mode = GET_MODE (reg); 2133 1.1 mrg 2134 1.1 mrg switch (mode) 2135 1.1 mrg { 2136 1.1 mrg case E_BImode: 2137 1.1 mrg { 2138 1.1 mrg rtx tmp = gen_reg_rtx (SImode); 2139 1.1 mrg 2140 1.1 mrg start_sequence (); 2141 1.1 mrg if (pm & PM_read) 2142 1.1 mrg emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx)); 2143 1.1 mrg emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector)); 2144 1.1 mrg if (pm & PM_write) 2145 1.1 mrg emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx))); 2146 1.1 mrg res = get_insns (); 2147 1.1 mrg end_sequence (); 2148 1.1 mrg } 2149 1.1 mrg break; 2150 1.1 mrg 2151 1.1 mrg default: 2152 1.1 mrg { 2153 1.1 mrg rtx addr = data->ptr; 2154 1.1 mrg 2155 1.1 mrg if (!addr) 2156 1.1 mrg { 2157 1.1 mrg unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT; 2158 1.1 mrg 2159 1.1 mrg oacc_bcast_align = MAX (oacc_bcast_align, align); 2160 1.1 mrg data->offset = ROUND_UP (data->offset, align); 2161 1.1 mrg addr = data->base; 2162 1.1 mrg gcc_assert (data->base != NULL); 2163 1.1 mrg if (data->offset) 2164 1.1 mrg addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset)); 2165 1.1 mrg } 2166 1.1 mrg 2167 1.1 mrg addr = gen_rtx_MEM (mode, addr); 2168 1.1 mrg if (pm == PM_read) 2169 
1.1 mrg res = gen_rtx_SET (addr, reg); 2170 1.1 mrg else if (pm == PM_write) 2171 1.1 mrg res = gen_rtx_SET (reg, addr); 2172 1.1 mrg else 2173 1.1 mrg gcc_unreachable (); 2174 1.1 mrg 2175 1.1 mrg if (data->ptr) 2176 1.1 mrg { 2177 1.1 mrg /* We're using a ptr, increment it. */ 2178 1.1 mrg start_sequence (); 2179 1.1 mrg 2180 1.1 mrg emit_insn (res); 2181 1.1 mrg emit_insn (gen_adddi3 (data->ptr, data->ptr, 2182 1.1 mrg GEN_INT (GET_MODE_SIZE (GET_MODE (reg))))); 2183 1.1 mrg res = get_insns (); 2184 1.1 mrg end_sequence (); 2185 1.1 mrg } 2186 1.1 mrg else 2187 1.1 mrg rep = 1; 2188 1.1 mrg data->offset += rep * GET_MODE_SIZE (GET_MODE (reg)); 2189 1.1 mrg } 2190 1.1 mrg break; 2191 1.1 mrg } 2192 1.1 mrg return res; 2193 1.1 mrg } 2194 1.1 mrg 2195 1.1 mrg /* Returns true if X is a valid address for use in a memory reference. */ 2197 1.1 mrg 2198 1.1 mrg static bool 2199 1.1 mrg nvptx_legitimate_address_p (machine_mode, rtx x, bool) 2200 1.1 mrg { 2201 1.1 mrg enum rtx_code code = GET_CODE (x); 2202 1.1 mrg 2203 1.1 mrg switch (code) 2204 1.1 mrg { 2205 1.1 mrg case REG: 2206 1.1 mrg return true; 2207 1.1 mrg 2208 1.1 mrg case PLUS: 2209 1.1 mrg if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1))) 2210 1.1 mrg return true; 2211 1.1 mrg return false; 2212 1.1 mrg 2213 1.1 mrg case CONST: 2214 1.1 mrg case SYMBOL_REF: 2215 1.1 mrg case LABEL_REF: 2216 1.1 mrg return true; 2217 1.1 mrg 2218 1.1 mrg default: 2219 1.1 mrg return false; 2220 1.1 mrg } 2221 1.1 mrg } 2222 1.1 mrg 2223 1.1 mrg /* Machinery to output constant initializers. When beginning an 2225 1.1 mrg initializer, we decide on a fragment size (which is visible in ptx 2226 1.1 mrg in the type used), and then all initializer data is buffered until 2227 1.1 mrg a fragment is filled and ready to be written out. */ 2228 1.1 mrg 2229 1.1 mrg static struct 2230 1.1 mrg { 2231 1.1 mrg unsigned HOST_WIDE_INT mask; /* Mask for storing fragment. 
*/ 2232 1.1 mrg unsigned HOST_WIDE_INT val; /* Current fragment value. */ 2233 1.1 mrg unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written 2234 1.1 mrg out. */ 2235 1.1 mrg unsigned size; /* Fragment size to accumulate. */ 2236 1.1 mrg unsigned offset; /* Offset within current fragment. */ 2237 1.1 mrg bool started; /* Whether we've output any initializer. */ 2238 1.1 mrg } init_frag; 2239 1.1 mrg 2240 1.1 mrg /* The current fragment is full, write it out. SYM may provide a 2241 1.1 mrg symbolic reference we should output, in which case the fragment 2242 1.1 mrg value is the addend. */ 2243 1.1 mrg 2244 1.1 mrg static void 2245 1.1 mrg output_init_frag (rtx sym) 2246 1.1 mrg { 2247 1.1 mrg fprintf (asm_out_file, init_frag.started ? ", " : " = { "); 2248 1.1 mrg unsigned HOST_WIDE_INT val = init_frag.val; 2249 1.1 mrg 2250 1.1 mrg init_frag.started = true; 2251 1.1 mrg init_frag.val = 0; 2252 1.1 mrg init_frag.offset = 0; 2253 1.1 mrg init_frag.remaining--; 2254 1.1 mrg 2255 1.1 mrg if (sym) 2256 1.1 mrg { 2257 1.1 mrg bool function = (SYMBOL_REF_DECL (sym) 2258 1.1 mrg && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL)); 2259 1.1 mrg if (!function) 2260 1.1 mrg fprintf (asm_out_file, "generic("); 2261 1.1 mrg output_address (VOIDmode, sym); 2262 1.1 mrg if (!function) 2263 1.1 mrg fprintf (asm_out_file, ")"); 2264 1.1 mrg if (val) 2265 1.1 mrg fprintf (asm_out_file, " + "); 2266 1.1 mrg } 2267 1.1 mrg 2268 1.1 mrg if (!sym || val) 2269 1.1 mrg fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val); 2270 1.1 mrg } 2271 1.1 mrg 2272 1.1 mrg /* Add value VAL of size SIZE to the data we're emitting, and keep 2273 1.1 mrg writing out chunks as they fill up. */ 2274 1.1 mrg 2275 1.1 mrg static void 2276 1.1 mrg nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size) 2277 1.1 mrg { 2278 1.1 mrg bool negative_p 2279 1.1 mrg = val & (HOST_WIDE_INT_1U << (HOST_BITS_PER_WIDE_INT - 1)); 2280 1.1 mrg 2281 1.1 mrg /* Avoid undefined behaviour. 
*/ 2282 1.1 mrg if (size * BITS_PER_UNIT < HOST_BITS_PER_WIDE_INT) 2283 1.1 mrg val &= (HOST_WIDE_INT_1U << (size * BITS_PER_UNIT)) - 1; 2284 1.1 mrg 2285 1.1 mrg for (unsigned part = 0; size; size -= part) 2286 1.1 mrg { 2287 1.1 mrg if (part * BITS_PER_UNIT == HOST_BITS_PER_WIDE_INT) 2288 1.1 mrg /* Avoid undefined behaviour. */ 2289 1.1 mrg val = negative_p ? -1 : 0; 2290 1.1 mrg else 2291 1.1 mrg val >>= (part * BITS_PER_UNIT); 2292 1.1 mrg part = init_frag.size - init_frag.offset; 2293 1.1 mrg part = MIN (part, size); 2294 1.1 mrg 2295 1.1 mrg unsigned HOST_WIDE_INT partial 2296 1.1 mrg = val << (init_frag.offset * BITS_PER_UNIT); 2297 1.1 mrg init_frag.val |= partial & init_frag.mask; 2298 1.1 mrg init_frag.offset += part; 2299 1.1 mrg 2300 1.1 mrg if (init_frag.offset == init_frag.size) 2301 1.1 mrg output_init_frag (NULL); 2302 1.1 mrg } 2303 1.1 mrg } 2304 1.1 mrg 2305 1.1 mrg /* Target hook for assembling integer object X of size SIZE. */ 2306 1.1 mrg 2307 1.1 mrg static bool 2308 1.1 mrg nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p)) 2309 1.1 mrg { 2310 1.1 mrg HOST_WIDE_INT val = 0; 2311 1.1 mrg 2312 1.1 mrg switch (GET_CODE (x)) 2313 1.1 mrg { 2314 1.1 mrg default: 2315 1.1 mrg /* Let the generic machinery figure it out, usually for a 2316 1.1 mrg CONST_WIDE_INT. 
*/ 2317 1.1 mrg return false; 2318 1.1 mrg 2319 1.1 mrg case CONST_INT: 2320 1.1 mrg nvptx_assemble_value (INTVAL (x), size); 2321 1.1 mrg break; 2322 1.1 mrg 2323 1.1 mrg case CONST: 2324 1.1 mrg x = XEXP (x, 0); 2325 1.1 mrg gcc_assert (GET_CODE (x) == PLUS); 2326 1.1 mrg val = INTVAL (XEXP (x, 1)); 2327 1.1 mrg x = XEXP (x, 0); 2328 1.1 mrg gcc_assert (GET_CODE (x) == SYMBOL_REF); 2329 1.1 mrg gcc_fallthrough (); /* FALLTHROUGH */ 2330 1.1 mrg 2331 1.1 mrg case SYMBOL_REF: 2332 1.1 mrg gcc_assert (size == init_frag.size); 2333 1.1 mrg if (init_frag.offset) 2334 1.1 mrg sorry ("cannot emit unaligned pointers in ptx assembly"); 2335 1.1 mrg 2336 1.1 mrg nvptx_maybe_record_fnsym (x); 2337 1.1 mrg init_frag.val = val; 2338 1.1 mrg output_init_frag (x); 2339 1.1 mrg break; 2340 1.1 mrg } 2341 1.1 mrg 2342 1.1 mrg return true; 2343 1.1 mrg } 2344 1.1 mrg 2345 1.1 mrg /* Output SIZE zero bytes. We ignore the FILE argument since the 2346 1.1 mrg functions we're calling to perform the output just use 2347 1.1 mrg asm_out_file. */ 2348 1.1 mrg 2349 1.1 mrg void 2350 1.1 mrg nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size) 2351 1.1 mrg { 2352 1.1 mrg /* Finish the current fragment, if it's started. */ 2353 1.1 mrg if (init_frag.offset) 2354 1.1 mrg { 2355 1.1 mrg unsigned part = init_frag.size - init_frag.offset; 2356 1.1 mrg part = MIN (part, (unsigned)size); 2357 1.1 mrg size -= part; 2358 1.1 mrg nvptx_assemble_value (0, part); 2359 1.1 mrg } 2360 1.1 mrg 2361 1.1 mrg /* If this skip doesn't terminate the initializer, write as many 2362 1.1 mrg remaining pieces as possible directly. 
*/ 2363 1.1 mrg if (size < init_frag.remaining * init_frag.size) 2364 1.1 mrg { 2365 1.1 mrg while (size >= init_frag.size) 2366 1.1 mrg { 2367 1.1 mrg size -= init_frag.size; 2368 1.1 mrg output_init_frag (NULL_RTX); 2369 1.1 mrg } 2370 1.1 mrg if (size) 2371 1.1 mrg nvptx_assemble_value (0, size); 2372 1.1 mrg } 2373 1.1 mrg } 2374 1.1 mrg 2375 1.1 mrg /* Output a string STR with length SIZE. As in nvptx_output_skip we 2376 1.1 mrg ignore the FILE arg. */ 2377 1.1 mrg 2378 1.1 mrg void 2379 1.1 mrg nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size) 2380 1.1 mrg { 2381 1.1 mrg for (unsigned HOST_WIDE_INT i = 0; i < size; i++) 2382 1.1 mrg nvptx_assemble_value (str[i], 1); 2383 1.1 mrg } 2384 1.1 mrg 2385 1.1 mrg /* Return true if TYPE is a record type where the last field is an array without 2386 1.1 mrg given dimension. */ 2387 1.1 mrg 2388 1.1 mrg static bool 2389 1.1 mrg flexible_array_member_type_p (const_tree type) 2390 1.1 mrg { 2391 1.1 mrg if (TREE_CODE (type) != RECORD_TYPE) 2392 1.1 mrg return false; 2393 1.1 mrg 2394 1.1 mrg const_tree last_field = NULL_TREE; 2395 1.1 mrg for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f)) 2396 1.1 mrg last_field = f; 2397 1.1 mrg 2398 1.1 mrg if (!last_field) 2399 1.1 mrg return false; 2400 1.1 mrg 2401 1.1 mrg const_tree last_field_type = TREE_TYPE (last_field); 2402 1.1 mrg if (TREE_CODE (last_field_type) != ARRAY_TYPE) 2403 1.1 mrg return false; 2404 1.1 mrg 2405 1.1 mrg return (! TYPE_DOMAIN (last_field_type) 2406 1.1 mrg || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type))); 2407 1.1 mrg } 2408 1.1 mrg 2409 1.1 mrg /* Emit a PTX variable decl and prepare for emission of its 2410 1.1 mrg initializer. NAME is the symbol name and SETION the PTX data 2411 1.1 mrg area. The type is TYPE, object size SIZE and alignment is ALIGN. 2412 1.1 mrg The caller has already emitted any indentation and linkage 2413 1.1 mrg specifier. 
It is responsible for any initializer, terminating ; 2414 1.1 mrg and newline. SIZE is in bytes, ALIGN is in bits -- confusingly 2415 1.1 mrg this is the opposite way round that PTX wants them! */ 2416 1.1 mrg 2417 1.1 mrg static void 2418 1.1 mrg nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section, 2419 1.1 mrg const_tree type, HOST_WIDE_INT size, unsigned align, 2420 1.1 mrg bool undefined = false) 2421 1.1 mrg { 2422 1.1 mrg bool atype = (TREE_CODE (type) == ARRAY_TYPE) 2423 1.1 mrg && (TYPE_DOMAIN (type) == NULL_TREE); 2424 1.1 mrg 2425 1.1 mrg if (undefined && flexible_array_member_type_p (type)) 2426 1.1 mrg { 2427 1.1 mrg size = 0; 2428 1.1 mrg atype = true; 2429 1.1 mrg } 2430 1.1 mrg 2431 1.1 mrg while (TREE_CODE (type) == ARRAY_TYPE) 2432 1.1 mrg type = TREE_TYPE (type); 2433 1.1 mrg 2434 1.1 mrg if (TREE_CODE (type) == VECTOR_TYPE 2435 1.1 mrg || TREE_CODE (type) == COMPLEX_TYPE) 2436 1.1 mrg /* Neither vector nor complex types can contain the other. */ 2437 1.1 mrg type = TREE_TYPE (type); 2438 1.1 mrg 2439 1.1 mrg unsigned HOST_WIDE_INT elt_size = int_size_in_bytes (type); 2440 1.1 mrg 2441 1.1 mrg /* Largest mode we're prepared to accept. For BLKmode types we 2442 1.1 mrg don't know if it'll contain pointer constants, so have to choose 2443 1.1 mrg pointer size, otherwise we can choose DImode. */ 2444 1.1 mrg machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode; 2445 1.1 mrg 2446 1.1 mrg elt_size |= GET_MODE_SIZE (elt_mode); 2447 1.1 mrg elt_size &= -elt_size; /* Extract LSB set. */ 2448 1.1 mrg 2449 1.1 mrg init_frag.size = elt_size; 2450 1.1 mrg /* Avoid undefined shift behavior by using '2'. 
*/ 2451 1.1 mrg init_frag.mask = ((unsigned HOST_WIDE_INT)2 2452 1.1 mrg << (elt_size * BITS_PER_UNIT - 1)) - 1; 2453 1.1 mrg init_frag.val = 0; 2454 1.1 mrg init_frag.offset = 0; 2455 1.1 mrg init_frag.started = false; 2456 1.1 mrg /* Size might not be a multiple of elt size, if there's an 2457 1.1 mrg initialized trailing struct array with smaller type than 2458 1.1 mrg elt_size. */ 2459 1.1 mrg init_frag.remaining = (size + elt_size - 1) / elt_size; 2460 1.1 mrg 2461 1.1 mrg fprintf (file, "%s .align %d .u" HOST_WIDE_INT_PRINT_UNSIGNED " ", 2462 1.1 mrg section, align / BITS_PER_UNIT, 2463 1.1 mrg elt_size * BITS_PER_UNIT); 2464 1.1 mrg assemble_name (file, name); 2465 1.1 mrg 2466 1.1 mrg if (size) 2467 1.1 mrg /* We make everything an array, to simplify any initialization 2468 1.1 mrg emission. */ 2469 1.1 mrg fprintf (file, "[" HOST_WIDE_INT_PRINT_UNSIGNED "]", init_frag.remaining); 2470 1.1 mrg else if (atype) 2471 1.1 mrg fprintf (file, "[]"); 2472 1.1 mrg } 2473 1.1 mrg 2474 1.1 mrg /* Called when the initializer for a decl has been completely output through 2475 1.1 mrg combinations of the three functions above. */ 2476 1.1 mrg 2477 1.1 mrg static void 2478 1.1 mrg nvptx_assemble_decl_end (void) 2479 1.1 mrg { 2480 1.1 mrg if (init_frag.offset) 2481 1.1 mrg /* This can happen with a packed struct with trailing array member. */ 2482 1.1 mrg nvptx_assemble_value (0, init_frag.size - init_frag.offset); 2483 1.1 mrg fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n"); 2484 1.1 mrg } 2485 1.1 mrg 2486 1.1 mrg /* Output an uninitialized common or file-scope variable. */ 2487 1.1 mrg 2488 1.1 mrg void 2489 1.1 mrg nvptx_output_aligned_decl (FILE *file, const char *name, 2490 1.1 mrg const_tree decl, HOST_WIDE_INT size, unsigned align) 2491 1.1 mrg { 2492 1.1 mrg write_var_marker (file, true, TREE_PUBLIC (decl), name); 2493 1.1 mrg 2494 1.1 mrg /* If this is public, it is common. The nearest thing we have to 2495 1.1 mrg common is weak. 
*/ 2496 1.1 mrg fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : ""); 2497 1.1 mrg 2498 1.1 mrg nvptx_assemble_decl_begin (file, name, section_for_decl (decl), 2499 1.1 mrg TREE_TYPE (decl), size, align); 2500 1.1 mrg nvptx_assemble_decl_end (); 2501 1.1 mrg } 2502 1.1 mrg 2503 1.1 mrg /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of 2504 1.1 mrg writing a constant variable EXP with NAME and SIZE and its 2505 1.1 mrg initializer to FILE. */ 2506 1.1 mrg 2507 1.1 mrg static void 2508 1.1 mrg nvptx_asm_declare_constant_name (FILE *file, const char *name, 2509 1.1 mrg const_tree exp, HOST_WIDE_INT obj_size) 2510 1.1 mrg { 2511 1.1 mrg write_var_marker (file, true, false, name); 2512 1.1 mrg 2513 1.1 mrg fprintf (file, "\t"); 2514 1.1 mrg 2515 1.1 mrg tree type = TREE_TYPE (exp); 2516 1.1 mrg nvptx_assemble_decl_begin (file, name, ".const", type, obj_size, 2517 1.1 mrg TYPE_ALIGN (type)); 2518 1.1 mrg } 2519 1.1 mrg 2520 1.1 mrg /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing 2521 1.1 mrg a variable DECL with NAME to FILE. */ 2522 1.1 mrg 2523 1.1 mrg void 2524 1.1 mrg nvptx_declare_object_name (FILE *file, const char *name, const_tree decl) 2525 1.1 mrg { 2526 1.1 mrg write_var_marker (file, true, TREE_PUBLIC (decl), name); 2527 1.1 mrg 2528 1.1 mrg fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? "" 2529 1.1 mrg : DECL_WEAK (decl) ? ".weak " : ".visible ")); 2530 1.1 mrg 2531 1.1 mrg tree type = TREE_TYPE (decl); 2532 1.1 mrg HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl)); 2533 1.1 mrg nvptx_assemble_decl_begin (file, name, section_for_decl (decl), 2534 1.1 mrg type, obj_size, DECL_ALIGN (decl)); 2535 1.1 mrg } 2536 1.1 mrg 2537 1.1 mrg /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing. */ 2538 1.1 mrg 2539 1.1 mrg static void 2540 1.1 mrg nvptx_globalize_label (FILE *, const char *) 2541 1.1 mrg { 2542 1.1 mrg } 2543 1.1 mrg 2544 1.1 mrg /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. 
   Write an extern declaration only for variable DECL with NAME to FILE.  */

static void
nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
{
  /* The middle end can place constant pool decls into the varpool as
     undefined.  Until that is fixed, catch the problem here.  */
  if (DECL_IN_CONSTANT_POOL (decl))
    return;

  /* We support weak definitions, and hence have the right
     ASM_WEAKEN_DECL definition.  Diagnose the problem here.  */
  if (DECL_WEAK (decl))
    error_at (DECL_SOURCE_LOCATION (decl),
	      "PTX does not support weak declarations"
	      " (only weak definitions)");
  write_var_marker (file, false, TREE_PUBLIC (decl), name);

  fprintf (file, "\t.extern ");
  tree size = DECL_SIZE_UNIT (decl);
  /* SIZE may be absent for an incomplete type; emit size 0 then.  */
  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
			     TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
			     DECL_ALIGN (decl), true);
  nvptx_assemble_decl_end ();
}

/* Output a pattern for a move instruction from SRC to DST, returning the
   asm template string (operand 0 = DST, operand 1 = SRC).  */

const char *
nvptx_output_mov_insn (rtx dst, rtx src)
{
  machine_mode dst_mode = GET_MODE (dst);
  machine_mode src_mode = GET_MODE (src);
  machine_mode dst_inner = (GET_CODE (dst) == SUBREG
			    ? GET_MODE (XEXP (dst, 0)) : dst_mode);
  /* NOTE(review): the non-SUBREG fallback here is dst_mode, not src_mode —
     presumably intentional since both operands of a move share a mode, but
     worth confirming against upstream.  */
  machine_mode src_inner = (GET_CODE (src) == SUBREG
			    ? GET_MODE (XEXP (src, 0)) : dst_mode);

  /* Peel a CONST wrapper to find a possible SYMBOL_REF source.  */
  rtx sym = src;
  if (GET_CODE (sym) == CONST)
    sym = XEXP (XEXP (sym, 0), 0);
  if (SYMBOL_REF_P (sym))
    {
      /* Addresses in a non-generic data area need a cvta to convert to
	 a generic address.  */
      if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
	return "%.\tcvta%D1%t0\t%0, %1;";
      nvptx_maybe_record_fnsym (sym);
    }

  if (src_inner == dst_inner)
    return "%.\tmov%t0\t%0, %1;";

  if (CONSTANT_P (src))
    return (GET_MODE_CLASS (dst_inner) == MODE_INT
	    && GET_MODE_CLASS (src_inner) != MODE_FLOAT
	    ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");

  if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
    {
      if (GET_MODE_BITSIZE (dst_mode) == 128
	  && GET_MODE_BITSIZE (src_mode) == 128)
	{
	  /* mov.b128 is not supported.  Move 64 bits at a time instead.  */
	  if (dst_inner == V2DImode && src_inner == TImode)
	    return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
	  else if (dst_inner == TImode && src_inner == V2DImode)
	    return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";

	  gcc_unreachable ();
	}
      return "%.\tmov.b%T0\t%0, %1;";
    }

  if (GET_MODE_BITSIZE (src_inner) == 128
      && GET_MODE_BITSIZE (src_mode) == 64)
    return "%.\tmov.b%T0\t%0, %1;";

  /* Differing sizes: a conversion is required.  */
  return "%.\tcvt%t0%t1\t%0, %1;";
}

/* Output a pre/post barrier for MEM_OPERAND according to MEMMODEL.
   PRE_P selects which side of the atomic operation is being emitted; a
   membar is only printed when MEMMODEL requires ordering on that side.  */

static void
nvptx_output_barrier (rtx *mem_operand, int memmodel, bool pre_p)
{
  bool post_p = !pre_p;

  switch (memmodel)
    {
    case MEMMODEL_RELAXED:
      return;
    case MEMMODEL_CONSUME:
    case MEMMODEL_ACQUIRE:
    case MEMMODEL_SYNC_ACQUIRE:
      /* Acquire: barrier only after the operation.  */
      if (post_p)
	break;
      return;
    case MEMMODEL_RELEASE:
    case MEMMODEL_SYNC_RELEASE:
      /* Release: barrier only before the operation.  */
      if (pre_p)
	break;
      return;
    case MEMMODEL_ACQ_REL:
    case MEMMODEL_SEQ_CST:
    case MEMMODEL_SYNC_SEQ_CST:
      /* pre_p || post_p is always true, so a barrier is emitted on both
	 sides.  */
      if (pre_p || post_p)
	break;
      return;
    default:
      gcc_unreachable ();
    }

  /* %B0 selects the membar scope from the MEM's data area.  */
  output_asm_insn ("%.\tmembar%B0;", mem_operand);
}

/* Output ASM_TEMPLATE for an atomic insn with OPERANDS, bracketed by the
   memory barriers required by the memmodel operand at index MEMMODEL_POS;
   the MEM operand is at index MEM_POS.  Returns "" as the (already
   emitted) template.  */

const char *
nvptx_output_atomic_insn (const char *asm_template, rtx *operands, int mem_pos,
			  int memmodel_pos)
{
  nvptx_output_barrier (&operands[mem_pos], INTVAL (operands[memmodel_pos]),
			true);
  output_asm_insn (asm_template, operands);
  nvptx_output_barrier (&operands[mem_pos], INTVAL (operands[memmodel_pos]),
			false);
  return "";
}

static void nvptx_print_operand (FILE *, rtx, int);

/* Output INSN, which is a call to CALLEE with result RESULT.  For ptx, this
   involves writing .param declarations and in/out copies into them.  For
   indirect calls, also write the .callprototype.
*/

const char *
nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
{
  char buf[16];
  static int labelno;
  /* A register callee means an indirect call, which needs a
     .callprototype label.  */
  bool needs_tgt = register_operand (callee, Pmode);
  rtx pat = PATTERN (insn);
  if (GET_CODE (pat) == COND_EXEC)
    pat = COND_EXEC_CODE (pat);
  int arg_end = XVECLEN (pat, 0);
  tree decl = NULL_TREE;

  /* Open a brace scope so the .param temporaries are local to the call.  */
  fprintf (asm_out_file, "\t{\n");
  if (result != NULL)
    fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
	     nvptx_ptx_type_from_mode (GET_MODE (result), false),
	     reg_names[NVPTX_RETURN_REGNUM]);

  /* Ensure we have a ptx declaration in the output if necessary.  */
  if (GET_CODE (callee) == SYMBOL_REF)
    {
      decl = SYMBOL_REF_DECL (callee);
      if (!decl
	  || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
	nvptx_record_libfunc (callee, result, pat);
      else if (DECL_EXTERNAL (decl))
	nvptx_record_fndecl (decl);
    }

  if (needs_tgt)
    {
      /* Emit the .callprototype under an internal label for the indirect
	 call to reference.  */
      ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
      labelno++;
      ASM_OUTPUT_LABEL (asm_out_file, buf);
      std::stringstream s;
      write_fn_proto_from_insn (s, NULL, result, pat);
      fputs (s.str().c_str(), asm_out_file);
    }

  /* Copy each argument into its own .param temporary.  */
  for (int argno = 1; argno < arg_end; argno++)
    {
      rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
      machine_mode mode = GET_MODE (t);
      const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);

      /* Mode splitting has already been done.  */
      fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
	       "\t\tst.param%s [%%out_arg%d], ",
	       ptx_type, argno, ptx_type, argno);
      output_reg (asm_out_file, REGNO (t), VOIDmode);
      fprintf (asm_out_file, ";\n");
    }

  /* The '.' stands for the call's predicate, if any.  */
  nvptx_print_operand (asm_out_file, NULL_RTX, '.');
  fprintf (asm_out_file, "\t\tcall ");
  if (result != NULL_RTX)
    fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);

  if (decl)
    {
      /* Print the callee's (possibly replaced or dot-mangled) name.  */
      char *replaced_dots = NULL;
      const char *name = get_fnname_from_decl (decl);
      const char *replacement = nvptx_name_replacement (name);
      if (replacement != name)
	name = replacement;
      else
	{
	  replaced_dots = nvptx_replace_dot (name);
	  if (replaced_dots)
	    name = replaced_dots;
	}
      assemble_name (asm_out_file, name);
      if (replaced_dots)
	XDELETE (replaced_dots);
    }
  else
    output_address (VOIDmode, callee);

  /* OPEN holds "(" until the first argument is printed, so the argument
     list parenthesis is only emitted when there are arguments.  */
  const char *open = "(";
  for (int argno = 1; argno < arg_end; argno++)
    {
      fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
      open = "";
    }
  if (decl && DECL_STATIC_CHAIN (decl))
    {
      fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
      open = "";
    }
  if (!open[0])
    fprintf (asm_out_file, ")");

  if (needs_tgt)
    {
      /* Reference the prototype label emitted above.  */
      fprintf (asm_out_file, ", ");
      assemble_name (asm_out_file, buf);
    }
  fprintf (asm_out_file, ";\n");

  if (find_reg_note (insn, REG_NORETURN, NULL))
    {
      /* No return functions confuse the PTX JIT, as it doesn't realize
	 the flow control barrier they imply.  It can seg fault if it
	 encounters what looks like an unexitable loop.  Emit a trailing
	 trap and exit, which it does grok.  */
      fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
      fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
    }

  if (result)
    {
      /* Built once and cached; the template copies the return .param out
	 and closes the brace scope.  */
      static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];

      if (!rval[0])
	/* We must escape the '%' that starts RETURN_REGNUM.  */
	sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
		 reg_names[NVPTX_RETURN_REGNUM]);
      return rval;
    }

  return "}";
}

/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.  '.' (predicate) and
   '#' (rounding mode) are the only punctuation codes.  */

static bool
nvptx_print_operand_punct_valid_p (unsigned char c)
{
  return c == '.' || c == '#';
}

/* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE.
*/

static void
nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
{
  rtx off;
  if (GET_CODE (x) == CONST)
    x = XEXP (x, 0);
  switch (GET_CODE (x))
    {
    case PLUS:
      /* Print "base+offset"; an explicit '+' keeps negative offsets in
	 the "x+-8" form that ptxas accepts.  */
      off = XEXP (x, 1);
      output_address (VOIDmode, XEXP (x, 0));
      fprintf (file, "+");
      output_address (VOIDmode, off);
      break;

    case SYMBOL_REF:
    case LABEL_REF:
      output_addr_const (file, x);
      break;

    default:
      gcc_assert (GET_CODE (x) != MEM);
      nvptx_print_operand (file, x, 0);
      break;
    }
}

/* Write assembly language output for the address ADDR to FILE.  */

static void
nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
{
  nvptx_print_address_operand (file, addr, mode);
}

/* Return the PTX data area referenced by MEM rtx X, by scanning its
   address for a SYMBOL_REF.  Defaults to the generic area when no
   symbol is found.  */

static nvptx_data_area
nvptx_mem_data_area (const_rtx x)
{
  gcc_assert (GET_CODE (x) == MEM);

  const_rtx addr = XEXP (x, 0);
  subrtx_iterator::array_type array;
  FOR_EACH_SUBRTX (iter, array, addr, ALL)
    if (SYMBOL_REF_P (*iter))
      return SYMBOL_DATA_AREA (*iter);

  return DATA_AREA_GENERIC;
}

/* Return true if MEM rtx X may reference .shared memory (either known
   shared, or generic which could alias it).  */

bool
nvptx_mem_maybe_shared_p (const_rtx x)
{
  nvptx_data_area area = nvptx_mem_data_area (x);
  return area == DATA_AREA_SHARED || area == DATA_AREA_GENERIC;
}

/* Print an operand, X, to FILE, with an optional modifier in CODE.

   Meaning of CODE:
   .
 -- print the predicate for the instruction or an empty string for an
      unconditional one.
   # -- print a rounding mode for the instruction

   A -- print a data area for a MEM
   c -- print an opcode suffix for a comparison operator, including a type code
   D -- print a data area for a MEM operand
   S -- print a shuffle kind specified by CONST_INT
   t -- print a type opcode suffix, promoting QImode to 32 bits
   T -- print a type size in bits
   u -- print a type opcode suffix without promotions.
   x -- print a destination operand that may also be a bit bucket.  */

static void
nvptx_print_operand (FILE *file, rtx x, int code)
{
  if (code == '.')
    {
      /* X is ignored; the predicate comes from the insn being output.  */
      x = current_insn_predicate;
      if (x)
	{
	  fputs ("@", file);
	  /* An EQ predicate means execute when the register is false.  */
	  if (GET_CODE (x) == EQ)
	    fputs ("!", file);
	  output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
	}
      return;
    }
  else if (code == '#')
    {
      /* Round-to-nearest-even.  */
      fputs (".rn", file);
      return;
    }

  enum rtx_code x_code = GET_CODE (x);
  machine_mode mode = GET_MODE (x);

  switch (code)
    {
    case 'x':
      /* A destination whose value is unused becomes the '_' bit bucket.  */
      if (current_output_insn != NULL
	  && find_reg_note (current_output_insn, REG_UNUSED, x) != NULL_RTX)
	{
	  fputs ("_", file);
	  return;
	}
      goto common;
    case 'B':
      /* Print a membar scope suffix for MEM X.  */
      if (SYMBOL_REF_P (XEXP (x, 0)))
	switch (SYMBOL_DATA_AREA (XEXP (x, 0)))
	  {
	  case DATA_AREA_GENERIC:
	    /* Assume worst-case: global.  */
	    gcc_fallthrough (); /* FALLTHROUGH.  */
	  case DATA_AREA_GLOBAL:
	    break;
	  case DATA_AREA_SHARED:
	    fputs (".cta", file);
	    return;
	  case DATA_AREA_LOCAL:
	  case DATA_AREA_CONST:
	  case DATA_AREA_PARAM:
	  default:
	    gcc_unreachable ();
	  }

      /* There are 2 cases where membar.sys differs from membar.gl:
	 - host accesses global memory (f.i. systemwide atomics)
	 - 2 or more devices are setup in peer-to-peer mode, and one
	   peer can access global memory of other peer.
	 Neither are currently supported by openMP/OpenACC on nvptx, but
	 that could change, so we default to membar.sys.  We could support
	 this more optimally by adding DATA_AREA_SYS and then emitting
	 .gl for DATA_AREA_GLOBAL and .sys for DATA_AREA_SYS.  */
      fputs (".sys", file);
      return;

    case 'A':
      /* 'A' takes the MEM; reduce to its address and share 'D' handling.  */
      x = XEXP (x, 0);
      gcc_fallthrough (); /* FALLTHROUGH.  */

    case 'D':
      if (GET_CODE (x) == CONST)
	x = XEXP (x, 0);
      if (GET_CODE (x) == PLUS)
	x = XEXP (x, 0);

      if (GET_CODE (x) == SYMBOL_REF)
	fputs (section_for_sym (x), file);
      break;

    case 't':
    case 'u':
      if (x_code == SUBREG)
	{
	  /* Use the mode the subreg actually accesses: the vector element
	     mode for a small subreg of a vector, or the split half for a
	     split-able inner mode.  */
	  machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
	  if (VECTOR_MODE_P (inner_mode)
	      && (GET_MODE_SIZE (mode)
		  <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
	    mode = GET_MODE_INNER (inner_mode);
	  else if (split_mode_p (inner_mode))
	    mode = maybe_split_mode (inner_mode);
	  else
	    mode = inner_mode;
	}
      fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
      break;

    case 'H':
    case 'L':
      /* Print the high ('H') or low ('L') half of a split register.  */
      {
	rtx inner_x = SUBREG_REG (x);
	machine_mode inner_mode = GET_MODE (inner_x);
	machine_mode split = maybe_split_mode (inner_mode);

	output_reg (file, REGNO (inner_x), split,
		    (code == 'H'
		     ? GET_MODE_SIZE (inner_mode) / 2
		     : 0));
      }
      break;

    case 'S':
      {
	nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
	/* Same order as nvptx_shuffle_kind.  */
	static const char *const kinds[] =
	  {".up", ".down", ".bfly", ".idx"};
	fputs (kinds[kind], file);
      }
      break;

    case 'T':
      fprintf (file, "%d", GET_MODE_BITSIZE (mode));
      break;

    case 'j':
      fprintf (file, "@");
      goto common;

    case 'J':
      fprintf (file, "@!");
      goto common;

    case 'c':
      /* Comparison operator: print the PTX compare suffix followed by a
	 type suffix.  */
      mode = GET_MODE (XEXP (x, 0));
      switch (x_code)
	{
	case EQ:
	  fputs (".eq", file);
	  break;
	case NE:
	  if (FLOAT_MODE_P (mode))
	    /* Unordered-or-unequal for floats.  */
	    fputs (".neu", file);
	  else
	    fputs (".ne", file);
	  break;
	case LE:
	case LEU:
	  fputs (".le", file);
	  break;
	case GE:
	case GEU:
	  fputs (".ge", file);
	  break;
	case LT:
	case LTU:
	  fputs (".lt", file);
	  break;
	case GT:
	case GTU:
	  fputs (".gt", file);
	  break;
	case LTGT:
	  fputs (".ne", file);
	  break;
	case UNEQ:
	  fputs (".equ", file);
	  break;
	case UNLE:
	  fputs (".leu", file);
	  break;
	case UNGE:
	  fputs (".geu", file);
	  break;
	case UNLT:
	  fputs (".ltu", file);
	  break;
	case UNGT:
	  fputs (".gtu", file);
	  break;
	case UNORDERED:
	  fputs (".nan", file);
	  break;
	case ORDERED:
	  fputs (".num", file);
	  break;
	default:
	  gcc_unreachable ();
	}
      /* Float and the equality/unsigned comparisons use the plain type
	 suffix; signed orderings need an explicit .sN suffix.  */
      if (FLOAT_MODE_P (mode)
	  || x_code == EQ || x_code == NE
	  || x_code == GEU || x_code == GTU
	  || x_code == LEU || x_code == LTU)
	fputs (nvptx_ptx_type_from_mode (mode, true), file);
      else
	fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
      break;
    default:
    common:
      switch (x_code)
	{
	case SUBREG:
	  {
	    rtx inner_x = SUBREG_REG (x);
	    machine_mode inner_mode = GET_MODE (inner_x);
	    machine_mode split = maybe_split_mode (inner_mode);

	    if (VECTOR_MODE_P (inner_mode)
		&& (GET_MODE_SIZE (mode)
		    <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
	      {
		/* Element of a vector register: print reg.x / reg.y.  */
		output_reg (file, REGNO (inner_x), VOIDmode);
		fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
	      }
	    else if (split_mode_p (inner_mode)
		     && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
	      output_reg (file, REGNO (inner_x), split);
	    else
	      output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
	  }
	  break;

	case REG:
	  output_reg (file, REGNO (x), maybe_split_mode (mode));
	  break;

	case MEM:
	  fputc ('[', file);
	  nvptx_print_address_operand (file, XEXP (x, 0), mode);
	  fputc (']', file);
	  break;

	case CONST_INT:
	  output_addr_const (file, x);
	  break;

	case CONST:
	case SYMBOL_REF:
	case LABEL_REF:
	  /* We could use output_addr_const, but that can print things like
	     "x-8", which breaks ptxas.  Need to ensure it is output as
	     "x+-8".  */
	  nvptx_print_address_operand (file, x, VOIDmode);
	  break;

	case CONST_DOUBLE:
	  /* PTX floating-point literals: 0f<hex32> / 0d<hex64>.  */
	  long vals[2];
	  real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
	  vals[0] &= 0xffffffff;
	  vals[1] &= 0xffffffff;
	  if (mode == SFmode)
	    fprintf (file, "0f%08lx", vals[0]);
	  else
	    fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
	  break;

	case CONST_VECTOR:
	  {
	    unsigned n = CONST_VECTOR_NUNITS (x);
	    fprintf (file, "{ ");
	    for (unsigned i = 0; i < n; ++i)
	      {
		if (i != 0)
		  fprintf (file, ", ");

		rtx elem = CONST_VECTOR_ELT (x, i);
		output_addr_const (file, elem);
	      }
	    fprintf (file, " }");
	  }
	  break;

	default:
	  output_addr_const (file, x);
	}
    }
}

/* Record replacement regs used to deal with subreg operands.  One pool
   exists per replacement mode; entries are reused across insns.  */

struct reg_replace
{
  rtx replacement[MAX_RECOG_OPERANDS]; /* Allocated replacement regs.  */
  machine_mode mode;		       /* Mode of the replacements.  */
  int n_allocated;		       /* Regs created so far.  */
  int n_in_use;			       /* Regs handed out for current insn.  */
};

/* Allocate or reuse a replacement in R and return the rtx.  */

static rtx
get_replacement (struct reg_replace *r)
{
  if (r->n_allocated == r->n_in_use)
    r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
  return r->replacement[r->n_in_use++];
}

/* Clean up subreg operands.  In ptx assembly, everything is typed, and
   the presence of subregs would break the rules for most instructions.
   Replace them with a suitable new register of the right size, plus
   conversion copyin/copyout instructions.  */

static void
nvptx_reorg_subreg (void)
{
  struct reg_replace qiregs, hiregs, siregs, diregs;
  rtx_insn *insn, *next;

  qiregs.n_allocated = 0;
  hiregs.n_allocated = 0;
  siregs.n_allocated = 0;
  diregs.n_allocated = 0;
  qiregs.mode = QImode;
  hiregs.mode = HImode;
  siregs.mode = SImode;
  diregs.mode = DImode;

  for (insn = get_insns (); insn; insn = next)
    {
      next = NEXT_INSN (insn);
      if (!NONDEBUG_INSN_P (insn)
	  || asm_noperands (PATTERN (insn)) >= 0
	  || GET_CODE (PATTERN (insn)) == USE
	  || GET_CODE (PATTERN (insn)) == CLOBBER)
	continue;

      /* Reuse the replacement pools afresh for each insn.  */
      qiregs.n_in_use = 0;
      hiregs.n_in_use = 0;
      siregs.n_in_use = 0;
      diregs.n_in_use = 0;
      extract_insn (insn);
      enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);

      for (int i = 0; i < recog_data.n_operands; i++)
	{
	  rtx op = recog_data.operand[i];
	  if (GET_CODE (op) != SUBREG)
	    continue;

	  rtx inner = SUBREG_REG (op);

	  machine_mode outer_mode = GET_MODE (op);
	  machine_mode inner_mode = GET_MODE (inner);
	  gcc_assert (s_ok);
	  /* NOTE(review): the assert above makes this s_ok test redundant in
	     checking builds; it is kept as a defensive belt-and-braces.  */
	  if (s_ok
	      && (GET_MODE_PRECISION (inner_mode)
		  >= GET_MODE_PRECISION (outer_mode)))
	    continue;
	  gcc_assert (SCALAR_INT_MODE_P (outer_mode));
	  struct reg_replace *r = (outer_mode == QImode ? &qiregs
				   : outer_mode == HImode ? &hiregs
				   : outer_mode == SImode ? &siregs
				   : &diregs);
	  rtx new_reg = get_replacement (r);

	  /* Copy the value in before the insn, unless the operand is
	     write-only.  */
	  if (recog_data.operand_type[i] != OP_OUT)
	    {
	      enum rtx_code code;
	      if (GET_MODE_PRECISION (inner_mode)
		  < GET_MODE_PRECISION (outer_mode))
		code = ZERO_EXTEND;
	      else
		code = TRUNCATE;

	      rtx pat = gen_rtx_SET (new_reg,
				     gen_rtx_fmt_e (code, outer_mode, inner));
	      emit_insn_before (pat, insn);
	    }

	  /* Copy the value back out after the insn, unless the operand is
	     read-only.  */
	  if (recog_data.operand_type[i] != OP_IN)
	    {
	      enum rtx_code code;
	      if (GET_MODE_PRECISION (inner_mode)
		  < GET_MODE_PRECISION (outer_mode))
		code = TRUNCATE;
	      else
		code = ZERO_EXTEND;

	      rtx pat = gen_rtx_SET (inner,
				     gen_rtx_fmt_e (code, inner_mode, new_reg));
	      emit_insn_after (pat, insn);
	    }
	  validate_change (insn, recog_data.operand_loc[i], new_reg, false);
	}
    }
}

/* Return a SImode "master lane index" register for uniform-simt, allocating on
   first use.  */

static rtx
nvptx_get_unisimt_master ()
{
  rtx &master = cfun->machine->unisimt_master;
  return master ? master : master = gen_reg_rtx (SImode);
}

/* Return a BImode "predicate" register for uniform-simt, similar to above.  */

static rtx
nvptx_get_unisimt_predicate ()
{
  rtx &pred = cfun->machine->unisimt_predicate;
  return pred ? pred : pred = gen_reg_rtx (BImode);
}

/* Return a BImode "outside SIMT region" predicate register, allocating on
   first use, similar to above.  */

static rtx
nvptx_get_unisimt_outside_simt_predicate ()
{
  rtx &pred = cfun->machine->unisimt_outside_simt_predicate;
  return pred ? pred : pred = gen_reg_rtx (BImode);
}

/* Return true if given call insn references one of the functions provided by
   the CUDA runtime: malloc, free, vprintf.  */

static bool
nvptx_call_insn_is_syscall_p (rtx_insn *insn)
{
  rtx pat = PATTERN (insn);
  gcc_checking_assert (GET_CODE (pat) == PARALLEL);
  pat = XVECEXP (pat, 0, 0);
  if (GET_CODE (pat) == SET)
    pat = SET_SRC (pat);
  gcc_checking_assert (GET_CODE (pat) == CALL
		       && GET_CODE (XEXP (pat, 0)) == MEM);
  rtx addr = XEXP (XEXP (pat, 0), 0);
  if (GET_CODE (addr) != SYMBOL_REF)
    return false;
  const char *name = XSTR (addr, 0);
  /* Ordinary malloc/free are redirected to __nvptx_{malloc,free), so only the
     references with forced assembler name refer to PTX syscalls.  For vprintf,
     accept both normal and forced-assembler-name references.  */
  return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
	  || !strcmp (name, "*malloc")
	  || !strcmp (name, "*free"));
}

/* If SET subexpression of INSN sets a register, emit a shuffle instruction to
   propagate its value from lane MASTER to current lane.
   Returns true if a shuffle was emitted.  */

static bool
nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
{
  rtx reg;
  if (GET_CODE (set) == SET
      && REG_P (reg = SET_DEST (set))
      && find_reg_note (insn, REG_UNUSED, reg) == NULL_RTX)
    {
      emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX),
		       insn);
      return true;
    }

  return false;
}

/* Wrap INSN's pattern in a COND_EXEC guarded by PRED != 0.  */

static void
predicate_insn (rtx_insn *insn, rtx pred)
{
  rtx pat = PATTERN (insn);
  pred = gen_rtx_NE (BImode, pred, const0_rtx);
  pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
  bool changed_p = validate_change (insn, &PATTERN (insn), pat, false);
  gcc_assert (changed_p);
}

/* Adjust code for uniform-simt code generation variant by making atomics and
   "syscalls" conditionally executed, and inserting shuffle-based propagation
   for registers being set.  */

static void
nvptx_reorg_uniform_simt ()
{
  rtx_insn *insn, *next;

  for (insn = get_insns (); insn; insn = next)
    {
      next = NEXT_INSN (insn);

      /* Skip NOTE, USE, etc.  */
      if (!INSN_P (insn) || recog_memoized (insn) == -1)
	continue;

      if (CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
	{
	  /* Handle syscall.  */
	}
      else if (get_attr_atomic (insn))
	{
	  /* Handle atomic insn.  */
	}
      else
	continue;

      /* Propagate any register the insn sets from the master lane to all
	 lanes, via a shuffle inserted after the insn.  */
      rtx pat = PATTERN (insn);
      rtx master = nvptx_get_unisimt_master ();
      bool shuffle_p = false;
      switch (GET_CODE (pat))
	{
	case PARALLEL:
	  for (int i = 0; i < XVECLEN (pat, 0); i++)
	    shuffle_p
	      |= nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
	  break;
	case SET:
	  shuffle_p |= nvptx_unisimt_handle_set (pat, insn, master);
	  break;
	default:
	  gcc_unreachable ();
	}

      if (shuffle_p && TARGET_PTX_6_0)
	{
	  /* The shuffle is a sync, so uniformity is guaranteed.  */
	}
      else
	{
	  if (TARGET_PTX_6_0)
	    {
	      gcc_assert (!shuffle_p);
	      /* Emit after the insn, to guarantee uniformity.  */
	      emit_insn_after (gen_nvptx_warpsync (), insn);
	    }
	  else
	    {
	      /* Emit after the insn (and before the shuffle, if there are any)
		 to check uniformity.  */
	      emit_insn_after (gen_nvptx_uniform_warp_check (), insn);
	    }
	}

      /* Execute the insn itself only in the master lane.  */
      rtx pred = nvptx_get_unisimt_predicate ();
      predicate_insn (insn, pred);

      /* Predicate the helper insns emitted above (between INSN and the
	 original NEXT) on being outside a SIMT region.  */
      pred = NULL_RTX;
      for (rtx_insn *post = NEXT_INSN (insn); post != next;
	   post = NEXT_INSN (post))
	{
	  if (pred == NULL_RTX)
	    pred = nvptx_get_unisimt_outside_simt_predicate ();
	  predicate_insn (post, pred);
	}
    }
}

/* Offloading function attributes.
*/

struct offload_attrs
{
  unsigned mask;      /* Mask of GOMP_DIM partitioning axes in use.  */
  int num_gangs;
  int num_workers;
  int vector_length;
};

/* Define entries for cfun->machine->axis_dim.  */

#define MACH_VECTOR_LENGTH 0
#define MACH_MAX_WORKERS 1

static void populate_offload_attrs (offload_attrs *oa);

/* Compute and cache this function's vector length and maximum worker
   count in cfun->machine->axis_dim.  */

static void
init_axis_dim (void)
{
  offload_attrs oa;
  int max_workers;

  populate_offload_attrs (&oa);

  /* An unspecified worker count is bounded by how many vectors fit in
     one CTA.  */
  if (oa.num_workers == 0)
    max_workers = PTX_CTA_SIZE / oa.vector_length;
  else
    max_workers = oa.num_workers;

  cfun->machine->axis_dim[MACH_VECTOR_LENGTH] = oa.vector_length;
  cfun->machine->axis_dim[MACH_MAX_WORKERS] = max_workers;
  cfun->machine->axis_dim_init_p = true;
}

static int ATTRIBUTE_UNUSED
nvptx_mach_max_workers ()
{
  if (!cfun->machine->axis_dim_init_p)
    init_axis_dim ();
  return cfun->machine->axis_dim[MACH_MAX_WORKERS];
}

static int ATTRIBUTE_UNUSED
nvptx_mach_vector_length ()
{
  if (!cfun->machine->axis_dim_init_p)
    init_axis_dim ();
  return cfun->machine->axis_dim[MACH_VECTOR_LENGTH];
}

/* Loop structure of the function.  The entire function is described as
   a NULL loop.  */
/* See also 'gcc/omp-oacc-neuter-broadcast.cc:struct parallel_g'.  */

struct parallel
{
  /* Parent parallel.  */
  parallel *parent;

  /* Next sibling parallel.  */
  parallel *next;

  /* First child parallel.  */
  parallel *inner;

  /* Partitioning mask of the parallel.  */
  unsigned mask;

  /* Partitioning used within inner parallels.  */
  unsigned inner_mask;

  /* Location of parallel forked and join.  The forked is the first
     block in the parallel and the join is the first block after of
     the partition.  */
  basic_block forked_block;
  basic_block join_block;

  rtx_insn *forked_insn;
  rtx_insn *join_insn;

  rtx_insn *fork_insn;
  rtx_insn *joining_insn;

  /* Basic blocks in this parallel, but not in child parallels.  The
     FORKED and JOINING blocks are in the partition.  The FORK and JOIN
     blocks are not.  */
  auto_vec<basic_block> blocks;

public:
  parallel (parallel *parent, unsigned mode);
  ~parallel ();
};

/* Constructor links the new parallel into its parent's chain of
   children.
 */

parallel::parallel (parallel *parent_, unsigned mask_)
  :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
{
  forked_block = join_block = 0;
  forked_insn = join_insn = 0;
  fork_insn = joining_insn = 0;

  if (parent)
    {
      /* Push onto the head of the parent's child list.  */
      next = parent->inner;
      parent->inner = this;
    }
}

/* Destructor recursively frees the whole tree: first the child
   subtree, then the remaining siblings.  */

parallel::~parallel ()
{
  delete inner;
  delete next;
}

/* Map of basic blocks to insns */
typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;

/* A tuple of an insn of interest and the BB in which it resides.  */
typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
typedef auto_vec<insn_bb_t> insn_bb_vec_t;

/* Split basic blocks such that each forked and join unspecs are at
   the start of their basic blocks.  Thus afterwards each block will
   have a single partitioning mode.  We also do the same for return
   insns, as they are executed by every thread.  Return the
   partitioning mode of the function as a whole.  Populate MAP with
   head and tail blocks.  We also clear the BB visited flag, which is
   used when finding partitions.  */
/* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_split_blocks'.  */

static void
nvptx_split_blocks (bb_insn_map_t *map)
{
  insn_bb_vec_t worklist;
  basic_block block;
  rtx_insn *insn;

  /* Locate all the reorg instructions of interest.  */
  FOR_ALL_BB_FN (block, cfun)
    {
      bool seen_insn = false;

      /* Clear visited flag, for use by parallel locator  */
      block->flags &= ~BB_VISITED;

      FOR_BB_INSNS (block, insn)
	{
	  if (!INSN_P (insn))
	    continue;
	  switch (recog_memoized (insn))
	    {
	    default:
	      /* An ordinary insn: anything after it is no longer at
		 the start of its block.  */
	      seen_insn = true;
	      continue;
	    case CODE_FOR_nvptx_forked:
	    case CODE_FOR_nvptx_join:
	      break;

	    case CODE_FOR_return:
	      /* We also need to split just before return insns, as
		 that insn needs executing by all threads, but the
		 block it is in probably does not.  */
	      break;
	    }

	  if (seen_insn)
	    /* We've found an instruction that must be at the start of
	       a block, but isn't.  Add it to the worklist.  */
	    worklist.safe_push (insn_bb_t (insn, block));
	  else
	    /* It was already the first instruction.  Just add it to
	       the map.  */
	    map->get_or_insert (block) = insn;
	  seen_insn = true;
	}
    }

  /* Split blocks on the worklist.  */
  unsigned ix;
  insn_bb_t *elt;
  basic_block remap = 0;
  for (ix = 0; worklist.iterate (ix, &elt); ix++)
    {
      if (remap != elt->second)
	{
	  block = elt->second;
	  remap = block;
	}

      /* Split block before insn.  The insn is in the new block.  */
      edge e = split_block (block, PREV_INSN (elt->first));

      block = e->dest;
      map->get_or_insert (block) = elt->first;
    }
}

/* Return true if MASK contains parallelism that requires shared
   memory to broadcast.  */

static bool
nvptx_needs_shared_bcast (unsigned mask)
{
  bool worker = mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
  bool large_vector = (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
    && nvptx_mach_vector_length () != PTX_WARP_SIZE;

  return worker || large_vector;
}

/* BLOCK is a basic block containing a head or tail instruction.
   Locate the associated prehead or pretail instruction, which must be
   in the single predecessor block.  */

static rtx_insn *
nvptx_discover_pre (basic_block block, int expected)
{
  gcc_assert (block->preds->length () == 1);
  basic_block pre_block = (*block->preds)[0]->src;
  rtx_insn *pre_insn;

  /* Scan backwards from the end of the predecessor for the last real
     insn; it must exist before the block head is reached.  */
  for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
       pre_insn = PREV_INSN (pre_insn))
    gcc_assert (pre_insn != BB_HEAD (pre_block));

  gcc_assert (recog_memoized (pre_insn) == expected);
  return pre_insn;
}

/* Dump this parallel and all its inner parallels.  */
/* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_dump_pars'.  */

static void
nvptx_dump_pars (parallel *par, unsigned depth)
{
  fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
	   depth, par->mask,
	   par->forked_block ? par->forked_block->index : -1,
	   par->join_block ? par->join_block->index : -1);

  fprintf (dump_file, " blocks:");

  basic_block block;
  for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
    fprintf (dump_file, " %d", block->index);
  fprintf (dump_file, "\n");
  if (par->inner)
    nvptx_dump_pars (par->inner, depth + 1);

  if (par->next)
    nvptx_dump_pars (par->next, depth);
}

/* If BLOCK contains a fork/join marker, process it to create or
   terminate a loop structure.  Add this block to the current loop,
   and then walk successor blocks.  */
/* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_find_par'.  */

static parallel *
nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
{
  if (block->flags & BB_VISITED)
    return par;
  block->flags |= BB_VISITED;

  if (rtx_insn **endp = map->get (block))
    {
      rtx_insn *end = *endp;

      /* This is a block head or tail, or return instruction.  */
      switch (recog_memoized (end))
	{
	case CODE_FOR_return:
	  /* Return instructions are in their own block, and we
	     don't need to do anything more.  */
	  return par;

	case CODE_FOR_nvptx_forked:
	  /* Loop head, create a new inner loop and add it into
	     our parent's child list.  */
	  {
	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));

	    gcc_assert (mask);
	    par = new parallel (par, mask);
	    par->forked_block = block;
	    par->forked_insn = end;
	    if (nvptx_needs_shared_bcast (mask))
	      par->fork_insn
		= nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
	  }
	  break;

	case CODE_FOR_nvptx_join:
	  /* A loop tail.  Finish the current loop and return to
	     parent.  */
	  {
	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));

	    gcc_assert (par->mask == mask);
	    gcc_assert (par->join_block == NULL);
	    par->join_block = block;
	    par->join_insn = end;
	    if (nvptx_needs_shared_bcast (mask))
	      par->joining_insn
		= nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
	    par = par->parent;
	  }
	  break;

	default:
	  gcc_unreachable ();
	}
    }

  if (par)
    /* Add this block onto the current loop's list of blocks.  */
    par->blocks.safe_push (block);
  else
    /* This must be the entry block.  Create a NULL parallel.  */
    par = new parallel (0, 0);

  /* Walk successor blocks.  */
  edge e;
  edge_iterator ei;

  FOR_EACH_EDGE (e, ei, block->succs)
    nvptx_find_par (map, par, e->dest);

  return par;
}

/* DFS walk the CFG looking for fork & join markers.  Construct
   loop structures as we go.  MAP is a mapping of basic blocks
   to head & tail markers, discovered when splitting blocks.  This
   speeds up the discovery.  We rely on the BB visited flag having
   been cleared when splitting blocks.  */
/* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_discover_pars'.  */

static parallel *
nvptx_discover_pars (bb_insn_map_t *map)
{
  basic_block block;

  /* Mark exit blocks as visited.  */
  block = EXIT_BLOCK_PTR_FOR_FN (cfun);
  block->flags |= BB_VISITED;

  /* And entry block as not.  */
  block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
  block->flags &= ~BB_VISITED;

  parallel *par = nvptx_find_par (map, 0, block);

  if (dump_file)
    {
      fprintf (dump_file, "\nLoops\n");
      nvptx_dump_pars (par, 0);
      fprintf (dump_file, "\n");
    }

  return par;
}

/* Analyse a group of BBs within a partitioned region and create N
   Single-Entry-Single-Exit regions.  Some of those regions will be
   trivial ones consisting of a single BB.  The blocks of a
   partitioned region might form a set of disjoint graphs -- because
   the region encloses a differently partitioned sub region.

   We use the linear time algorithm described in 'Finding Regions Fast:
   Single Entry Single Exit and control Regions in Linear Time'
   Johnson, Pearson & Pingali.  That algorithm deals with complete
   CFGs, where a back edge is inserted from END to START, and thus the
   problem becomes one of finding equivalent loops.

   In this case we have a partial CFG.  We complete it by redirecting
   any incoming edge to the graph to be from an arbitrary external BB,
   and similarly redirecting any outgoing edge to be to that BB.
   Thus we end up with a closed graph.

   The algorithm works by building a spanning tree of an undirected
   graph and keeping track of back edges from nodes further from the
   root in the tree to nodes nearer to the root in the tree.  In the
   description below, the root is up and the tree grows downwards.

   We avoid having to deal with degenerate back-edges to the same
   block, by splitting each BB into 3 -- one for input edges, one for
   the node itself and one for the output edges.  Such back edges are
   referred to as 'Brackets'.  Cycle equivalent nodes will have the
   same set of brackets.

   Determining bracket equivalency is done by maintaining a list of
   brackets in such a manner that the list length and final bracket
   uniquely identify the set.

   We use coloring to mark all BBs with cycle equivalency with the
   same color.  This is the output of the 'Finding Regions Fast'
   algorithm.  Notice it doesn't actually find the set of nodes within
   a particular region, just unordered sets of nodes that are the
   entries and exits of SESE regions.

   After determining cycle equivalency, we need to find the minimal
   set of SESE regions.  Do this with a DFS coloring walk of the
   complete graph.  We're either 'looking' or 'coloring'.  When
   looking, and we're in the subgraph, we start coloring the color of
   the current node, and remember that node as the start of the
   current color's SESE region.  Every time we go to a new node, we
   decrement the count of nodes with that color.  If it reaches zero,
   we remember that node as the end of the current color's SESE region
   and return to 'looking'.  Otherwise we color the node the current
   color.

   This way we end up with coloring the inside of non-trivial SESE
   regions with the color of that region.  */

/* A pair of BBs.  We use this to represent SESE regions.  */
typedef std::pair<basic_block, basic_block> bb_pair_t;
typedef auto_vec<bb_pair_t> bb_pair_vec_t;

/* A node in the undirected CFG.  The discriminator SECOND indicates just
   above or just below the BB indicated by FIRST.  */
typedef std::pair<basic_block, int> pseudo_node_t;

/* A bracket indicates an edge towards the root of the spanning tree of the
   undirected graph.  Each bracket has a color, determined
   from the current set of brackets.  */
struct bracket
{
  pseudo_node_t back;  /* Back target */

  /* Current color and size of set.  */
  unsigned color;
  unsigned size;

  bracket (pseudo_node_t back_)
  : back (back_), color (~0u), size (~0u)
  {
  }

  /* Return this bracket's color for a bracket list of LENGTH,
     allocating a fresh color (and zero count) if the list length has
     changed since the color was last assigned.  Increments the count
     of nodes with the resulting color.  */
  unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
  {
    if (length != size)
      {
	size = length;
	color = color_counts.length ();
	color_counts.quick_push (0);
      }
    color_counts[color]++;
    return color;
  }
};

typedef auto_vec<bracket> bracket_vec_t;

/* Basic block info for finding SESE regions.  */

struct bb_sese
{
  int node;  /* Node number in spanning tree.  */
  int parent;  /* Parent node number.  */

  /* The algorithm splits each node A into Ai, A', Ao.  The incoming
     edges arrive at pseudo-node Ai and the outgoing edges leave at
     pseudo-node Ao.  We have to remember which way we arrived at a
     particular node when generating the spanning tree.  dir > 0 means
     we arrived at Ai, dir < 0 means we arrived at Ao.  */
  int dir;

  /* Lowest numbered pseudo-node reached via a backedge from this
     node, or any descendant.  */
  pseudo_node_t high;

  int color;  /* Cycle-equivalence color  */

  /* Stack of brackets for this node.  */
  bracket_vec_t brackets;

  bb_sese (unsigned node_, unsigned p, int dir_)
  :node (node_), parent (p), dir (dir_)
  {
  }
  ~bb_sese ();

  /* Push a bracket ending at BACK.  */
  void push (const pseudo_node_t &back)
  {
    if (dump_file)
      fprintf (dump_file, "Pushing backedge %d:%+d\n",
	       back.first ? back.first->index : 0, back.second);
    brackets.safe_push (bracket (back));
  }

  void append (bb_sese *child);
  void remove (const pseudo_node_t &);

  /* Set node's color.  */
  void set_color (auto_vec<unsigned> &color_counts)
  {
    color = brackets.last ().get_color (color_counts, brackets.length ());
  }
};

bb_sese::~bb_sese ()
{
}

/* Destructively append CHILD's brackets.
 */

void
bb_sese::append (bb_sese *child)
{
  if (int len = child->brackets.length ())
    {
      int ix;

      if (dump_file)
	{
	  for (ix = 0; ix < len; ix++)
	    {
	      const pseudo_node_t &pseudo = child->brackets[ix].back;
	      fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
		       child->node, pseudo.first ? pseudo.first->index : 0,
		       pseudo.second);
	    }
	}
      if (!brackets.length ())
	/* Cheap case: we have no brackets of our own, so just steal
	   the child's vector wholesale.  */
	std::swap (brackets, child->brackets);
      else
	{
	  brackets.reserve (len);
	  for (ix = 0; ix < len; ix++)
	    brackets.quick_push (child->brackets[ix]);
	}
    }
}

/* Remove brackets that terminate at PSEUDO.  */

void
bb_sese::remove (const pseudo_node_t &pseudo)
{
  unsigned removed = 0;
  int len = brackets.length ();

  /* Compact surviving entries down over removed ones in a single
     pass, then pop the now-unused tail slots.  */
  for (int ix = 0; ix < len; ix++)
    {
      if (brackets[ix].back == pseudo)
	{
	  if (dump_file)
	    fprintf (dump_file, "Removing backedge %d:%+d\n",
		     pseudo.first ? pseudo.first->index : 0, pseudo.second);
	  removed++;
	}
      else if (removed)
	brackets[ix-removed] = brackets[ix];
    }
  while (removed--)
    brackets.pop ();
}

/* Accessors for BB's aux pointer.  */
#define BB_SET_SESE(B, S) ((B)->aux = (S))
#define BB_GET_SESE(B) ((bb_sese *)(B)->aux)

/* DFS walk creating SESE data structures.  Only cover nodes with
   BB_VISITED set.  Append discovered blocks to LIST.  We number in
   increments of 3 so that the above and below pseudo nodes can be
   implicitly numbered too.  */

static int
nvptx_sese_number (int n, int p, int dir, basic_block b,
		   auto_vec<basic_block> *list)
{
  if (BB_GET_SESE (b))
    return n;

  if (dump_file)
    fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
	     b->index, n, p, dir);

  BB_SET_SESE (b, new bb_sese (n, p, dir));
  p = n;

  n += 3;
  list->quick_push (b);

  /* First walk the nodes on the 'other side' of this node, then walk
     the nodes on the same side.  */
  for (unsigned ix = 2; ix; ix--)
    {
      vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
      size_t offset = (dir > 0 ? offsetof (edge_def, dest)
		       : offsetof (edge_def, src));
      edge e;
      edge_iterator ei;

      FOR_EACH_EDGE (e, ei, edges)
	{
	  basic_block target = *(basic_block *)((char *)e + offset);

	  if (target->flags & BB_VISITED)
	    n = nvptx_sese_number (n, p, dir, target, list);
	}
      dir = -dir;
    }
  return n;
}

/* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
   EDGES are the outgoing edges and OFFSET is the offset to the src
   or dst block on the edges.
 */

static void
nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
		   vec<edge, va_gc> *edges, size_t offset)
{
  edge e;
  edge_iterator ei;
  int hi_back = depth;
  pseudo_node_t node_back (nullptr, depth);
  int hi_child = depth;
  pseudo_node_t node_child (nullptr, depth);
  basic_block child = NULL;
  unsigned num_children = 0;
  /* Orientation of the edge set relative to this node's spanning-tree
     arrival direction.  */
  int usd = -dir * sese->dir;

  if (dump_file)
    fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
	     me->index, sese->node, dir);

  if (dir < 0)
    {
      /* This is the above pseudo-child.  It has the BB itself as an
	 additional child node.  */
      node_child = sese->high;
      hi_child = node_child.second;
      if (node_child.first)
	hi_child += BB_GET_SESE (node_child.first)->node;
      num_children++;
    }

  /* Examine each edge.
     - if it is a child (a) append its bracket list and (b) record
       whether it is the child with the highest reaching bracket.
     - if it is an edge to ancestor, record whether it's the highest
       reaching backlink.  */
  FOR_EACH_EDGE (e, ei, edges)
    {
      basic_block target = *(basic_block *)((char *)e + offset);

      if (bb_sese *t_sese = BB_GET_SESE (target))
	{
	  if (t_sese->parent == sese->node && !(t_sese->dir + usd))
	    {
	      /* Child node.  Append its bracket list.  */
	      num_children++;
	      sese->append (t_sese);

	      /* Compare its hi value.  */
	      int t_hi = t_sese->high.second;

	      if (basic_block child_hi_block = t_sese->high.first)
		t_hi += BB_GET_SESE (child_hi_block)->node;

	      if (hi_child > t_hi)
		{
		  hi_child = t_hi;
		  node_child = t_sese->high;
		  child = target;
		}
	    }
	  else if (t_sese->node < sese->node + dir
		   && !(dir < 0 && sese->parent == t_sese->node))
	    {
	      /* Non-parental ancestor node -- a backlink.  */
	      int d = usd * t_sese->dir;
	      int back = t_sese->node + d;

	      if (hi_back > back)
		{
		  hi_back = back;
		  node_back = pseudo_node_t (target, d);
		}
	    }
	}
      else
	{ /* Fallen off graph, backlink to entry node.  */
	  hi_back = 0;
	  node_back = pseudo_node_t (nullptr, 0);
	}
    }

  /* Remove any brackets that terminate at this pseudo node.  */
  sese->remove (pseudo_node_t (me, dir));

  /* Now push any backlinks from this pseudo node.  */
  FOR_EACH_EDGE (e, ei, edges)
    {
      basic_block target = *(basic_block *)((char *)e + offset);
      if (bb_sese *t_sese = BB_GET_SESE (target))
	{
	  if (t_sese->node < sese->node + dir
	      && !(dir < 0 && sese->parent == t_sese->node))
	    /* Non-parental ancestor node - backedge from me.  */
	    sese->push (pseudo_node_t (target, usd * t_sese->dir));
	}
      else
	{
	  /* back edge to entry node */
	  sese->push (pseudo_node_t (nullptr, 0));
	}
    }

  /* If this node leads directly or indirectly to a no-return region of
     the graph, then fake a backedge to entry node.  */
  if (!sese->brackets.length () || !edges || !edges->length ())
    {
      hi_back = 0;
      node_back = pseudo_node_t (nullptr, 0);
      sese->push (node_back);
    }

  /* Record the highest reaching backedge from us or a descendant.  */
  sese->high = hi_back < hi_child ? node_back : node_child;

  if (num_children > 1)
    {
      /* There is more than one child -- this is a Y shaped piece of
	 spanning tree.  We have to insert a fake backedge from this
	 node to the highest ancestor reached by not-the-highest
	 reaching child.  Note that there may be multiple children
	 with backedges to the same highest node.  That's ok and we
	 insert the edge to that highest node.  */
      hi_child = depth;
      if (dir < 0 && child)
	{
	  node_child = sese->high;
	  hi_child = node_child.second;
	  if (node_child.first)
	    hi_child += BB_GET_SESE (node_child.first)->node;
	}

      FOR_EACH_EDGE (e, ei, edges)
	{
	  basic_block target = *(basic_block *)((char *)e + offset);

	  if (target == child)
	    /* Ignore the highest child.  */
	    continue;

	  bb_sese *t_sese = BB_GET_SESE (target);
	  if (!t_sese)
	    continue;
	  if (t_sese->parent != sese->node)
	    /* Not a child.  */
	    continue;

	  /* Compare its hi value.  */
	  int t_hi = t_sese->high.second;

	  if (basic_block child_hi_block = t_sese->high.first)
	    t_hi += BB_GET_SESE (child_hi_block)->node;

	  if (hi_child > t_hi)
	    {
	      hi_child = t_hi;
	      node_child = t_sese->high;
	    }
	}

      sese->push (node_child);
    }
}


/* DFS walk of BB graph.  Color node BLOCK according to COLORING then
   proceed to successors.  Set SESE entry and exit nodes of
   REGIONS.  */

static void
nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
		  basic_block block, int coloring)
{
  bb_sese *sese = BB_GET_SESE (block);

  if (block->flags & BB_VISITED)
    {
      /* If we've already encountered this block, either we must not
	 be coloring, or it must have been colored the current color.  */
      gcc_assert (coloring < 0 || (sese && coloring == sese->color));
      return;
    }

  block->flags |= BB_VISITED;

  if (sese)
    {
      if (coloring < 0)
	{
	  /* Start coloring a region.  */
	  regions[sese->color].first = block;
	  coloring = sese->color;
	}

      if (!--color_counts[sese->color] && sese->color == coloring)
	{
	  /* Found final block of SESE region.  */
	  regions[sese->color].second = block;
	  coloring = -1;
	}
      else
	/* Color the node, so we can assert on revisiting the node
	   that the graph is indeed SESE.  */
	sese->color = coloring;
    }
  else
    /* Fallen off the subgraph, we cannot be coloring.  */
    gcc_assert (coloring < 0);

  /* Walk each successor block.  */
  if (block->succs && block->succs->length ())
    {
      edge e;
      edge_iterator ei;

      FOR_EACH_EDGE (e, ei, block->succs)
	nvptx_sese_color (color_counts, regions, e->dest, coloring);
    }
  else
    gcc_assert (coloring < 0);
}

/* Find minimal set of SESE regions covering BLOCKS.  REGIONS might
   end up with NULL entries in it.  */

static void
nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
{
  basic_block block;
  int ix;

  /* First clear each BB of the whole function.  */
  FOR_ALL_BB_FN (block, cfun)
    {
      block->flags &= ~BB_VISITED;
      BB_SET_SESE (block, 0);
    }

  /* Mark blocks in the function that are in this graph.  */
  for (ix = 0; blocks.iterate (ix, &block); ix++)
    block->flags |= BB_VISITED;

  /* Counts of nodes assigned to each color.  There cannot be more
     colors than blocks (and hopefully there will be fewer).  */
  auto_vec<unsigned> color_counts;
  color_counts.reserve (blocks.length ());

  /* Worklist of nodes in the spanning tree.  Again, there cannot be
     more nodes in the tree than blocks (there will be fewer if the
     CFG of blocks is disjoint).  */
  auto_vec<basic_block> spanlist;
  spanlist.reserve (blocks.length ());

  /* Make sure every block has its cycle class determined.  */
  for (ix = 0; blocks.iterate (ix, &block); ix++)
    {
      if (BB_GET_SESE (block))
	/* We already met this block in an earlier graph solve.  */
	continue;

      if (dump_file)
	fprintf (dump_file, "Searching graph starting at %d\n", block->index);

      /* Number the nodes reachable from block initial DFS order.  */
      int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);

      /* Now walk in reverse DFS order to find cycle equivalents.  */
      while (spanlist.length ())
	{
	  block = spanlist.pop ();
	  bb_sese *sese = BB_GET_SESE (block);

	  /* Do the pseudo node below.  */
	  nvptx_sese_pseudo (block, sese, depth, +1,
			     sese->dir > 0 ? block->succs : block->preds,
			     (sese->dir > 0 ? offsetof (edge_def, dest)
			      : offsetof (edge_def, src)));
	  sese->set_color (color_counts);
	  /* Do the pseudo node above.  */
	  nvptx_sese_pseudo (block, sese, depth, -1,
			     sese->dir < 0 ? block->succs : block->preds,
			     (sese->dir < 0 ? offsetof (edge_def, dest)
			      : offsetof (edge_def, src)));
	}
      if (dump_file)
	fprintf (dump_file, "\n");
    }

  if (dump_file)
    {
      unsigned count;
      const char *comma = "";

      fprintf (dump_file, "Found %d cycle equivalents\n",
	       color_counts.length ());
      for (ix = 0; color_counts.iterate (ix, &count); ix++)
	{
	  fprintf (dump_file, "%s%d[%d]={", comma, ix, count);

	  comma = "";
	  for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
	    if (BB_GET_SESE (block)->color == ix)
	      {
		block->flags |= BB_VISITED;
		fprintf (dump_file, "%s%d", comma, block->index);
		comma=",";
	      }
	  fprintf (dump_file, "}");
	  comma = ", ";
	}
      fprintf (dump_file, "\n");
    }

  /* Now we've colored every block in the subgraph.  We now need to
     determine the minimal set of SESE regions that cover that
     subgraph.  Do this with a DFS walk of the complete function.
     During the walk we're either 'looking' or 'coloring'.  When we
     reach the last node of a particular color, we stop coloring and
     return to looking.  */

  /* There cannot be more SESE regions than colors.  */
  regions.reserve (color_counts.length ());
  for (ix = color_counts.length (); ix--;)
    regions.quick_push (bb_pair_t (0, 0));

  for (ix = 0; blocks.iterate (ix, &block); ix++)
    block->flags &= ~BB_VISITED;

  nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);

  if (dump_file)
    {
      const char *comma = "";
      int len = regions.length ();

      fprintf (dump_file, "SESE regions:");
      for (ix = 0; ix != len; ix++)
	{
	  basic_block from = regions[ix].first;
	  basic_block to = regions[ix].second;

	  if (from)
	    {
	      fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
	      if (to != from)
		fprintf (dump_file, "->%d", to->index);

	      int color = BB_GET_SESE (from)->color;

	      /* Print the blocks within the region (excluding ends).  */
	      FOR_EACH_BB_FN (block, cfun)
		{
		  bb_sese *sese = BB_GET_SESE (block);

		  if (sese && sese->color == color
		      && block != from && block != to)
		    fprintf (dump_file, ".%d", block->index);
		}
	      fprintf (dump_file, "}");
	    }
	  comma = ",";
	}
      fprintf (dump_file, "\n\n");
    }

  /* Release the per-block SESE data hung off the aux pointers.  */
  for (ix = 0; blocks.iterate (ix, &block); ix++)
    delete BB_GET_SESE (block);
}

#undef BB_SET_SESE
#undef BB_GET_SESE

/* Propagate live state at the start of a partitioned region.  IS_CALL
   indicates whether the propagation is for a (partitioned) call
   instruction.
   BLOCK provides the live register information, and
   might not contain INSN.  Propagation is inserted just after INSN.  RW
   indicates whether we are reading and/or writing state.  This
   separation is needed for worker-level propagation where we
   essentially do a spill & fill.  FN is the underlying worker
   function to generate the propagation instructions for single
   register.  DATA is user data.

   Returns true if we didn't emit any instructions.

   We propagate the live register set for non-calls and the entire
   frame for calls and non-calls.  We could do better by (a)
   propagating just the live set that is used within the partitioned
   regions and (b) only propagating stack entries that are used.  The
   latter might be quite hard to determine.  */

typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool);

static bool
nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
		 propagate_mask rw, propagator_fn fn, void *data, bool vector)
{
  bitmap live = DF_LIVE_IN (block);
  bitmap_iterator iterator;
  unsigned ix;
  bool empty = true;

  /* Copy the frame array.  */
  HOST_WIDE_INT fs = get_frame_size ();
  if (fs)
    {
      rtx tmp = gen_reg_rtx (DImode);
      rtx idx = NULL_RTX;
      rtx ptr = gen_reg_rtx (Pmode);
      rtx pred = NULL_RTX;
      rtx_code_label *label = NULL;

      empty = false;
      /* The frame size might not be DImode compatible, but the frame
	 array's declaration will be.  So it's ok to round up here.  */
      fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
      /* Detect single iteration loop.  */
      if (fs == 1)
	fs = 0;

      start_sequence ();
      emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
      if (fs)
	{
	  /* Emit a counted loop copying DImode words of the frame.  */
	  idx = gen_reg_rtx (SImode);
	  pred = gen_reg_rtx (BImode);
	  label = gen_label_rtx ();

	  emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
	  /* Allow worker function to initialize anything needed.  */
	  rtx init = fn (tmp, PM_loop_begin, fs, data, vector);
	  if (init)
	    emit_insn (init);
	  emit_label (label);
	  LABEL_NUSES (label)++;
	  emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
	}
      if (rw & PM_read)
	emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
      emit_insn (fn (tmp, rw, fs, data, vector));
      if (rw & PM_write)
	emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
      if (fs)
	{
	  emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
	  emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
	  emit_insn (gen_br_true_uni (pred, label));
	  rtx fini = fn (tmp, PM_loop_end, fs, data, vector);
	  if (fini)
	    emit_insn (fini);
	  emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
	}
      emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
      emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
      rtx cpy = get_insns ();
      end_sequence ();
      insn = emit_insn_after (cpy, insn);
    }

  if (!is_call)
    /* Copy live registers.  */
    EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
      {
	rtx reg = regno_reg_rtx[ix];

	if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
	  {
	    rtx bcast = fn (reg, rw, 0, data, vector);

	    insn = emit_insn_after (bcast, insn);
	    empty = false;
	  }
      }
  return empty;
}

/* Worker for nvptx_warp_propagate.  */

static rtx
warp_prop_gen (rtx reg, propagate_mask pm,
	       unsigned ARG_UNUSED (count), void *ARG_UNUSED (data),
	       bool ARG_UNUSED (vector))
{
  if (!(pm & PM_read_write))
    return 0;

  return nvptx_gen_warp_bcast (reg);
}

/* Propagate state that is live at start of BLOCK across the vectors
   of a single warp.  Propagation is inserted just after INSN.
   IS_CALL and return as for nvptx_propagate.  */

static bool
nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn)
{
  return nvptx_propagate (is_call, block, insn, PM_read_write,
			  warp_prop_gen, 0, false);
}

/* Worker for nvptx_shared_propagate.  */

static rtx
shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_,
		 bool vector)
{
  broadcast_data_t *data = (broadcast_data_t *)data_;

  if (pm & PM_loop_begin)
    {
      /* Starting a loop, initialize pointer.
	 */
      unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;

      oacc_bcast_align = MAX (oacc_bcast_align, align);
      data->offset = ROUND_UP (data->offset, align);

      data->ptr = gen_reg_rtx (Pmode);

      return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
    }
  else if (pm & PM_loop_end)
    {
      rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
      data->ptr = NULL_RTX;
      return clobber;
    }
  else
    return nvptx_gen_shared_bcast (reg, pm, rep, data, vector);
}

/* Spill or fill live state that is live at start of BLOCK.  PRE_P
   indicates if this is just before partitioned mode (do spill), or
   just after it starts (do fill).  Sequence is inserted just after
   INSN.  IS_CALL and return as for nvptx_propagate.  */

static bool
nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
			rtx_insn *insn, bool vector)
{
  broadcast_data_t data;

  data.base = gen_reg_rtx (Pmode);
  data.offset = 0;
  data.ptr = NULL_RTX;

  bool empty = nvptx_propagate (is_call, block, insn,
				pre_p ? PM_read : PM_write, shared_prop_gen,
				&data, vector);
  /* Propagation emitted something iff the buffer offset advanced.  */
  gcc_assert (empty == !data.offset);
  if (data.offset)
    {
      rtx bcast_sym = oacc_bcast_sym;

      /* Stuff was emitted, initialize the base pointer now.  */
      if (vector && nvptx_mach_max_workers () > 1)
	{
	  if (!cfun->machine->bcast_partition)
	    {
	      /* It would be nice to place this register in
		 DATA_AREA_SHARED.  */
	      cfun->machine->bcast_partition = gen_reg_rtx (DImode);
	    }
	  if (!cfun->machine->sync_bar)
	    cfun->machine->sync_bar = gen_reg_rtx (SImode);

	  bcast_sym = cfun->machine->bcast_partition;
	}

      rtx init = gen_rtx_SET (data.base, bcast_sym);
      emit_insn_after (init, insn);

      unsigned int psize = ROUND_UP (data.offset, oacc_bcast_align);
      unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
			   ? nvptx_mach_max_workers () + 1
			   : 1);

      oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
      oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
    }
  return empty;
}

/* Emit a CTA-level synchronization barrier.  LOCK is the barrier number,
   which is an integer or a register.  THREADS is the number of threads
   controlled by the barrier.  */

static rtx
nvptx_cta_sync (rtx lock, int threads)
{
  return gen_nvptx_barsync (lock, GEN_INT (threads));
}

#if WORKAROUND_PTXJIT_BUG
/* Return first real insn in BB, or return NULL_RTX if BB does not contain
   real insns.  */

static rtx_insn *
bb_first_real_insn (basic_block bb)
{
  rtx_insn *insn;

  /* Find first insn of from block.  */
  FOR_BB_INSNS (bb, insn)
    if (INSN_P (insn))
      return insn;

  return 0;
}
#endif

/* Return true if INSN needs neutering.
 */

static bool
needs_neutering_p (rtx_insn *insn)
{
  if (!INSN_P (insn))
    return false;

  /* Partitioning markers and barriers must execute in all threads,
     so they are never neutered.  */
  switch (recog_memoized (insn))
    {
    case CODE_FOR_nvptx_fork:
    case CODE_FOR_nvptx_forked:
    case CODE_FOR_nvptx_joining:
    case CODE_FOR_nvptx_join:
    case CODE_FOR_nvptx_barsync:
      return false;
    default:
      return true;
    }
}

/* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM.  */

static bool
verify_neutering_jumps (basic_block from,
			rtx_insn *vector_jump, rtx_insn *worker_jump,
			rtx_insn *vector_label, rtx_insn *worker_label)
{
  basic_block bb = from;
  rtx_insn *insn = BB_HEAD (bb);
  bool seen_worker_jump = false;
  bool seen_vector_jump = false;
  bool seen_worker_label = false;
  bool seen_vector_label = false;
  bool worker_neutered = false;
  bool vector_neutered = false;
  while (true)
    {
      if (insn == worker_jump)
	{
	  seen_worker_jump = true;
	  worker_neutered = true;
	  gcc_assert (!vector_neutered);
	}
      else if (insn == vector_jump)
	{
	  seen_vector_jump = true;
	  vector_neutered = true;
	}
      else if (insn == worker_label)
	{
	  seen_worker_label = true;
	  gcc_assert (worker_neutered);
	  worker_neutered = false;
	}
      else if (insn == vector_label)
	{
	  seen_vector_label = true;
	  gcc_assert (vector_neutered);
	  vector_neutered = false;
	}
      else if (INSN_P (insn))
	switch (recog_memoized (insn))
	  {
	  case CODE_FOR_nvptx_barsync:
	    /* A barrier must not sit inside a neutered range.  */
	    gcc_assert (!vector_neutered && !worker_neutered);
	    break;
	  default:
	    break;
	  }

      if (insn != BB_END (bb))
	insn = NEXT_INSN (insn);
      else if (JUMP_P (insn) && single_succ_p (bb)
	       && !seen_vector_jump && !seen_worker_jump)
	{
	  bb = single_succ (bb);
	  insn = BB_HEAD (bb);
	}
      else
	break;
    }

  gcc_assert (!(vector_jump && !seen_vector_jump));
  gcc_assert (!(worker_jump && !seen_worker_jump));

  if (seen_vector_label || seen_worker_label)
    {
      gcc_assert (!(vector_label && !seen_vector_label));
      gcc_assert (!(worker_label && !seen_worker_label));

      return true;
    }

  return false;
}

/* Verify position of VECTOR_LABEL and WORKER_LABEL in TO.  */

static void
verify_neutering_labels (basic_block to, rtx_insn *vector_label,
			 rtx_insn *worker_label)
{
  basic_block bb = to;
  rtx_insn *insn = BB_END (bb);
  bool seen_worker_label = false;
  bool seen_vector_label = false;
  /* Walk backwards from the end of TO.  */
  while (true)
    {
      if (insn == worker_label)
	{
	  seen_worker_label = true;
	  gcc_assert (!seen_vector_label);
	}
      else if (insn == vector_label)
	seen_vector_label = true;
      else if (INSN_P (insn))
	switch (recog_memoized (insn))
	  {
	  case CODE_FOR_nvptx_barsync:
	    gcc_assert (!seen_vector_label && !seen_worker_label);
	    break;
	  }

      if (insn != BB_HEAD (bb))
	insn = PREV_INSN (insn);
      else
	break;
    }

  gcc_assert (!(vector_label && !seen_vector_label));
  gcc_assert (!(worker_label && !seen_worker_label));
}

/* Single neutering according to MASK.  FROM is the incoming block and
   TO is the outgoing block.  These may be the same block.  Insert at
   start of FROM:

     if (tid.<axis>) goto end.

   and insert before ending branch of TO (if there is such an insn):

     end:
     <possibly-broadcast-cond>
     <branch>

   We currently only use different FROM and TO when skipping an entire
   loop.  We could do more if we detected superblocks.
 */

static void
nvptx_single (unsigned mask, basic_block from, basic_block to)
{
  rtx_insn *head = BB_HEAD (from);
  rtx_insn *tail = BB_END (to);
  unsigned skip_mask = mask;

  while (true)
    {
      /* Find first insn of from block.  */
      while (head != BB_END (from) && !needs_neutering_p (head))
	head = NEXT_INSN (head);

      if (from == to)
	break;

      if (!(JUMP_P (head) && single_succ_p (from)))
	break;

      basic_block jump_target = single_succ (from);
      if (!single_pred_p (jump_target))
	break;

      from = jump_target;
      head = BB_HEAD (from);
    }

  /* Find last insn of to block */
  rtx_insn *limit = from == to ? head : BB_HEAD (to);
  while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
    tail = PREV_INSN (tail);

  /* Detect if tail is a branch.  */
  rtx tail_branch = NULL_RTX;
  rtx cond_branch = NULL_RTX;
  if (tail && INSN_P (tail))
    {
      tail_branch = PATTERN (tail);
      if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
	tail_branch = NULL_RTX;
      else
	{
	  cond_branch = SET_SRC (tail_branch);
	  if (GET_CODE (cond_branch) != IF_THEN_ELSE)
	    cond_branch = NULL_RTX;
	}
    }

  if (tail == head)
    {
      /* If this is empty, do nothing.  */
      if (!head || !needs_neutering_p (head))
	return;

      if (cond_branch)
	{
	  /* If we're only doing vector single, there's no need to
	     emit skip code because we'll not insert anything.  */
	  if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
	    skip_mask = 0;
	}
      else if (tail_branch)
	/* Block with only unconditional branch.  Nothing to do.  */
	return;
    }

  /* Insert the vector test inside the worker test.  */
  unsigned mode;
  rtx_insn *before = tail;
  rtx_insn *neuter_start = NULL;
  rtx_insn *worker_label = NULL, *vector_label = NULL;
  rtx_insn *worker_jump = NULL, *vector_jump = NULL;
  rtx_insn *warp_sync = NULL;
  for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
    if (GOMP_DIM_MASK (mode) & skip_mask)
      {
	rtx_code_label *label = gen_label_rtx ();
	rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
	rtx_insn **mode_jump
	  = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
	rtx_insn **mode_label
	  = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;

	if (!pred)
	  {
	    pred = gen_reg_rtx (BImode);
	    cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
	  }

	rtx br;
	if (mode == GOMP_DIM_VECTOR)
	  br = gen_br_true (pred, label);
	else
	  br = gen_br_true_uni (pred, label);
	if (neuter_start)
	  neuter_start = emit_insn_after (br, neuter_start);
	else
	  neuter_start = emit_insn_before (br, head);
	*mode_jump = neuter_start;

	LABEL_NUSES (label)++;
	rtx_insn *label_insn;
	if (tail_branch)
	  {
	    label_insn = emit_label_before (label, before);
	    if (mode == GOMP_DIM_VECTOR)
	      {
		if (TARGET_PTX_6_0)
		  warp_sync = emit_insn_after (gen_nvptx_warpsync (),
					       label_insn);
		else
		  warp_sync = emit_insn_after (gen_nvptx_uniform_warp_check (),
					       label_insn);
	      }
	    before = label_insn;
	  }
	else
	  {
	    label_insn = emit_label_after (label, tail);
	    if (mode == GOMP_DIM_VECTOR)
	      {
		if (TARGET_PTX_6_0)
		  warp_sync = emit_insn_after (gen_nvptx_warpsync (),
					       label_insn);
		else
		  warp_sync = emit_insn_after (gen_nvptx_uniform_warp_check (),
					       label_insn);
	      }
	    if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
		&& CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
	      emit_insn_after (gen_exit (), label_insn);
	  }

	*mode_label = label_insn;
      }

  /* Now deal with propagating the branch condition.  */
  if (cond_branch)
    {
      rtx pvar = XEXP (XEXP (cond_branch, 0), 0);

      if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask
	  && nvptx_mach_vector_length () == PTX_WARP_SIZE)
	{
	  /* Vector mode only, do a shuffle.  */
#if WORKAROUND_PTXJIT_BUG
	  /* The branch condition %rcond is propagated like this:

		{
		    .reg .u32 %x;
		    mov.u32 %x,%tid.x;
		    setp.ne.u32 %rnotvzero,%x,0;
		}

		@%rnotvzero bra Lskip;
		setp.<op>.<type> %rcond,op1,op2;
		Lskip:
		selp.u32 %rcondu32,1,0,%rcond;
		shfl.idx.b32 %rcondu32,%rcondu32,0,31;
		setp.ne.u32 %rcond,%rcondu32,0;

	     There seems to be a bug in the ptx JIT compiler (observed at driver
	     version 381.22, at -O1 and higher for sm_61), that drops the shfl
	     unless %rcond is initialized to something before 'bra Lskip'.  The
	     bug is not observed with ptxas from cuda 8.0.61.

	     It is true that the code is non-trivial: at Lskip, %rcond is
	     uninitialized in threads 1-31, and after the selp the same holds
	     for %rcondu32.  But shfl propagates the defined value in thread 0
	     to threads 1-31, so after the shfl %rcondu32 is defined in threads
	     0-31, and after the setp.ne %rcond is defined in threads 0-31.

	     There is nothing in the PTX spec to suggest that this is wrong, or
	     to explain why the extra initialization is needed.  So, we classify
	     it as a JIT bug, and the extra initialization as workaround:

		{
		    .reg .u32 %x;
		    mov.u32 %x,%tid.x;
		    setp.ne.u32 %rnotvzero,%x,0;
		}

		+.reg .pred %rcond2;
		+setp.eq.u32 %rcond2, 1, 0;

		@%rnotvzero bra Lskip;
		setp.<op>.<type> %rcond,op1,op2;
		+mov.pred %rcond2, %rcond;
		Lskip:
		+mov.pred %rcond, %rcond2;
		selp.u32 %rcondu32,1,0,%rcond;
		shfl.idx.b32 %rcondu32,%rcondu32,0,31;
		setp.ne.u32 %rcond,%rcondu32,0;
	  */
	  rtx_insn *label = PREV_INSN (tail);
	  if (label == warp_sync)
	    label = PREV_INSN (label);
	  gcc_assert (label && LABEL_P (label));
	  rtx tmp = gen_reg_rtx (BImode);
	  emit_insn_before (gen_movbi (tmp, const0_rtx),
			    bb_first_real_insn (from));
	  emit_insn_before (gen_rtx_SET (tmp, pvar), label);
	  emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
#endif
	  emit_insn_before (nvptx_gen_warp_bcast (pvar), tail);
	}
      else
	{
	  /* Includes worker mode, do spill & fill.  By construction
	     we should never have worker mode only.  */
	  broadcast_data_t data;
	  unsigned size = GET_MODE_SIZE (SImode);
	  bool vector = (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) != 0;
	  bool worker = (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask) != 0;
	  rtx barrier = GEN_INT (0);
	  int threads = 0;

	  data.base = oacc_bcast_sym;
	  data.ptr = 0;

	  bool use_partitioning_p = (vector && !worker
				     && nvptx_mach_max_workers () > 1
				     && cfun->machine->bcast_partition);
	  if (use_partitioning_p)
	    {
	      data.base = cfun->machine->bcast_partition;
	      barrier = cfun->machine->sync_bar;
	      threads = nvptx_mach_vector_length ();
	    }
	  gcc_assert (data.base != NULL);
	  gcc_assert (barrier);

	  unsigned int psize = ROUND_UP (size, oacc_bcast_align);
	  unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
			       ? nvptx_mach_max_workers () + 1
			       : 1);

	  oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
	  oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);

	  data.offset = 0;
	  emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
						    vector),
			    before);

	  /* Barrier so other workers can see the write.  */
	  emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
	  data.offset = 0;
	  emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
						    vector),
			    tail);
	  /* This barrier is needed to avoid worker zero clobbering
	     the broadcast buffer before all the other workers have
	     had a chance to read this instance of it.  */
	  emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
	}

      extract_insn (tail);
      rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
				 UNSPEC_BR_UNIFIED);
      validate_change (tail, recog_data.operand_loc[0], unsp, false);
    }

  bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump,
					    vector_label, worker_label);
  if (!seen_label)
    verify_neutering_labels (to, vector_label, worker_label);
}

/* PAR is a parallel that is being skipped in its entirety according to
   MASK.  Treat this as skipping a superblock starting at forked
   and ending at joining.  */

static void
nvptx_skip_par (unsigned mask, parallel *par)
{
  basic_block tail = par->join_block;
  gcc_assert (tail->preds->length () == 1);

  basic_block pre_tail = (*tail->preds)[0]->src;
  gcc_assert (pre_tail->succs->length () == 1);

  nvptx_single (mask, par->forked_block, pre_tail);
}

/* If PAR has a single inner parallel and PAR itself only contains
   empty entry and exit blocks, swallow the inner PAR.  */

static void
nvptx_optimize_inner (parallel *par)
{
  parallel *inner = par->inner;

  /* We mustn't be the outer dummy par.  */
  if (!par->mask)
    return;

  /* We must have a single inner par.  */
  if (!inner || inner->next)
    return;

  /* We must only contain 2 blocks ourselves -- the head and tail of
     the inner par.  */
  if (par->blocks.length () != 2)
    return;

  /* We must be disjoint partitioning.  As we only have vector and
     worker partitioning, this is sufficient to guarantee the pars
     have adjacent partitioning.  */
  if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
    /* This indicates malformed code generation.  */
    return;

  /* The outer forked insn should be immediately followed by the inner
     fork insn.  */
  rtx_insn *forked = par->forked_insn;
  rtx_insn *fork = BB_END (par->forked_block);

  if (NEXT_INSN (forked) != fork)
    return;
  gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);

  /* The outer joining insn must immediately follow the inner join
     insn.  */
  rtx_insn *joining = par->joining_insn;
  rtx_insn *join = inner->join_insn;
  if (NEXT_INSN (join) != joining)
    return;

  /* Preconditions met.  Swallow the inner par.  */
  if (dump_file)
    fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
	     inner->mask, inner->forked_block->index,
	     inner->join_block->index,
	     par->mask, par->forked_block->index, par->join_block->index);

  par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);

  par->blocks.reserve (inner->blocks.length ());
  while (inner->blocks.length ())
    par->blocks.quick_push (inner->blocks.pop ());

  par->inner = inner->inner;
  inner->inner = NULL;

  delete inner;
}

/* Process the parallel PAR and all its contained
   parallels.
   We do everything but the neutering.  Return mask of
   partitioned modes used within this parallel.  */

static unsigned
nvptx_process_pars (parallel *par)
{
  if (nvptx_optimize)
    nvptx_optimize_inner (par);

  unsigned inner_mask = par->mask;

  /* Do the inner parallels first.  */
  if (par->inner)
    {
      par->inner_mask = nvptx_process_pars (par->inner);
      inner_mask |= par->inner_mask;
    }

  bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
  bool worker = (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER));
  bool large_vector = ((par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
		       && nvptx_mach_vector_length () > PTX_WARP_SIZE);

  if (worker || large_vector)
    {
      /* Worker or large-vector partitioning: spill & fill live state
	 through the shared broadcast buffer.  */
      nvptx_shared_propagate (false, is_call, par->forked_block,
			      par->forked_insn, !worker);
      bool no_prop_p
	= nvptx_shared_propagate (true, is_call, par->forked_block,
				  par->fork_insn, !worker);
      bool empty_loop_p
	= !is_call && (NEXT_INSN (par->forked_insn)
		       && NEXT_INSN (par->forked_insn) == par->joining_insn);
      rtx barrier = GEN_INT (0);
      int threads = 0;

      if (!worker && cfun->machine->sync_bar)
	{
	  barrier = cfun->machine->sync_bar;
	  threads = nvptx_mach_vector_length ();
	}

      if (no_prop_p && empty_loop_p)
	;
      else if (no_prop_p && is_call)
	;
      else
	{
	  /* Insert begin and end synchronizations.  */
	  emit_insn_before (nvptx_cta_sync (barrier, threads),
			    par->forked_insn);
	  emit_insn_before (nvptx_cta_sync (barrier, threads), par->join_insn);
	}
    }
  else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
    nvptx_warp_propagate (is_call, par->forked_block, par->forked_insn);

  /* Now do siblings.  */
  if (par->next)
    inner_mask |= nvptx_process_pars (par->next);
  return inner_mask;
}

/* Neuter the parallel described by PAR.  We recurse in depth-first
   order.  MODES are the partitioning of the execution and OUTER is
   the partitioning of the parallels we are contained in.  */

static void
nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
{
  unsigned me = (par->mask
		 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
		    | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
  unsigned skip_mask = 0, neuter_mask = 0;

  if (par->inner)
    nvptx_neuter_pars (par->inner, modes, outer | me);

  for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
    {
      if ((outer | me) & GOMP_DIM_MASK (mode))
	{} /* Mode is partitioned: no neutering.  */
      else if (!(modes & GOMP_DIM_MASK (mode)))
	{} /* Mode is not used: nothing to do.  */
      else if (par->inner_mask & GOMP_DIM_MASK (mode)
	       || !par->forked_insn)
	/* Partitioned in inner parallels, or we're not a partitioned
	   at all: neuter individual blocks.
*/ 5237 1.1 mrg neuter_mask |= GOMP_DIM_MASK (mode); 5238 1.1 mrg else if (!par->parent || !par->parent->forked_insn 5239 1.1 mrg || par->parent->inner_mask & GOMP_DIM_MASK (mode)) 5240 1.1 mrg /* Parent isn't a parallel or contains this paralleling: skip 5241 1.1 mrg parallel at this level. */ 5242 1.1 mrg skip_mask |= GOMP_DIM_MASK (mode); 5243 1.1 mrg else 5244 1.1 mrg {} /* Parent will skip this parallel itself. */ 5245 1.1 mrg } 5246 1.1 mrg 5247 1.1 mrg if (neuter_mask) 5248 1.1 mrg { 5249 1.1 mrg int ix, len; 5250 1.1 mrg 5251 1.1 mrg if (nvptx_optimize) 5252 1.1 mrg { 5253 1.1 mrg /* Neuter whole SESE regions. */ 5254 1.1 mrg bb_pair_vec_t regions; 5255 1.1 mrg 5256 1.1 mrg nvptx_find_sese (par->blocks, regions); 5257 1.1 mrg len = regions.length (); 5258 1.1 mrg for (ix = 0; ix != len; ix++) 5259 1.1 mrg { 5260 1.1 mrg basic_block from = regions[ix].first; 5261 1.1 mrg basic_block to = regions[ix].second; 5262 1.1 mrg 5263 1.1 mrg if (from) 5264 1.1 mrg nvptx_single (neuter_mask, from, to); 5265 1.1 mrg else 5266 1.1 mrg gcc_assert (!to); 5267 1.1 mrg } 5268 1.1 mrg } 5269 1.1 mrg else 5270 1.1 mrg { 5271 1.1 mrg /* Neuter each BB individually. 
*/ 5272 1.1 mrg len = par->blocks.length (); 5273 1.1 mrg for (ix = 0; ix != len; ix++) 5274 1.1 mrg { 5275 1.1 mrg basic_block block = par->blocks[ix]; 5276 1.1 mrg 5277 1.1 mrg nvptx_single (neuter_mask, block, block); 5278 1.1 mrg } 5279 1.1 mrg } 5280 1.1 mrg } 5281 1.1 mrg 5282 1.1 mrg if (skip_mask) 5283 1.1 mrg nvptx_skip_par (skip_mask, par); 5284 1.1 mrg 5285 1.1 mrg if (par->next) 5286 1.1 mrg nvptx_neuter_pars (par->next, modes, outer); 5287 1.1 mrg } 5288 1.1 mrg 5289 1.1 mrg static void 5290 1.1 mrg populate_offload_attrs (offload_attrs *oa) 5291 1.1 mrg { 5292 1.1 mrg tree attr = oacc_get_fn_attrib (current_function_decl); 5293 1.1 mrg tree dims = TREE_VALUE (attr); 5294 1.1 mrg unsigned ix; 5295 1.1 mrg 5296 1.1 mrg oa->mask = 0; 5297 1.1 mrg 5298 1.1 mrg for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims)) 5299 1.1 mrg { 5300 1.1 mrg tree t = TREE_VALUE (dims); 5301 1.1 mrg int size = (t == NULL_TREE) ? -1 : TREE_INT_CST_LOW (t); 5302 1.1 mrg tree allowed = TREE_PURPOSE (dims); 5303 1.1 mrg 5304 1.1 mrg if (size != 1 && !(allowed && integer_zerop (allowed))) 5305 1.1 mrg oa->mask |= GOMP_DIM_MASK (ix); 5306 1.1 mrg 5307 1.1 mrg switch (ix) 5308 1.1 mrg { 5309 1.1 mrg case GOMP_DIM_GANG: 5310 1.1 mrg oa->num_gangs = size; 5311 1.1 mrg break; 5312 1.1 mrg 5313 1.1 mrg case GOMP_DIM_WORKER: 5314 1.1 mrg oa->num_workers = size; 5315 1.1 mrg break; 5316 1.1 mrg 5317 1.1 mrg case GOMP_DIM_VECTOR: 5318 1.1 mrg oa->vector_length = size; 5319 1.1 mrg break; 5320 1.1 mrg } 5321 1.1 mrg } 5322 1.1 mrg } 5323 1.1 mrg 5324 1.1 mrg #if WORKAROUND_PTXJIT_BUG_2 5325 1.1 mrg /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant 5326 1.1 mrg is needed in the nvptx target because the branches generated for 5327 1.1 mrg parititioning are NONJUMP_INSN_P, not JUMP_P. 
*/ 5328 1.1 mrg 5329 1.1 mrg static rtx 5330 1.1 mrg nvptx_pc_set (const rtx_insn *insn, bool strict = true) 5331 1.1 mrg { 5332 1.1 mrg rtx pat; 5333 1.1 mrg if ((strict && !JUMP_P (insn)) 5334 1.1 mrg || (!strict && !INSN_P (insn))) 5335 1.1 mrg return NULL_RTX; 5336 1.1 mrg pat = PATTERN (insn); 5337 1.1 mrg 5338 1.1 mrg /* The set is allowed to appear either as the insn pattern or 5339 1.1 mrg the first set in a PARALLEL. */ 5340 1.1 mrg if (GET_CODE (pat) == PARALLEL) 5341 1.1 mrg pat = XVECEXP (pat, 0, 0); 5342 1.1 mrg if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC) 5343 1.1 mrg return pat; 5344 1.1 mrg 5345 1.1 mrg return NULL_RTX; 5346 1.1 mrg } 5347 1.1 mrg 5348 1.1 mrg /* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT. */ 5349 1.1 mrg 5350 1.1 mrg static rtx 5351 1.1 mrg nvptx_condjump_label (const rtx_insn *insn, bool strict = true) 5352 1.1 mrg { 5353 1.1 mrg rtx x = nvptx_pc_set (insn, strict); 5354 1.1 mrg 5355 1.1 mrg if (!x) 5356 1.1 mrg return NULL_RTX; 5357 1.1 mrg x = SET_SRC (x); 5358 1.1 mrg if (GET_CODE (x) == LABEL_REF) 5359 1.1 mrg return x; 5360 1.1 mrg if (GET_CODE (x) != IF_THEN_ELSE) 5361 1.1 mrg return NULL_RTX; 5362 1.1 mrg if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF) 5363 1.1 mrg return XEXP (x, 1); 5364 1.1 mrg if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF) 5365 1.1 mrg return XEXP (x, 2); 5366 1.1 mrg return NULL_RTX; 5367 1.1 mrg } 5368 1.1 mrg 5369 1.1 mrg /* Insert a dummy ptx insn when encountering a branch to a label with no ptx 5370 1.1 mrg insn inbetween the branch and the label. This works around a JIT bug 5371 1.1 mrg observed at driver version 384.111, at -O0 for sm_50. 
*/ 5372 1.1 mrg 5373 1.1 mrg static void 5374 1.1 mrg prevent_branch_around_nothing (void) 5375 1.1 mrg { 5376 1.1 mrg rtx_insn *seen_label = NULL; 5377 1.1 mrg for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn)) 5378 1.1 mrg { 5379 1.1 mrg if (INSN_P (insn) && condjump_p (insn)) 5380 1.1 mrg { 5381 1.1 mrg seen_label = label_ref_label (nvptx_condjump_label (insn, false)); 5382 1.1 mrg continue; 5383 1.1 mrg } 5384 1.1 mrg 5385 1.1 mrg if (seen_label == NULL) 5386 1.1 mrg continue; 5387 1.1 mrg 5388 1.1 mrg if (NOTE_P (insn) || DEBUG_INSN_P (insn)) 5389 1.1 mrg continue; 5390 1.1 mrg 5391 1.1 mrg if (INSN_P (insn)) 5392 1.1 mrg switch (recog_memoized (insn)) 5393 1.1 mrg { 5394 1.1 mrg case CODE_FOR_nvptx_fork: 5395 1.1 mrg case CODE_FOR_nvptx_forked: 5396 1.1 mrg case CODE_FOR_nvptx_joining: 5397 1.1 mrg case CODE_FOR_nvptx_join: 5398 1.1 mrg case CODE_FOR_nop: 5399 1.1 mrg continue; 5400 1.1 mrg case -1: 5401 1.1 mrg /* Handle asm ("") and similar. */ 5402 1.1 mrg if (GET_CODE (PATTERN (insn)) == ASM_INPUT 5403 1.1 mrg || GET_CODE (PATTERN (insn)) == ASM_OPERANDS 5404 1.1 mrg || (GET_CODE (PATTERN (insn)) == PARALLEL 5405 1.1 mrg && asm_noperands (PATTERN (insn)) >= 0)) 5406 1.1 mrg continue; 5407 1.1 mrg /* FALLTHROUGH. */ 5408 1.1 mrg default: 5409 1.1 mrg seen_label = NULL; 5410 1.1 mrg continue; 5411 1.1 mrg } 5412 1.1 mrg 5413 1.1 mrg if (LABEL_P (insn) && insn == seen_label) 5414 1.1 mrg emit_insn_before (gen_fake_nop (), insn); 5415 1.1 mrg 5416 1.1 mrg seen_label = NULL; 5417 1.1 mrg } 5418 1.1 mrg } 5419 1.1 mrg #endif 5420 1.1 mrg 5421 1.1 mrg #ifdef WORKAROUND_PTXJIT_BUG_3 5422 1.1 mrg /* Insert two membar.cta insns inbetween two subsequent bar.sync insns. This 5423 1.1 mrg works around a hang observed at driver version 390.48 for sm_50. 
*/ 5424 1.1 mrg 5425 1.1 mrg static void 5426 1.1 mrg workaround_barsyncs (void) 5427 1.1 mrg { 5428 1.1 mrg bool seen_barsync = false; 5429 1.1 mrg for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn)) 5430 1.1 mrg { 5431 1.1 mrg if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync) 5432 1.1 mrg { 5433 1.1 mrg if (seen_barsync) 5434 1.1 mrg { 5435 1.1 mrg emit_insn_before (gen_nvptx_membar_cta (), insn); 5436 1.1 mrg emit_insn_before (gen_nvptx_membar_cta (), insn); 5437 1.1 mrg } 5438 1.1 mrg 5439 1.1 mrg seen_barsync = true; 5440 1.1 mrg continue; 5441 1.1 mrg } 5442 1.1 mrg 5443 1.1 mrg if (!seen_barsync) 5444 1.1 mrg continue; 5445 1.1 mrg 5446 1.1 mrg if (NOTE_P (insn) || DEBUG_INSN_P (insn)) 5447 1.1 mrg continue; 5448 1.1 mrg else if (INSN_P (insn)) 5449 1.1 mrg switch (recog_memoized (insn)) 5450 1.1 mrg { 5451 1.1 mrg case CODE_FOR_nvptx_fork: 5452 1.1 mrg case CODE_FOR_nvptx_forked: 5453 1.1 mrg case CODE_FOR_nvptx_joining: 5454 1.1 mrg case CODE_FOR_nvptx_join: 5455 1.1 mrg continue; 5456 1.1 mrg default: 5457 1.1 mrg break; 5458 1.1 mrg } 5459 1.1 mrg 5460 1.1 mrg seen_barsync = false; 5461 1.1 mrg } 5462 1.1 mrg } 5463 1.1 mrg #endif 5464 1.1 mrg 5465 1.1 mrg static rtx 5466 1.1 mrg gen_comment (const char *s) 5467 1.1 mrg { 5468 1.1 mrg const char *sep = " "; 5469 1.1 mrg size_t len = strlen (ASM_COMMENT_START) + strlen (sep) + strlen (s) + 1; 5470 1.1 mrg char *comment = (char *) alloca (len); 5471 1.1 mrg snprintf (comment, len, "%s%s%s", ASM_COMMENT_START, sep, s); 5472 1.1 mrg return gen_rtx_ASM_INPUT_loc (VOIDmode, ggc_strdup (comment), 5473 1.1 mrg DECL_SOURCE_LOCATION (cfun->decl)); 5474 1.1 mrg } 5475 1.1 mrg 5476 1.1 mrg /* Initialize all declared regs at function entry. 5477 1.1 mrg Advantage : Fool-proof. 5478 1.1 mrg Disadvantage: Potentially creates a lot of long live ranges and adds a lot 5479 1.1 mrg of insns. 
*/ 5480 1.1 mrg 5481 1.1 mrg static void 5482 1.1 mrg workaround_uninit_method_1 (void) 5483 1.1 mrg { 5484 1.1 mrg rtx_insn *first = get_insns (); 5485 1.1 mrg rtx_insn *insert_here = NULL; 5486 1.1 mrg 5487 1.1 mrg for (int ix = LAST_VIRTUAL_REGISTER + 1; ix < max_reg_num (); ix++) 5488 1.1 mrg { 5489 1.1 mrg rtx reg = regno_reg_rtx[ix]; 5490 1.1 mrg 5491 1.1 mrg /* Skip undeclared registers. */ 5492 1.1 mrg if (reg == const0_rtx) 5493 1.1 mrg continue; 5494 1.1 mrg 5495 1.1 mrg gcc_assert (CONST0_RTX (GET_MODE (reg))); 5496 1.1 mrg 5497 1.1 mrg start_sequence (); 5498 1.1 mrg if (nvptx_comment && first != NULL) 5499 1.1 mrg emit_insn (gen_comment ("Start: Added by -minit-regs=1")); 5500 1.1 mrg emit_move_insn (reg, CONST0_RTX (GET_MODE (reg))); 5501 1.1 mrg rtx_insn *inits = get_insns (); 5502 1.1 mrg end_sequence (); 5503 1.1 mrg 5504 1.1 mrg if (dump_file && (dump_flags & TDF_DETAILS)) 5505 1.1 mrg for (rtx_insn *init = inits; init != NULL; init = NEXT_INSN (init)) 5506 1.1 mrg fprintf (dump_file, "Default init of reg %u inserted: insn %u\n", 5507 1.1 mrg ix, INSN_UID (init)); 5508 1.1 mrg 5509 1.1 mrg if (first != NULL) 5510 1.1 mrg { 5511 1.1 mrg insert_here = emit_insn_before (inits, first); 5512 1.1 mrg first = NULL; 5513 1.1 mrg } 5514 1.1 mrg else 5515 1.1 mrg insert_here = emit_insn_after (inits, insert_here); 5516 1.1 mrg } 5517 1.1 mrg 5518 1.1 mrg if (nvptx_comment && insert_here != NULL) 5519 1.1 mrg emit_insn_after (gen_comment ("End: Added by -minit-regs=1"), insert_here); 5520 1.1 mrg } 5521 1.1 mrg 5522 1.1 mrg /* Find uses of regs that are not defined on all incoming paths, and insert a 5523 1.1 mrg corresponding def at function entry. 5524 1.1 mrg Advantage : Simple. 5525 1.1 mrg Disadvantage: Potentially creates long live ranges. 5526 1.1 mrg May not catch all cases. F.i. 
a clobber cuts a live range in 5527 1.1 mrg the compiler and may prevent entry_lr_in from being set for a 5528 1.1 mrg reg, but the clobber does not translate to a ptx insn, so in 5529 1.1 mrg ptx there still may be an uninitialized ptx reg. See f.i. 5530 1.1 mrg gcc.c-torture/compile/20020926-1.c. */ 5531 1.1 mrg 5532 1.1 mrg static void 5533 1.1 mrg workaround_uninit_method_2 (void) 5534 1.1 mrg { 5535 1.1 mrg auto_bitmap entry_pseudo_uninit; 5536 1.1 mrg { 5537 1.1 mrg auto_bitmap not_pseudo; 5538 1.1 mrg bitmap_set_range (not_pseudo, 0, LAST_VIRTUAL_REGISTER); 5539 1.1 mrg 5540 1.1 mrg bitmap entry_lr_in = DF_LR_IN (ENTRY_BLOCK_PTR_FOR_FN (cfun)); 5541 1.1 mrg bitmap_and_compl (entry_pseudo_uninit, entry_lr_in, not_pseudo); 5542 1.1 mrg } 5543 1.1 mrg 5544 1.1 mrg rtx_insn *first = get_insns (); 5545 1.1 mrg rtx_insn *insert_here = NULL; 5546 1.1 mrg 5547 1.1 mrg bitmap_iterator iterator; 5548 1.1 mrg unsigned ix; 5549 1.1 mrg EXECUTE_IF_SET_IN_BITMAP (entry_pseudo_uninit, 0, ix, iterator) 5550 1.1 mrg { 5551 1.1 mrg rtx reg = regno_reg_rtx[ix]; 5552 1.1 mrg gcc_assert (CONST0_RTX (GET_MODE (reg))); 5553 1.1 mrg 5554 1.1 mrg start_sequence (); 5555 1.1 mrg if (nvptx_comment && first != NULL) 5556 1.1 mrg emit_insn (gen_comment ("Start: Added by -minit-regs=2:")); 5557 1.1 mrg emit_move_insn (reg, CONST0_RTX (GET_MODE (reg))); 5558 1.1 mrg rtx_insn *inits = get_insns (); 5559 1.1 mrg end_sequence (); 5560 1.1 mrg 5561 1.1 mrg if (dump_file && (dump_flags & TDF_DETAILS)) 5562 1.1 mrg for (rtx_insn *init = inits; init != NULL; init = NEXT_INSN (init)) 5563 1.1 mrg fprintf (dump_file, "Missing init of reg %u inserted: insn %u\n", 5564 1.1 mrg ix, INSN_UID (init)); 5565 1.1 mrg 5566 1.1 mrg if (first != NULL) 5567 1.1 mrg { 5568 1.1 mrg insert_here = emit_insn_before (inits, first); 5569 1.1 mrg first = NULL; 5570 1.1 mrg } 5571 1.1 mrg else 5572 1.1 mrg insert_here = emit_insn_after (inits, insert_here); 5573 1.1 mrg } 5574 1.1 mrg 5575 1.1 mrg if (nvptx_comment && 
insert_here != NULL) 5576 1.1 mrg emit_insn_after (gen_comment ("End: Added by -minit-regs=2"), insert_here); 5577 1.1 mrg } 5578 1.1 mrg 5579 1.1 mrg /* Find uses of regs that are not defined on all incoming paths, and insert a 5580 1.1 mrg corresponding def on those. 5581 1.1 mrg Advantage : Doesn't create long live ranges. 5582 1.1 mrg Disadvantage: More complex, and potentially also more defs. */ 5583 1.1 mrg 5584 1.1 mrg static void 5585 1.1 mrg workaround_uninit_method_3 (void) 5586 1.1 mrg { 5587 1.1 mrg auto_bitmap not_pseudo; 5588 1.1 mrg bitmap_set_range (not_pseudo, 0, LAST_VIRTUAL_REGISTER); 5589 1.1 mrg 5590 1.1 mrg basic_block bb; 5591 1.1 mrg FOR_EACH_BB_FN (bb, cfun) 5592 1.1 mrg { 5593 1.1 mrg if (single_pred_p (bb)) 5594 1.1 mrg continue; 5595 1.1 mrg 5596 1.1 mrg auto_bitmap bb_pseudo_uninit; 5597 1.1 mrg bitmap_and_compl (bb_pseudo_uninit, DF_LIVE_IN (bb), DF_MIR_IN (bb)); 5598 1.1 mrg bitmap_and_compl_into (bb_pseudo_uninit, not_pseudo); 5599 1.1 mrg 5600 1.1 mrg bitmap_iterator iterator; 5601 1.1 mrg unsigned ix; 5602 1.1 mrg EXECUTE_IF_SET_IN_BITMAP (bb_pseudo_uninit, 0, ix, iterator) 5603 1.1 mrg { 5604 1.1 mrg bool have_false = false; 5605 1.1 mrg bool have_true = false; 5606 1.1 mrg 5607 1.1 mrg edge e; 5608 1.1 mrg edge_iterator ei; 5609 1.1 mrg FOR_EACH_EDGE (e, ei, bb->preds) 5610 1.1 mrg { 5611 1.1 mrg if (bitmap_bit_p (DF_LIVE_OUT (e->src), ix)) 5612 1.1 mrg have_true = true; 5613 1.1 mrg else 5614 1.1 mrg have_false = true; 5615 1.1 mrg } 5616 1.1 mrg if (have_false ^ have_true) 5617 1.1 mrg continue; 5618 1.1 mrg 5619 1.1 mrg FOR_EACH_EDGE (e, ei, bb->preds) 5620 1.1 mrg { 5621 1.1 mrg if (bitmap_bit_p (DF_LIVE_OUT (e->src), ix)) 5622 1.1 mrg continue; 5623 1.1 mrg 5624 1.1 mrg rtx reg = regno_reg_rtx[ix]; 5625 1.1 mrg gcc_assert (CONST0_RTX (GET_MODE (reg))); 5626 1.1 mrg 5627 1.1 mrg start_sequence (); 5628 1.1 mrg emit_move_insn (reg, CONST0_RTX (GET_MODE (reg))); 5629 1.1 mrg rtx_insn *inits = get_insns (); 5630 1.1 mrg 
end_sequence (); 5631 1.1 mrg 5632 1.1 mrg if (dump_file && (dump_flags & TDF_DETAILS)) 5633 1.1 mrg for (rtx_insn *init = inits; init != NULL; 5634 1.1 mrg init = NEXT_INSN (init)) 5635 1.1 mrg fprintf (dump_file, 5636 1.1 mrg "Missing init of reg %u inserted on edge: %d -> %d:" 5637 1.1 mrg " insn %u\n", ix, e->src->index, e->dest->index, 5638 1.1 mrg INSN_UID (init)); 5639 1.1 mrg 5640 1.1 mrg insert_insn_on_edge (inits, e); 5641 1.1 mrg } 5642 1.1 mrg } 5643 1.1 mrg } 5644 1.1 mrg 5645 1.1 mrg if (nvptx_comment) 5646 1.1 mrg FOR_EACH_BB_FN (bb, cfun) 5647 1.1 mrg { 5648 1.1 mrg if (single_pred_p (bb)) 5649 1.1 mrg continue; 5650 1.1 mrg 5651 1.1 mrg edge e; 5652 1.1 mrg edge_iterator ei; 5653 1.1 mrg FOR_EACH_EDGE (e, ei, bb->preds) 5654 1.1 mrg { 5655 1.1 mrg if (e->insns.r == NULL_RTX) 5656 1.1 mrg continue; 5657 1.1 mrg start_sequence (); 5658 1.1 mrg emit_insn (gen_comment ("Start: Added by -minit-regs=3:")); 5659 1.1 mrg emit_insn (e->insns.r); 5660 1.1 mrg emit_insn (gen_comment ("End: Added by -minit-regs=3:")); 5661 1.1 mrg e->insns.r = get_insns (); 5662 1.1 mrg end_sequence (); 5663 1.1 mrg } 5664 1.1 mrg } 5665 1.1 mrg 5666 1.1 mrg commit_edge_insertions (); 5667 1.1 mrg } 5668 1.1 mrg 5669 1.1 mrg static void 5670 1.1 mrg workaround_uninit (void) 5671 1.1 mrg { 5672 1.1 mrg switch (nvptx_init_regs) 5673 1.1 mrg { 5674 1.1 mrg case 0: 5675 1.1 mrg /* Skip. 
*/ 5676 1.1 mrg break; 5677 1.1 mrg case 1: 5678 1.1 mrg workaround_uninit_method_1 (); 5679 1.1 mrg break; 5680 1.1 mrg case 2: 5681 1.1 mrg workaround_uninit_method_2 (); 5682 1.1 mrg break; 5683 1.1 mrg case 3: 5684 1.1 mrg workaround_uninit_method_3 (); 5685 1.1 mrg break; 5686 1.1 mrg default: 5687 1.1 mrg gcc_unreachable (); 5688 1.1 mrg } 5689 1.1 mrg } 5690 1.1 mrg 5691 1.1 mrg /* PTX-specific reorganization 5692 1.1 mrg - Split blocks at fork and join instructions 5693 1.1 mrg - Compute live registers 5694 1.1 mrg - Mark now-unused registers, so function begin doesn't declare 5695 1.1 mrg unused registers. 5696 1.1 mrg - Insert state propagation when entering partitioned mode 5697 1.1 mrg - Insert neutering instructions when in single mode 5698 1.1 mrg - Replace subregs with suitable sequences. 5699 1.1 mrg */ 5700 1.1 mrg 5701 1.1 mrg static void 5702 1.1 mrg nvptx_reorg (void) 5703 1.1 mrg { 5704 1.1 mrg /* We are freeing block_for_insn in the toplev to keep compatibility 5705 1.1 mrg with old MDEP_REORGS that are not CFG based. Recompute it now. */ 5706 1.1 mrg compute_bb_for_insn (); 5707 1.1 mrg 5708 1.1 mrg thread_prologue_and_epilogue_insns (); 5709 1.1 mrg 5710 1.1 mrg /* Split blocks and record interesting unspecs. */ 5711 1.1 mrg bb_insn_map_t bb_insn_map; 5712 1.1 mrg 5713 1.1 mrg nvptx_split_blocks (&bb_insn_map); 5714 1.1 mrg 5715 1.1 mrg /* Compute live regs */ 5716 1.1 mrg df_clear_flags (DF_LR_RUN_DCE); 5717 1.1 mrg df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS); 5718 1.1 mrg df_live_add_problem (); 5719 1.1 mrg df_live_set_all_dirty (); 5720 1.1 mrg if (nvptx_init_regs == 3) 5721 1.1 mrg df_mir_add_problem (); 5722 1.1 mrg df_analyze (); 5723 1.1 mrg regstat_init_n_sets_and_refs (); 5724 1.1 mrg 5725 1.1 mrg if (dump_file) 5726 1.1 mrg df_dump (dump_file); 5727 1.1 mrg 5728 1.1 mrg /* Mark unused regs as unused. 
*/ 5729 1.1 mrg int max_regs = max_reg_num (); 5730 1.1 mrg for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++) 5731 1.1 mrg if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0) 5732 1.1 mrg regno_reg_rtx[i] = const0_rtx; 5733 1.1 mrg 5734 1.1 mrg workaround_uninit (); 5735 1.1 mrg 5736 1.1 mrg /* Determine launch dimensions of the function. If it is not an 5737 1.1 mrg offloaded function (i.e. this is a regular compiler), the 5738 1.1 mrg function has no neutering. */ 5739 1.1 mrg tree attr = oacc_get_fn_attrib (current_function_decl); 5740 1.1 mrg if (attr) 5741 1.1 mrg { 5742 1.1 mrg /* If we determined this mask before RTL expansion, we could 5743 1.1 mrg elide emission of some levels of forks and joins. */ 5744 1.1 mrg offload_attrs oa; 5745 1.1 mrg 5746 1.1 mrg populate_offload_attrs (&oa); 5747 1.1 mrg 5748 1.1 mrg /* If there is worker neutering, there must be vector 5749 1.1 mrg neutering. Otherwise the hardware will fail. */ 5750 1.1 mrg gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) 5751 1.1 mrg || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))); 5752 1.1 mrg 5753 1.1 mrg /* Discover & process partitioned regions. */ 5754 1.1 mrg parallel *pars = nvptx_discover_pars (&bb_insn_map); 5755 1.1 mrg nvptx_process_pars (pars); 5756 1.1 mrg nvptx_neuter_pars (pars, oa.mask, 0); 5757 1.1 mrg delete pars; 5758 1.1 mrg } 5759 1.1 mrg 5760 1.1 mrg /* Replace subregs. 
*/ 5761 1.1 mrg nvptx_reorg_subreg (); 5762 1.1 mrg 5763 1.1 mrg if (TARGET_UNIFORM_SIMT) 5764 1.1 mrg nvptx_reorg_uniform_simt (); 5765 1.1 mrg 5766 1.1 mrg #if WORKAROUND_PTXJIT_BUG_2 5767 1.1 mrg prevent_branch_around_nothing (); 5768 1.1 mrg #endif 5769 1.1 mrg 5770 1.1 mrg #ifdef WORKAROUND_PTXJIT_BUG_3 5771 1.1 mrg workaround_barsyncs (); 5772 1.1 mrg #endif 5773 1.1 mrg 5774 1.1 mrg regstat_free_n_sets_and_refs (); 5775 1.1 mrg 5776 1.1 mrg df_finish_pass (true); 5777 1.1 mrg } 5778 1.1 mrg 5779 1.1 mrg /* Handle a "kernel" attribute; arguments as in 5781 1.1 mrg struct attribute_spec.handler. */ 5782 1.1 mrg 5783 1.1 mrg static tree 5784 1.1 mrg nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args), 5785 1.1 mrg int ARG_UNUSED (flags), bool *no_add_attrs) 5786 1.1 mrg { 5787 1.1 mrg tree decl = *node; 5788 1.1 mrg 5789 1.1 mrg if (TREE_CODE (decl) != FUNCTION_DECL) 5790 1.1 mrg { 5791 1.1 mrg error ("%qE attribute only applies to functions", name); 5792 1.1 mrg *no_add_attrs = true; 5793 1.1 mrg } 5794 1.1 mrg else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl)))) 5795 1.1 mrg { 5796 1.1 mrg error ("%qE attribute requires a void return type", name); 5797 1.1 mrg *no_add_attrs = true; 5798 1.1 mrg } 5799 1.1 mrg 5800 1.1 mrg return NULL_TREE; 5801 1.1 mrg } 5802 1.1 mrg 5803 1.1 mrg /* Handle a "shared" attribute; arguments as in 5804 1.1 mrg struct attribute_spec.handler. 
*/ 5805 1.1 mrg 5806 1.1 mrg static tree 5807 1.1 mrg nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args), 5808 1.1 mrg int ARG_UNUSED (flags), bool *no_add_attrs) 5809 1.1 mrg { 5810 1.1 mrg tree decl = *node; 5811 1.1 mrg 5812 1.1 mrg if (TREE_CODE (decl) != VAR_DECL) 5813 1.1 mrg { 5814 1.1 mrg error ("%qE attribute only applies to variables", name); 5815 1.1 mrg *no_add_attrs = true; 5816 1.1 mrg } 5817 1.1 mrg else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl))) 5818 1.1 mrg { 5819 1.1 mrg error ("%qE attribute not allowed with auto storage class", name); 5820 1.1 mrg *no_add_attrs = true; 5821 1.1 mrg } 5822 1.1 mrg 5823 1.1 mrg return NULL_TREE; 5824 1.1 mrg } 5825 1.1 mrg 5826 1.1 mrg /* Table of valid machine attributes. */ 5827 1.1 mrg static const struct attribute_spec nvptx_attribute_table[] = 5828 1.1 mrg { 5829 1.1 mrg /* { name, min_len, max_len, decl_req, type_req, fn_type_req, 5830 1.1 mrg affects_type_identity, handler, exclude } */ 5831 1.1 mrg { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute, 5832 1.1 mrg NULL }, 5833 1.1 mrg { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute, 5834 1.1 mrg NULL }, 5835 1.1 mrg { NULL, 0, 0, false, false, false, false, NULL, NULL } 5836 1.1 mrg }; 5837 1.1 mrg 5838 1.1 mrg /* Limit vector alignments to BIGGEST_ALIGNMENT. */ 5840 1.1 mrg 5841 1.1 mrg static HOST_WIDE_INT 5842 1.1 mrg nvptx_vector_alignment (const_tree type) 5843 1.1 mrg { 5844 1.1 mrg unsigned HOST_WIDE_INT align; 5845 1.1 mrg tree size = TYPE_SIZE (type); 5846 1.1 mrg 5847 1.1 mrg /* Ensure align is not bigger than BIGGEST_ALIGNMENT. */ 5848 1.1 mrg if (tree_fits_uhwi_p (size)) 5849 1.1 mrg { 5850 1.1 mrg align = tree_to_uhwi (size); 5851 1.1 mrg align = MIN (align, BIGGEST_ALIGNMENT); 5852 1.1 mrg } 5853 1.1 mrg else 5854 1.1 mrg align = BIGGEST_ALIGNMENT; 5855 1.1 mrg 5856 1.1 mrg /* Ensure align is not smaller than mode alignment. 
*/ 5857 1.1 mrg align = MAX (align, GET_MODE_ALIGNMENT (TYPE_MODE (type))); 5858 1.1 mrg 5859 1.1 mrg return align; 5860 1.1 mrg } 5861 1.1 mrg 5862 1.1 mrg /* Indicate that INSN cannot be duplicated. */ 5863 1.1 mrg 5864 1.1 mrg static bool 5865 1.1 mrg nvptx_cannot_copy_insn_p (rtx_insn *insn) 5866 1.1 mrg { 5867 1.1 mrg switch (recog_memoized (insn)) 5868 1.1 mrg { 5869 1.1 mrg case CODE_FOR_nvptx_shufflesi: 5870 1.1 mrg case CODE_FOR_nvptx_shufflesf: 5871 1.1 mrg case CODE_FOR_nvptx_barsync: 5872 1.1 mrg case CODE_FOR_nvptx_fork: 5873 1.1 mrg case CODE_FOR_nvptx_forked: 5874 1.1 mrg case CODE_FOR_nvptx_joining: 5875 1.1 mrg case CODE_FOR_nvptx_join: 5876 1.1 mrg return true; 5877 1.1 mrg default: 5878 1.1 mrg return false; 5879 1.1 mrg } 5880 1.1 mrg } 5881 1.1 mrg 5882 1.1 mrg /* Section anchors do not work. Initialization for flag_section_anchor 5883 1.1 mrg probes the existence of the anchoring target hooks and prevents 5884 1.1 mrg anchoring if they don't exist. However, we may be being used with 5885 1.1 mrg a host-side compiler that does support anchoring, and hence see 5886 1.1 mrg the anchor flag set (as it's not recalculated). So provide an 5887 1.1 mrg implementation denying anchoring. */ 5888 1.1 mrg 5889 1.1 mrg static bool 5890 1.1 mrg nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a)) 5891 1.1 mrg { 5892 1.1 mrg return false; 5893 1.1 mrg } 5894 1.1 mrg 5895 1.1 mrg /* Record a symbol for mkoffload to enter into the mapping table. */ 5897 1.1 mrg 5898 1.1 mrg static void 5899 1.1 mrg nvptx_record_offload_symbol (tree decl) 5900 1.1 mrg { 5901 1.1 mrg switch (TREE_CODE (decl)) 5902 1.1 mrg { 5903 1.1 mrg case VAR_DECL: 5904 1.1 mrg fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n", 5905 1.1 mrg IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl))); 5906 1.1 mrg break; 5907 1.1 mrg 5908 1.1 mrg case FUNCTION_DECL: 5909 1.1 mrg { 5910 1.1 mrg tree attr = oacc_get_fn_attrib (decl); 5911 1.1 mrg /* OpenMP offloading does not set this attribute. 
*/ 5912 1.1 mrg tree dims = attr ? TREE_VALUE (attr) : NULL_TREE; 5913 1.1 mrg 5914 1.1 mrg fprintf (asm_out_file, "//:FUNC_MAP \"%s\"", 5915 1.1 mrg IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl))); 5916 1.1 mrg 5917 1.1 mrg for (; dims; dims = TREE_CHAIN (dims)) 5918 1.1 mrg { 5919 1.1 mrg int size = TREE_INT_CST_LOW (TREE_VALUE (dims)); 5920 1.1 mrg 5921 1.1 mrg gcc_assert (!TREE_PURPOSE (dims)); 5922 1.1 mrg fprintf (asm_out_file, ", %#x", size); 5923 1.1 mrg } 5924 1.1 mrg 5925 1.1 mrg fprintf (asm_out_file, "\n"); 5926 1.1 mrg } 5927 1.1 mrg break; 5928 1.1 mrg 5929 1.1 mrg default: 5930 1.1 mrg gcc_unreachable (); 5931 1.1 mrg } 5932 1.1 mrg } 5933 1.1 mrg 5934 1.1 mrg /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects 5935 1.1 mrg at the start of a file. */ 5936 1.1 mrg 5937 1.1 mrg static void 5938 1.1 mrg nvptx_file_start (void) 5939 1.1 mrg { 5940 1.1 mrg fputs ("// BEGIN PREAMBLE\n", asm_out_file); 5941 1.1 mrg 5942 1.1 mrg fputs ("\t.version\t", asm_out_file); 5943 1.1 mrg fputs (ptx_version_to_string ((enum ptx_version)ptx_version_option), 5944 1.1 mrg asm_out_file); 5945 1.1 mrg fputs ("\n", asm_out_file); 5946 1.1 mrg 5947 1.1 mrg fputs ("\t.target\tsm_", asm_out_file); 5948 1.1 mrg fputs (sm_version_to_string ((enum ptx_isa)ptx_isa_option), 5949 1.1 mrg asm_out_file); 5950 1.1 mrg fputs ("\n", asm_out_file); 5951 1.1 mrg 5952 1.1 mrg fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode)); 5953 1.1 mrg 5954 1.1 mrg fputs ("// END PREAMBLE\n", asm_out_file); 5955 1.1 mrg } 5956 1.1 mrg 5957 1.1 mrg /* Emit a declaration for a worker and vector-level buffer in .shared 5958 1.1 mrg memory. 
*/ 5959 1.1 mrg 5960 1.1 mrg static void 5961 1.1 mrg write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size) 5962 1.1 mrg { 5963 1.1 mrg const char *name = XSTR (sym, 0); 5964 1.1 mrg 5965 1.1 mrg write_var_marker (file, true, false, name); 5966 1.1 mrg fprintf (file, ".shared .align %d .u8 %s[%d];\n", 5967 1.1 mrg align, name, size); 5968 1.1 mrg } 5969 1.1 mrg 5970 1.1 mrg /* Write out the function declarations we've collected and declare storage 5971 1.1 mrg for the broadcast buffer. */ 5972 1.1 mrg 5973 1.1 mrg static void 5974 1.1 mrg nvptx_file_end (void) 5975 1.1 mrg { 5976 1.1 mrg hash_table<tree_hasher>::iterator iter; 5977 1.1 mrg tree decl; 5978 1.1 mrg FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter) 5979 1.1 mrg nvptx_record_fndecl (decl); 5980 1.1 mrg fputs (func_decls.str().c_str(), asm_out_file); 5981 1.1 mrg 5982 1.1 mrg if (oacc_bcast_size) 5983 1.1 mrg write_shared_buffer (asm_out_file, oacc_bcast_sym, 5984 1.1 mrg oacc_bcast_align, oacc_bcast_size); 5985 1.1 mrg 5986 1.1 mrg if (worker_red_size) 5987 1.1 mrg write_shared_buffer (asm_out_file, worker_red_sym, 5988 1.1 mrg worker_red_align, worker_red_size); 5989 1.1 mrg 5990 1.1 mrg if (vector_red_size) 5991 1.1 mrg write_shared_buffer (asm_out_file, vector_red_sym, 5992 1.1 mrg vector_red_align, vector_red_size); 5993 1.1 mrg 5994 1.1 mrg if (gang_private_shared_size) 5995 1.1 mrg write_shared_buffer (asm_out_file, gang_private_shared_sym, 5996 1.1 mrg gang_private_shared_align, gang_private_shared_size); 5997 1.1 mrg 5998 1.1 mrg if (need_softstack_decl) 5999 1.1 mrg { 6000 1.1 mrg write_var_marker (asm_out_file, false, true, "__nvptx_stacks"); 6001 1.1 mrg /* 32 is the maximum number of warps in a block. Even though it's an 6002 1.1 mrg external declaration, emit the array size explicitly; otherwise, it 6003 1.1 mrg may fail at PTX JIT time if the definition is later in link order. 
*/ 6004 1.1 mrg fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n", 6005 1.1 mrg POINTER_SIZE); 6006 1.1 mrg } 6007 1.1 mrg if (need_unisimt_decl) 6008 1.1 mrg { 6009 1.1 mrg write_var_marker (asm_out_file, false, true, "__nvptx_uni"); 6010 1.1 mrg fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n"); 6011 1.1 mrg } 6012 1.1 mrg } 6013 1.1 mrg 6014 1.1 mrg /* Expander for the shuffle builtins. */ 6015 1.1 mrg 6016 1.1 mrg static rtx 6017 1.1 mrg nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore) 6018 1.1 mrg { 6019 1.1 mrg if (ignore) 6020 1.1 mrg return target; 6021 1.1 mrg 6022 1.1 mrg rtx src = expand_expr (CALL_EXPR_ARG (exp, 0), 6023 1.1 mrg NULL_RTX, mode, EXPAND_NORMAL); 6024 1.1 mrg if (!REG_P (src)) 6025 1.1 mrg src = copy_to_mode_reg (mode, src); 6026 1.1 mrg 6027 1.1 mrg rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1), 6028 1.1 mrg NULL_RTX, SImode, EXPAND_NORMAL); 6029 1.1 mrg rtx op = expand_expr (CALL_EXPR_ARG (exp, 2), 6030 1.1 mrg NULL_RTX, SImode, EXPAND_NORMAL); 6031 1.1 mrg 6032 1.1 mrg if (!REG_P (idx) && GET_CODE (idx) != CONST_INT) 6033 1.1 mrg idx = copy_to_mode_reg (SImode, idx); 6034 1.1 mrg 6035 1.1 mrg rtx pat = nvptx_gen_shuffle (target, src, idx, 6036 1.1 mrg (nvptx_shuffle_kind) INTVAL (op)); 6037 1.1 mrg if (pat) 6038 1.1 mrg emit_insn (pat); 6039 1.1 mrg 6040 1.1 mrg return target; 6041 1.1 mrg } 6042 1.1 mrg 6043 1.1 mrg const char * 6044 1.1 mrg nvptx_output_red_partition (rtx dst, rtx offset) 6045 1.1 mrg { 6046 1.1 mrg const char *zero_offset = "\t\tmov.u64\t%%r%d, %%r%d; // vred buffer\n"; 6047 1.1 mrg const char *with_offset = "\t\tadd.u64\t%%r%d, %%r%d, %d; // vred buffer\n"; 6048 1.1 mrg 6049 1.1 mrg if (offset == const0_rtx) 6050 1.1 mrg fprintf (asm_out_file, zero_offset, REGNO (dst), 6051 1.1 mrg REGNO (cfun->machine->red_partition)); 6052 1.1 mrg else 6053 1.1 mrg fprintf (asm_out_file, with_offset, REGNO (dst), 6054 1.1 mrg REGNO (cfun->machine->red_partition), UINTVAL 
(offset)); 6055 1.1 mrg 6056 1.1 mrg return ""; 6057 1.1 mrg } 6058 1.1 mrg 6059 1.1 mrg /* Shared-memory reduction address expander. */ 6060 1.1 mrg 6061 1.1 mrg static rtx 6062 1.1 mrg nvptx_expand_shared_addr (tree exp, rtx target, 6063 1.1 mrg machine_mode ARG_UNUSED (mode), int ignore, 6064 1.1 mrg int vector) 6065 1.1 mrg { 6066 1.1 mrg if (ignore) 6067 1.1 mrg return target; 6068 1.1 mrg 6069 1.1 mrg unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2)); 6070 1.1 mrg unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0)); 6071 1.1 mrg unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1)); 6072 1.1 mrg rtx addr = worker_red_sym; 6073 1.1 mrg 6074 1.1 mrg if (vector) 6075 1.1 mrg { 6076 1.1 mrg offload_attrs oa; 6077 1.1 mrg 6078 1.1 mrg populate_offload_attrs (&oa); 6079 1.1 mrg 6080 1.1 mrg unsigned int psize = ROUND_UP (size + offset, align); 6081 1.1 mrg unsigned int pnum = nvptx_mach_max_workers (); 6082 1.1 mrg vector_red_partition = MAX (vector_red_partition, psize); 6083 1.1 mrg vector_red_size = MAX (vector_red_size, psize * pnum); 6084 1.1 mrg vector_red_align = MAX (vector_red_align, align); 6085 1.1 mrg 6086 1.1 mrg if (cfun->machine->red_partition == NULL) 6087 1.1 mrg cfun->machine->red_partition = gen_reg_rtx (Pmode); 6088 1.1 mrg 6089 1.1 mrg addr = gen_reg_rtx (Pmode); 6090 1.1 mrg emit_insn (gen_nvptx_red_partition (addr, GEN_INT (offset))); 6091 1.1 mrg } 6092 1.1 mrg else 6093 1.1 mrg { 6094 1.1 mrg worker_red_align = MAX (worker_red_align, align); 6095 1.1 mrg worker_red_size = MAX (worker_red_size, size + offset); 6096 1.1 mrg 6097 1.1 mrg if (offset) 6098 1.1 mrg { 6099 1.1 mrg addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset)); 6100 1.1 mrg addr = gen_rtx_CONST (Pmode, addr); 6101 1.1 mrg } 6102 1.1 mrg } 6103 1.1 mrg 6104 1.1 mrg emit_move_insn (target, addr); 6105 1.1 mrg return target; 6106 1.1 mrg } 6107 1.1 mrg 6108 1.1 mrg /* Expand the CMP_SWAP PTX builtins. 
We have our own versions that do
   not require taking the address of any object, other than the memory
   cell being operated on.  */

static rtx
nvptx_expand_cmp_swap (tree exp, rtx target,
		       machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
{
  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));

  if (!target)
    target = gen_reg_rtx (mode);

  /* Arg 0 is the memory address, args 1 and 2 the compare and swap
     values respectively.  */
  rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
			 NULL_RTX, Pmode, EXPAND_NORMAL);
  rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
			 NULL_RTX, mode, EXPAND_NORMAL);
  rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
			 NULL_RTX, mode, EXPAND_NORMAL);

  mem = gen_rtx_MEM (mode, mem);
  if (!REG_P (cmp))
    cmp = copy_to_mode_reg (mode, cmp);
  if (!REG_P (src))
    src = copy_to_mode_reg (mode, src);

  /* Only SImode and DImode cmp&swap patterns exist.  */
  rtx pat = (mode == SImode
	     ? gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src,
						const0_rtx)
	     : gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src,
						const0_rtx));

  emit_insn (pat);

  return target;
}


/* Codes for all the NVPTX builtins.
*/ 6147 1.1 mrg enum nvptx_builtins 6148 1.1 mrg { 6149 1.1 mrg NVPTX_BUILTIN_SHUFFLE, 6150 1.1 mrg NVPTX_BUILTIN_SHUFFLELL, 6151 1.1 mrg NVPTX_BUILTIN_WORKER_ADDR, 6152 1.1 mrg NVPTX_BUILTIN_VECTOR_ADDR, 6153 1.1 mrg NVPTX_BUILTIN_CMP_SWAP, 6154 1.1 mrg NVPTX_BUILTIN_CMP_SWAPLL, 6155 1.1 mrg NVPTX_BUILTIN_MEMBAR_GL, 6156 1.1 mrg NVPTX_BUILTIN_MEMBAR_CTA, 6157 1.1 mrg NVPTX_BUILTIN_MAX 6158 1.1 mrg }; 6159 1.1 mrg 6160 1.1 mrg static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX]; 6161 1.1 mrg 6162 1.1 mrg /* Return the NVPTX builtin for CODE. */ 6163 1.1 mrg 6164 1.1 mrg static tree 6165 1.1 mrg nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p)) 6166 1.1 mrg { 6167 1.1 mrg if (code >= NVPTX_BUILTIN_MAX) 6168 1.1 mrg return error_mark_node; 6169 1.1 mrg 6170 1.1 mrg return nvptx_builtin_decls[code]; 6171 1.1 mrg } 6172 1.1 mrg 6173 1.1 mrg /* Set up all builtin functions for this target. */ 6174 1.1 mrg 6175 1.1 mrg static void 6176 1.1 mrg nvptx_init_builtins (void) 6177 1.1 mrg { 6178 1.1 mrg #define DEF(ID, NAME, T) \ 6179 1.1 mrg (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \ 6180 1.1 mrg = add_builtin_function ("__builtin_nvptx_" NAME, \ 6181 1.1 mrg build_function_type_list T, \ 6182 1.1 mrg NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL)) 6183 1.1 mrg #define ST sizetype 6184 1.1 mrg #define UINT unsigned_type_node 6185 1.1 mrg #define LLUINT long_long_unsigned_type_node 6186 1.1 mrg #define PTRVOID ptr_type_node 6187 1.1 mrg #define VOID void_type_node 6188 1.1 mrg 6189 1.1 mrg DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE)); 6190 1.1 mrg DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE)); 6191 1.1 mrg DEF (WORKER_ADDR, "worker_addr", 6192 1.1 mrg (PTRVOID, ST, UINT, UINT, NULL_TREE)); 6193 1.1 mrg DEF (VECTOR_ADDR, "vector_addr", 6194 1.1 mrg (PTRVOID, ST, UINT, UINT, NULL_TREE)); 6195 1.1 mrg DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE)); 6196 1.1 mrg DEF (CMP_SWAPLL, "cmp_swapll", 
(LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE)); 6197 1.1 mrg DEF (MEMBAR_GL, "membar_gl", (VOID, VOID, NULL_TREE)); 6198 1.1 mrg DEF (MEMBAR_CTA, "membar_cta", (VOID, VOID, NULL_TREE)); 6199 1.1 mrg 6200 1.1 mrg #undef DEF 6201 1.1 mrg #undef ST 6202 1.1 mrg #undef UINT 6203 1.1 mrg #undef LLUINT 6204 1.1 mrg #undef PTRVOID 6205 1.1 mrg } 6206 1.1 mrg 6207 1.1 mrg /* Expand an expression EXP that calls a built-in function, 6208 1.1 mrg with result going to TARGET if that's convenient 6209 1.1 mrg (and in mode MODE if that's convenient). 6210 1.1 mrg SUBTARGET may be used as the target for computing one of EXP's operands. 6211 1.1 mrg IGNORE is nonzero if the value is to be ignored. */ 6212 1.1 mrg 6213 1.1 mrg static rtx 6214 1.1 mrg nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget), 6215 1.1 mrg machine_mode mode, int ignore) 6216 1.1 mrg { 6217 1.1 mrg tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); 6218 1.1 mrg switch (DECL_MD_FUNCTION_CODE (fndecl)) 6219 1.1 mrg { 6220 1.1 mrg case NVPTX_BUILTIN_SHUFFLE: 6221 1.1 mrg case NVPTX_BUILTIN_SHUFFLELL: 6222 1.1 mrg return nvptx_expand_shuffle (exp, target, mode, ignore); 6223 1.1 mrg 6224 1.1 mrg case NVPTX_BUILTIN_WORKER_ADDR: 6225 1.1 mrg return nvptx_expand_shared_addr (exp, target, mode, ignore, false); 6226 1.1 mrg 6227 1.1 mrg case NVPTX_BUILTIN_VECTOR_ADDR: 6228 1.1 mrg return nvptx_expand_shared_addr (exp, target, mode, ignore, true); 6229 1.1 mrg 6230 1.1 mrg case NVPTX_BUILTIN_CMP_SWAP: 6231 1.1 mrg case NVPTX_BUILTIN_CMP_SWAPLL: 6232 1.1 mrg return nvptx_expand_cmp_swap (exp, target, mode, ignore); 6233 1.1 mrg 6234 1.1 mrg case NVPTX_BUILTIN_MEMBAR_GL: 6235 1.1 mrg emit_insn (gen_nvptx_membar_gl ()); 6236 1.1 mrg return NULL_RTX; 6237 1.1 mrg 6238 1.1 mrg case NVPTX_BUILTIN_MEMBAR_CTA: 6239 1.1 mrg emit_insn (gen_nvptx_membar_cta ()); 6240 1.1 mrg return NULL_RTX; 6241 1.1 mrg 6242 1.1 mrg default: gcc_unreachable (); 6243 1.1 mrg } 6244 1.1 mrg } 6245 1.1 mrg 6246 1.1 mrg /* 
Implement TARGET_SIMT_VF target hook: number of threads in a warp. */ 6247 1.1 mrg 6248 1.1 mrg static int 6249 1.1 mrg nvptx_simt_vf () 6250 1.1 mrg { 6251 1.1 mrg return PTX_WARP_SIZE; 6252 1.1 mrg } 6253 1.1 mrg 6254 1.1 mrg /* Return 1 if TRAIT NAME is present in the OpenMP context's 6255 1.1 mrg device trait set, return 0 if not present in any OpenMP context in the 6256 1.1 mrg whole translation unit, or -1 if not present in the current OpenMP context 6257 1.1 mrg but might be present in another OpenMP context in the same TU. */ 6258 1.1 mrg 6259 1.1 mrg int 6260 1.1 mrg nvptx_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait, 6261 1.1 mrg const char *name) 6262 1.1 mrg { 6263 1.1 mrg switch (trait) 6264 1.1 mrg { 6265 1.1 mrg case omp_device_kind: 6266 1.1 mrg return strcmp (name, "gpu") == 0; 6267 1.1 mrg case omp_device_arch: 6268 1.1 mrg return strcmp (name, "nvptx") == 0; 6269 1.1 mrg case omp_device_isa: 6270 1.1 mrg #define NVPTX_SM(XX, SEP) \ 6271 1.1 mrg { \ 6272 1.1 mrg if (strcmp (name, "sm_" #XX) == 0) \ 6273 1.1 mrg return ptx_isa_option == PTX_ISA_SM ## XX; \ 6274 1.1 mrg } 6275 1.1 mrg #include "nvptx-sm.def" 6276 1.1 mrg #undef NVPTX_SM 6277 1.1 mrg return 0; 6278 1.1 mrg default: 6279 1.1 mrg gcc_unreachable (); 6280 1.1 mrg } 6281 1.1 mrg } 6282 1.1 mrg 6283 1.1 mrg static bool 6284 1.1 mrg nvptx_welformed_vector_length_p (int l) 6285 1.1 mrg { 6286 1.1 mrg gcc_assert (l > 0); 6287 1.1 mrg return l % PTX_WARP_SIZE == 0; 6288 1.1 mrg } 6289 1.1 mrg 6290 1.1 mrg static void 6291 1.1 mrg nvptx_apply_dim_limits (int dims[]) 6292 1.1 mrg { 6293 1.1 mrg /* Check that the vector_length is not too large. */ 6294 1.1 mrg if (dims[GOMP_DIM_VECTOR] > PTX_MAX_VECTOR_LENGTH) 6295 1.1 mrg dims[GOMP_DIM_VECTOR] = PTX_MAX_VECTOR_LENGTH; 6296 1.1 mrg 6297 1.1 mrg /* Check that the number of workers is not too large. 
*/ 6298 1.1 mrg if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH) 6299 1.1 mrg dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH; 6300 1.1 mrg 6301 1.1 mrg /* Ensure that num_worker * vector_length <= cta size. */ 6302 1.1 mrg if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0 6303 1.1 mrg && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE) 6304 1.1 mrg dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE; 6305 1.1 mrg 6306 1.1 mrg /* If we need a per-worker barrier ... . */ 6307 1.1 mrg if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0 6308 1.1 mrg && dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE) 6309 1.1 mrg /* Don't use more barriers than available. */ 6310 1.1 mrg dims[GOMP_DIM_WORKER] = MIN (dims[GOMP_DIM_WORKER], 6311 1.1 mrg PTX_NUM_PER_WORKER_BARRIERS); 6312 1.1 mrg } 6313 1.1 mrg 6314 1.1 mrg /* Return true if FNDECL contains calls to vector-partitionable routines. */ 6315 1.1 mrg 6316 1.1 mrg static bool 6317 1.1 mrg has_vector_partitionable_routine_calls_p (tree fndecl) 6318 1.1 mrg { 6319 1.1 mrg if (!fndecl) 6320 1.1 mrg return false; 6321 1.1 mrg 6322 1.1 mrg basic_block bb; 6323 1.1 mrg FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (fndecl)) 6324 1.1 mrg for (gimple_stmt_iterator i = gsi_start_bb (bb); !gsi_end_p (i); 6325 1.1 mrg gsi_next_nondebug (&i)) 6326 1.1 mrg { 6327 1.1 mrg gimple *stmt = gsi_stmt (i); 6328 1.1 mrg if (gimple_code (stmt) != GIMPLE_CALL) 6329 1.1 mrg continue; 6330 1.1 mrg 6331 1.1 mrg tree callee = gimple_call_fndecl (stmt); 6332 1.1 mrg if (!callee) 6333 1.1 mrg continue; 6334 1.1 mrg 6335 1.1 mrg tree attrs = oacc_get_fn_attrib (callee); 6336 1.1 mrg if (attrs == NULL_TREE) 6337 1.1 mrg return false; 6338 1.1 mrg 6339 1.1 mrg int partition_level = oacc_fn_attrib_level (attrs); 6340 1.1 mrg bool seq_routine_p = partition_level == GOMP_DIM_MAX; 6341 1.1 mrg if (!seq_routine_p) 6342 1.1 mrg return true; 6343 1.1 mrg } 6344 1.1 mrg 6345 1.1 mrg return false; 6346 1.1 mrg } 6347 1.1 mrg 6348 1.1 mrg /* As nvptx_goacc_validate_dims, but 
does not return bool to indicate whether 6349 1.1 mrg DIMS has changed. */ 6350 1.1 mrg 6351 1.1 mrg static void 6352 1.1 mrg nvptx_goacc_validate_dims_1 (tree decl, int dims[], int fn_level, unsigned used) 6353 1.1 mrg { 6354 1.1 mrg bool oacc_default_dims_p = false; 6355 1.1 mrg bool oacc_min_dims_p = false; 6356 1.1 mrg bool offload_region_p = false; 6357 1.1 mrg bool routine_p = false; 6358 1.1 mrg bool routine_seq_p = false; 6359 1.1 mrg int default_vector_length = -1; 6360 1.1 mrg 6361 1.1 mrg if (decl == NULL_TREE) 6362 1.1 mrg { 6363 1.1 mrg if (fn_level == -1) 6364 1.1 mrg oacc_default_dims_p = true; 6365 1.1 mrg else if (fn_level == -2) 6366 1.1 mrg oacc_min_dims_p = true; 6367 1.1 mrg else 6368 1.1 mrg gcc_unreachable (); 6369 1.1 mrg } 6370 1.1 mrg else if (fn_level == -1) 6371 1.1 mrg offload_region_p = true; 6372 1.1 mrg else if (0 <= fn_level && fn_level <= GOMP_DIM_MAX) 6373 1.1 mrg { 6374 1.1 mrg routine_p = true; 6375 1.1 mrg routine_seq_p = fn_level == GOMP_DIM_MAX; 6376 1.1 mrg } 6377 1.1 mrg else 6378 1.1 mrg gcc_unreachable (); 6379 1.1 mrg 6380 1.1 mrg if (oacc_min_dims_p) 6381 1.1 mrg { 6382 1.1 mrg gcc_assert (dims[GOMP_DIM_VECTOR] == 1); 6383 1.1 mrg gcc_assert (dims[GOMP_DIM_WORKER] == 1); 6384 1.1 mrg gcc_assert (dims[GOMP_DIM_GANG] == 1); 6385 1.1 mrg 6386 1.1 mrg dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE; 6387 1.1 mrg return; 6388 1.1 mrg } 6389 1.1 mrg 6390 1.1 mrg if (routine_p) 6391 1.1 mrg { 6392 1.1 mrg if (!routine_seq_p) 6393 1.1 mrg dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE; 6394 1.1 mrg 6395 1.1 mrg return; 6396 1.1 mrg } 6397 1.1 mrg 6398 1.1 mrg if (oacc_default_dims_p) 6399 1.1 mrg { 6400 1.1 mrg /* -1 : not set 6401 1.1 mrg 0 : set at runtime, f.i. -fopenacc-dims=- 6402 1.1 mrg >= 1: set at compile time, f.i. -fopenacc-dims=1. 
*/ 6403 1.1 mrg gcc_assert (dims[GOMP_DIM_VECTOR] >= -1); 6404 1.1 mrg gcc_assert (dims[GOMP_DIM_WORKER] >= -1); 6405 1.1 mrg gcc_assert (dims[GOMP_DIM_GANG] >= -1); 6406 1.1 mrg 6407 1.1 mrg /* But -fopenacc-dims=- is not yet supported on trunk. */ 6408 1.1 mrg gcc_assert (dims[GOMP_DIM_VECTOR] != 0); 6409 1.1 mrg gcc_assert (dims[GOMP_DIM_WORKER] != 0); 6410 1.1 mrg gcc_assert (dims[GOMP_DIM_GANG] != 0); 6411 1.1 mrg } 6412 1.1 mrg 6413 1.1 mrg if (offload_region_p) 6414 1.1 mrg { 6415 1.1 mrg /* -1 : not set 6416 1.1 mrg 0 : set using variable, f.i. num_gangs (n) 6417 1.1 mrg >= 1: set using constant, f.i. num_gangs (1). */ 6418 1.1 mrg gcc_assert (dims[GOMP_DIM_VECTOR] >= -1); 6419 1.1 mrg gcc_assert (dims[GOMP_DIM_WORKER] >= -1); 6420 1.1 mrg gcc_assert (dims[GOMP_DIM_GANG] >= -1); 6421 1.1 mrg } 6422 1.1 mrg 6423 1.1 mrg if (offload_region_p) 6424 1.1 mrg default_vector_length = oacc_get_default_dim (GOMP_DIM_VECTOR); 6425 1.1 mrg else 6426 1.1 mrg /* oacc_default_dims_p. */ 6427 1.1 mrg default_vector_length = PTX_DEFAULT_VECTOR_LENGTH; 6428 1.1 mrg 6429 1.1 mrg int old_dims[GOMP_DIM_MAX]; 6430 1.1 mrg unsigned int i; 6431 1.1 mrg for (i = 0; i < GOMP_DIM_MAX; ++i) 6432 1.1 mrg old_dims[i] = dims[i]; 6433 1.1 mrg 6434 1.1 mrg const char *vector_reason = NULL; 6435 1.1 mrg if (offload_region_p && has_vector_partitionable_routine_calls_p (decl)) 6436 1.1 mrg { 6437 1.1 mrg default_vector_length = PTX_WARP_SIZE; 6438 1.1 mrg 6439 1.1 mrg if (dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE) 6440 1.1 mrg { 6441 1.1 mrg vector_reason = G_("using %<vector_length (%d)%> due to call to" 6442 1.1 mrg " vector-partitionable routine, ignoring %d"); 6443 1.1 mrg dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE; 6444 1.1 mrg } 6445 1.1 mrg } 6446 1.1 mrg 6447 1.1 mrg if (dims[GOMP_DIM_VECTOR] == 0) 6448 1.1 mrg { 6449 1.1 mrg vector_reason = G_("using %<vector_length (%d)%>, ignoring runtime setting"); 6450 1.1 mrg dims[GOMP_DIM_VECTOR] = default_vector_length; 6451 1.1 mrg } 6452 1.1 mrg 
6453 1.1 mrg if (dims[GOMP_DIM_VECTOR] > 0 6454 1.1 mrg && !nvptx_welformed_vector_length_p (dims[GOMP_DIM_VECTOR])) 6455 1.1 mrg dims[GOMP_DIM_VECTOR] = default_vector_length; 6456 1.1 mrg 6457 1.1 mrg nvptx_apply_dim_limits (dims); 6458 1.1 mrg 6459 1.1 mrg if (dims[GOMP_DIM_VECTOR] != old_dims[GOMP_DIM_VECTOR]) 6460 1.1 mrg warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0, 6461 1.1 mrg vector_reason != NULL 6462 1.1 mrg ? vector_reason 6463 1.1 mrg : G_("using %<vector_length (%d)%>, ignoring %d"), 6464 1.1 mrg dims[GOMP_DIM_VECTOR], old_dims[GOMP_DIM_VECTOR]); 6465 1.1 mrg 6466 1.1 mrg if (dims[GOMP_DIM_WORKER] != old_dims[GOMP_DIM_WORKER]) 6467 1.1 mrg warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0, 6468 1.1 mrg G_("using %<num_workers (%d)%>, ignoring %d"), 6469 1.1 mrg dims[GOMP_DIM_WORKER], old_dims[GOMP_DIM_WORKER]); 6470 1.1 mrg 6471 1.1 mrg if (oacc_default_dims_p) 6472 1.1 mrg { 6473 1.1 mrg if (dims[GOMP_DIM_VECTOR] < 0) 6474 1.1 mrg dims[GOMP_DIM_VECTOR] = default_vector_length; 6475 1.1 mrg if (dims[GOMP_DIM_WORKER] < 0) 6476 1.1 mrg dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM; 6477 1.1 mrg if (dims[GOMP_DIM_GANG] < 0) 6478 1.1 mrg dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM; 6479 1.1 mrg nvptx_apply_dim_limits (dims); 6480 1.1 mrg } 6481 1.1 mrg 6482 1.1 mrg if (offload_region_p) 6483 1.1 mrg { 6484 1.1 mrg for (i = 0; i < GOMP_DIM_MAX; i++) 6485 1.1 mrg { 6486 1.1 mrg if (!(dims[i] < 0)) 6487 1.1 mrg continue; 6488 1.1 mrg 6489 1.1 mrg if ((used & GOMP_DIM_MASK (i)) == 0) 6490 1.1 mrg /* Function oacc_validate_dims will apply the minimal dimension. */ 6491 1.1 mrg continue; 6492 1.1 mrg 6493 1.1 mrg dims[i] = (i == GOMP_DIM_VECTOR 6494 1.1 mrg ? 
default_vector_length 6495 1.1 mrg : oacc_get_default_dim (i)); 6496 1.1 mrg } 6497 1.1 mrg 6498 1.1 mrg nvptx_apply_dim_limits (dims); 6499 1.1 mrg } 6500 1.1 mrg } 6501 1.1 mrg 6502 1.1 mrg /* Validate compute dimensions of an OpenACC offload or routine, fill 6503 1.1 mrg in non-unity defaults. FN_LEVEL indicates the level at which a 6504 1.1 mrg routine might spawn a loop. It is negative for non-routines. If 6505 1.1 mrg DECL is null, we are validating the default dimensions. */ 6506 1.1 mrg 6507 1.1 mrg static bool 6508 1.1 mrg nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level, unsigned used) 6509 1.1 mrg { 6510 1.1 mrg int old_dims[GOMP_DIM_MAX]; 6511 1.1 mrg unsigned int i; 6512 1.1 mrg 6513 1.1 mrg for (i = 0; i < GOMP_DIM_MAX; ++i) 6514 1.1 mrg old_dims[i] = dims[i]; 6515 1.1 mrg 6516 1.1 mrg nvptx_goacc_validate_dims_1 (decl, dims, fn_level, used); 6517 1.1 mrg 6518 1.1 mrg gcc_assert (dims[GOMP_DIM_VECTOR] != 0); 6519 1.1 mrg if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0) 6520 1.1 mrg gcc_assert (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] <= PTX_CTA_SIZE); 6521 1.1 mrg 6522 1.1 mrg for (i = 0; i < GOMP_DIM_MAX; ++i) 6523 1.1 mrg if (old_dims[i] != dims[i]) 6524 1.1 mrg return true; 6525 1.1 mrg 6526 1.1 mrg return false; 6527 1.1 mrg } 6528 1.1 mrg 6529 1.1 mrg /* Return maximum dimension size, or zero for unbounded. */ 6530 1.1 mrg 6531 1.1 mrg static int 6532 1.1 mrg nvptx_dim_limit (int axis) 6533 1.1 mrg { 6534 1.1 mrg switch (axis) 6535 1.1 mrg { 6536 1.1 mrg case GOMP_DIM_VECTOR: 6537 1.1 mrg return PTX_MAX_VECTOR_LENGTH; 6538 1.1 mrg 6539 1.1 mrg default: 6540 1.1 mrg break; 6541 1.1 mrg } 6542 1.1 mrg return 0; 6543 1.1 mrg } 6544 1.1 mrg 6545 1.1 mrg /* Determine whether fork & joins are needed. 
*/ 6546 1.1 mrg 6547 1.1 mrg static bool 6548 1.1 mrg nvptx_goacc_fork_join (gcall *call, const int dims[], 6549 1.1 mrg bool ARG_UNUSED (is_fork)) 6550 1.1 mrg { 6551 1.1 mrg tree arg = gimple_call_arg (call, 2); 6552 1.1 mrg unsigned axis = TREE_INT_CST_LOW (arg); 6553 1.1 mrg 6554 1.1 mrg /* We only care about worker and vector partitioning. */ 6555 1.1 mrg if (axis < GOMP_DIM_WORKER) 6556 1.1 mrg return false; 6557 1.1 mrg 6558 1.1 mrg /* If the size is 1, there's no partitioning. */ 6559 1.1 mrg if (dims[axis] == 1) 6560 1.1 mrg return false; 6561 1.1 mrg 6562 1.1 mrg return true; 6563 1.1 mrg } 6564 1.1 mrg 6565 1.1 mrg /* Generate a PTX builtin function call that returns the address in 6566 1.1 mrg the worker reduction buffer at OFFSET. TYPE is the type of the 6567 1.1 mrg data at that location. */ 6568 1.1 mrg 6569 1.1 mrg static tree 6570 1.1 mrg nvptx_get_shared_red_addr (tree type, tree offset, bool vector) 6571 1.1 mrg { 6572 1.1 mrg enum nvptx_builtins addr_dim = NVPTX_BUILTIN_WORKER_ADDR; 6573 1.1 mrg if (vector) 6574 1.1 mrg addr_dim = NVPTX_BUILTIN_VECTOR_ADDR; 6575 1.1 mrg machine_mode mode = TYPE_MODE (type); 6576 1.1 mrg tree fndecl = nvptx_builtin_decl (addr_dim, true); 6577 1.1 mrg tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode)); 6578 1.1 mrg tree align = build_int_cst (unsigned_type_node, 6579 1.1 mrg GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT); 6580 1.1 mrg tree call = build_call_expr (fndecl, 3, offset, size, align); 6581 1.1 mrg 6582 1.1 mrg return fold_convert (build_pointer_type (type), call); 6583 1.1 mrg } 6584 1.1 mrg 6585 1.1 mrg /* Emit a SHFL.DOWN using index SHFL of VAR into DEST_VAR. This function 6586 1.1 mrg will cast the variable if necessary. 
*/ 6587 1.1 mrg 6588 1.1 mrg static void 6589 1.1 mrg nvptx_generate_vector_shuffle (location_t loc, 6590 1.1 mrg tree dest_var, tree var, unsigned shift, 6591 1.1 mrg gimple_seq *seq) 6592 1.1 mrg { 6593 1.1 mrg unsigned fn = NVPTX_BUILTIN_SHUFFLE; 6594 1.1 mrg tree_code code = NOP_EXPR; 6595 1.1 mrg tree arg_type = unsigned_type_node; 6596 1.1 mrg tree var_type = TREE_TYPE (var); 6597 1.1 mrg tree dest_type = var_type; 6598 1.1 mrg 6599 1.1 mrg if (TREE_CODE (var_type) == COMPLEX_TYPE) 6600 1.1 mrg var_type = TREE_TYPE (var_type); 6601 1.1 mrg 6602 1.1 mrg if (TREE_CODE (var_type) == REAL_TYPE) 6603 1.1 mrg code = VIEW_CONVERT_EXPR; 6604 1.1 mrg 6605 1.1 mrg if (TYPE_SIZE (var_type) 6606 1.1 mrg == TYPE_SIZE (long_long_unsigned_type_node)) 6607 1.1 mrg { 6608 1.1 mrg fn = NVPTX_BUILTIN_SHUFFLELL; 6609 1.1 mrg arg_type = long_long_unsigned_type_node; 6610 1.1 mrg } 6611 1.1 mrg 6612 1.1 mrg tree call = nvptx_builtin_decl (fn, true); 6613 1.1 mrg tree bits = build_int_cst (unsigned_type_node, shift); 6614 1.1 mrg tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN); 6615 1.1 mrg tree expr; 6616 1.1 mrg 6617 1.1 mrg if (var_type != dest_type) 6618 1.1 mrg { 6619 1.1 mrg /* Do real and imaginary parts separately. 
*/ 6620 1.1 mrg tree real = fold_build1 (REALPART_EXPR, var_type, var); 6621 1.1 mrg real = fold_build1 (code, arg_type, real); 6622 1.1 mrg real = build_call_expr_loc (loc, call, 3, real, bits, kind); 6623 1.1 mrg real = fold_build1 (code, var_type, real); 6624 1.1 mrg 6625 1.1 mrg tree imag = fold_build1 (IMAGPART_EXPR, var_type, var); 6626 1.1 mrg imag = fold_build1 (code, arg_type, imag); 6627 1.1 mrg imag = build_call_expr_loc (loc, call, 3, imag, bits, kind); 6628 1.1 mrg imag = fold_build1 (code, var_type, imag); 6629 1.1 mrg 6630 1.1 mrg expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag); 6631 1.1 mrg } 6632 1.1 mrg else 6633 1.1 mrg { 6634 1.1 mrg expr = fold_build1 (code, arg_type, var); 6635 1.1 mrg expr = build_call_expr_loc (loc, call, 3, expr, bits, kind); 6636 1.1 mrg expr = fold_build1 (code, dest_type, expr); 6637 1.1 mrg } 6638 1.1 mrg 6639 1.1 mrg gimplify_assign (dest_var, expr, seq); 6640 1.1 mrg } 6641 1.1 mrg 6642 1.1 mrg /* Lazily generate the global lock var decl and return its address. */ 6643 1.1 mrg 6644 1.1 mrg static tree 6645 1.1 mrg nvptx_global_lock_addr () 6646 1.1 mrg { 6647 1.1 mrg tree v = global_lock_var; 6648 1.1 mrg 6649 1.1 mrg if (!v) 6650 1.1 mrg { 6651 1.1 mrg tree name = get_identifier ("__reduction_lock"); 6652 1.1 mrg tree type = build_qualified_type (unsigned_type_node, 6653 1.1 mrg TYPE_QUAL_VOLATILE); 6654 1.1 mrg v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type); 6655 1.1 mrg global_lock_var = v; 6656 1.1 mrg DECL_ARTIFICIAL (v) = 1; 6657 1.1 mrg DECL_EXTERNAL (v) = 1; 6658 1.1 mrg TREE_STATIC (v) = 1; 6659 1.1 mrg TREE_PUBLIC (v) = 1; 6660 1.1 mrg TREE_USED (v) = 1; 6661 1.1 mrg mark_addressable (v); 6662 1.1 mrg mark_decl_referenced (v); 6663 1.1 mrg } 6664 1.1 mrg 6665 1.1 mrg return build_fold_addr_expr (v); 6666 1.1 mrg } 6667 1.1 mrg 6668 1.1 mrg /* Insert code to locklessly update *PTR with *PTR OP VAR just before 6669 1.1 mrg GSI. 
We use a lockless scheme for nearly all case, which looks 6670 1.1 mrg like: 6671 1.1 mrg actual = initval(OP); 6672 1.1 mrg do { 6673 1.1 mrg guess = actual; 6674 1.1 mrg write = guess OP myval; 6675 1.1 mrg actual = cmp&swap (ptr, guess, write) 6676 1.1 mrg } while (actual bit-different-to guess); 6677 1.1 mrg return write; 6678 1.1 mrg 6679 1.1 mrg This relies on a cmp&swap instruction, which is available for 32- 6680 1.1 mrg and 64-bit types. Larger types must use a locking scheme. */ 6681 1.1 mrg 6682 1.1 mrg static tree 6683 1.1 mrg nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi, 6684 1.1 mrg tree ptr, tree var, tree_code op) 6685 1.1 mrg { 6686 1.1 mrg unsigned fn = NVPTX_BUILTIN_CMP_SWAP; 6687 1.1 mrg tree_code code = NOP_EXPR; 6688 1.1 mrg tree arg_type = unsigned_type_node; 6689 1.1 mrg tree var_type = TREE_TYPE (var); 6690 1.1 mrg 6691 1.1 mrg if (TREE_CODE (var_type) == COMPLEX_TYPE 6692 1.1 mrg || TREE_CODE (var_type) == REAL_TYPE) 6693 1.1 mrg code = VIEW_CONVERT_EXPR; 6694 1.1 mrg 6695 1.1 mrg if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node)) 6696 1.1 mrg { 6697 1.1 mrg arg_type = long_long_unsigned_type_node; 6698 1.1 mrg fn = NVPTX_BUILTIN_CMP_SWAPLL; 6699 1.1 mrg } 6700 1.1 mrg 6701 1.1 mrg tree swap_fn = nvptx_builtin_decl (fn, true); 6702 1.1 mrg 6703 1.1 mrg gimple_seq init_seq = NULL; 6704 1.1 mrg tree init_var = make_ssa_name (arg_type); 6705 1.1 mrg tree init_expr = omp_reduction_init_op (loc, op, var_type); 6706 1.1 mrg init_expr = fold_build1 (code, arg_type, init_expr); 6707 1.1 mrg gimplify_assign (init_var, init_expr, &init_seq); 6708 1.1 mrg gimple *init_end = gimple_seq_last (init_seq); 6709 1.1 mrg 6710 1.1 mrg gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT); 6711 1.1 mrg 6712 1.1 mrg /* Split the block just after the init stmts. 
*/ 6713 1.1 mrg basic_block pre_bb = gsi_bb (*gsi); 6714 1.1 mrg edge pre_edge = split_block (pre_bb, init_end); 6715 1.1 mrg basic_block loop_bb = pre_edge->dest; 6716 1.1 mrg pre_bb = pre_edge->src; 6717 1.1 mrg /* Reset the iterator. */ 6718 1.1 mrg *gsi = gsi_for_stmt (gsi_stmt (*gsi)); 6719 1.1 mrg 6720 1.1 mrg tree expect_var = make_ssa_name (arg_type); 6721 1.1 mrg tree actual_var = make_ssa_name (arg_type); 6722 1.1 mrg tree write_var = make_ssa_name (arg_type); 6723 1.1 mrg 6724 1.1 mrg /* Build and insert the reduction calculation. */ 6725 1.1 mrg gimple_seq red_seq = NULL; 6726 1.1 mrg tree write_expr = fold_build1 (code, var_type, expect_var); 6727 1.1 mrg write_expr = fold_build2 (op, var_type, write_expr, var); 6728 1.1 mrg write_expr = fold_build1 (code, arg_type, write_expr); 6729 1.1 mrg gimplify_assign (write_var, write_expr, &red_seq); 6730 1.1 mrg 6731 1.1 mrg gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT); 6732 1.1 mrg 6733 1.1 mrg /* Build & insert the cmp&swap sequence. */ 6734 1.1 mrg gimple_seq latch_seq = NULL; 6735 1.1 mrg tree swap_expr = build_call_expr_loc (loc, swap_fn, 3, 6736 1.1 mrg ptr, expect_var, write_var); 6737 1.1 mrg gimplify_assign (actual_var, swap_expr, &latch_seq); 6738 1.1 mrg 6739 1.1 mrg gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var, 6740 1.1 mrg NULL_TREE, NULL_TREE); 6741 1.1 mrg gimple_seq_add_stmt (&latch_seq, cond); 6742 1.1 mrg 6743 1.1 mrg gimple *latch_end = gimple_seq_last (latch_seq); 6744 1.1 mrg gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT); 6745 1.1 mrg 6746 1.1 mrg /* Split the block just after the latch stmts. 
*/ 6747 1.1 mrg edge post_edge = split_block (loop_bb, latch_end); 6748 1.1 mrg basic_block post_bb = post_edge->dest; 6749 1.1 mrg loop_bb = post_edge->src; 6750 1.1 mrg *gsi = gsi_for_stmt (gsi_stmt (*gsi)); 6751 1.1 mrg 6752 1.1 mrg post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU; 6753 1.1 mrg post_edge->probability = profile_probability::even (); 6754 1.1 mrg edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE); 6755 1.1 mrg loop_edge->probability = profile_probability::even (); 6756 1.1 mrg set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb); 6757 1.1 mrg set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb); 6758 1.1 mrg 6759 1.1 mrg gphi *phi = create_phi_node (expect_var, loop_bb); 6760 1.1 mrg add_phi_arg (phi, init_var, pre_edge, loc); 6761 1.1 mrg add_phi_arg (phi, actual_var, loop_edge, loc); 6762 1.1 mrg 6763 1.1 mrg loop *loop = alloc_loop (); 6764 1.1 mrg loop->header = loop_bb; 6765 1.1 mrg loop->latch = loop_bb; 6766 1.1 mrg add_loop (loop, loop_bb->loop_father); 6767 1.1 mrg 6768 1.1 mrg return fold_build1 (code, var_type, write_var); 6769 1.1 mrg } 6770 1.1 mrg 6771 1.1 mrg /* Insert code to lockfully update *PTR with *PTR OP VAR just before 6772 1.1 mrg GSI. This is necessary for types larger than 64 bits, where there 6773 1.1 mrg is no cmp&swap instruction to implement a lockless scheme. We use 6774 1.1 mrg a lock variable in global memory. 6775 1.1 mrg 6776 1.1 mrg while (cmp&swap (&lock_var, 0, 1)) 6777 1.1 mrg continue; 6778 1.1 mrg T accum = *ptr; 6779 1.1 mrg accum = accum OP var; 6780 1.1 mrg *ptr = accum; 6781 1.1 mrg cmp&swap (&lock_var, 1, 0); 6782 1.1 mrg return accum; 6783 1.1 mrg 6784 1.1 mrg A lock in global memory is necessary to force execution engine 6785 1.1 mrg descheduling and avoid resource starvation that can occur if the 6786 1.1 mrg lock is in .shared memory. 
*/ 6787 1.1 mrg 6788 1.1 mrg static tree 6789 1.1 mrg nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi, 6790 1.1 mrg tree ptr, tree var, tree_code op, int level) 6791 1.1 mrg { 6792 1.1 mrg tree var_type = TREE_TYPE (var); 6793 1.1 mrg tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true); 6794 1.1 mrg tree uns_unlocked = build_int_cst (unsigned_type_node, 0); 6795 1.1 mrg tree uns_locked = build_int_cst (unsigned_type_node, 1); 6796 1.1 mrg 6797 1.1 mrg /* Split the block just before the gsi. Insert a gimple nop to make 6798 1.1 mrg this easier. */ 6799 1.1 mrg gimple *nop = gimple_build_nop (); 6800 1.1 mrg gsi_insert_before (gsi, nop, GSI_SAME_STMT); 6801 1.1 mrg basic_block entry_bb = gsi_bb (*gsi); 6802 1.1 mrg edge entry_edge = split_block (entry_bb, nop); 6803 1.1 mrg basic_block lock_bb = entry_edge->dest; 6804 1.1 mrg /* Reset the iterator. */ 6805 1.1 mrg *gsi = gsi_for_stmt (gsi_stmt (*gsi)); 6806 1.1 mrg 6807 1.1 mrg /* Build and insert the locking sequence. */ 6808 1.1 mrg gimple_seq lock_seq = NULL; 6809 1.1 mrg tree lock_var = make_ssa_name (unsigned_type_node); 6810 1.1 mrg tree lock_expr = nvptx_global_lock_addr (); 6811 1.1 mrg lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr, 6812 1.1 mrg uns_unlocked, uns_locked); 6813 1.1 mrg gimplify_assign (lock_var, lock_expr, &lock_seq); 6814 1.1 mrg gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked, 6815 1.1 mrg NULL_TREE, NULL_TREE); 6816 1.1 mrg gimple_seq_add_stmt (&lock_seq, cond); 6817 1.1 mrg gimple *lock_end = gimple_seq_last (lock_seq); 6818 1.1 mrg gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT); 6819 1.1 mrg 6820 1.1 mrg /* Split the block just after the lock sequence. */ 6821 1.1 mrg edge locked_edge = split_block (lock_bb, lock_end); 6822 1.1 mrg basic_block update_bb = locked_edge->dest; 6823 1.1 mrg lock_bb = locked_edge->src; 6824 1.1 mrg *gsi = gsi_for_stmt (gsi_stmt (*gsi)); 6825 1.1 mrg 6826 1.1 mrg /* Create the lock loop ... 
*/ 6827 1.1 mrg locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU; 6828 1.1 mrg locked_edge->probability = profile_probability::even (); 6829 1.1 mrg edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE); 6830 1.1 mrg loop_edge->probability = profile_probability::even (); 6831 1.1 mrg set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb); 6832 1.1 mrg set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb); 6833 1.1 mrg 6834 1.1 mrg /* ... and the loop structure. */ 6835 1.1 mrg loop *lock_loop = alloc_loop (); 6836 1.1 mrg lock_loop->header = lock_bb; 6837 1.1 mrg lock_loop->latch = lock_bb; 6838 1.1 mrg lock_loop->nb_iterations_estimate = 1; 6839 1.1 mrg lock_loop->any_estimate = true; 6840 1.1 mrg add_loop (lock_loop, entry_bb->loop_father); 6841 1.1 mrg 6842 1.1 mrg /* Build the pre-barrier. */ 6843 1.1 mrg gimple_seq red_seq = NULL; 6844 1.1 mrg enum nvptx_builtins barrier_builtin 6845 1.1 mrg = (level == GOMP_DIM_GANG 6846 1.1 mrg ? NVPTX_BUILTIN_MEMBAR_GL 6847 1.1 mrg : NVPTX_BUILTIN_MEMBAR_CTA); 6848 1.1 mrg tree barrier_fn = nvptx_builtin_decl (barrier_builtin, true); 6849 1.1 mrg tree barrier_expr = build_call_expr_loc (loc, barrier_fn, 0); 6850 1.1 mrg gimplify_stmt (&barrier_expr, &red_seq); 6851 1.1 mrg 6852 1.1 mrg /* Build the reduction calculation. */ 6853 1.1 mrg tree acc_in = make_ssa_name (var_type); 6854 1.1 mrg tree ref_in = build_simple_mem_ref (ptr); 6855 1.1 mrg TREE_THIS_VOLATILE (ref_in) = 1; 6856 1.1 mrg gimplify_assign (acc_in, ref_in, &red_seq); 6857 1.1 mrg 6858 1.1 mrg tree acc_out = make_ssa_name (var_type); 6859 1.1 mrg tree update_expr = fold_build2 (op, var_type, ref_in, var); 6860 1.1 mrg gimplify_assign (acc_out, update_expr, &red_seq); 6861 1.1 mrg 6862 1.1 mrg tree ref_out = build_simple_mem_ref (ptr); 6863 1.1 mrg TREE_THIS_VOLATILE (ref_out) = 1; 6864 1.1 mrg gimplify_assign (ref_out, acc_out, &red_seq); 6865 1.1 mrg 6866 1.1 mrg /* Build the post-barrier. 
*/ 6867 1.1 mrg barrier_expr = build_call_expr_loc (loc, barrier_fn, 0); 6868 1.1 mrg gimplify_stmt (&barrier_expr, &red_seq); 6869 1.1 mrg 6870 1.1 mrg /* Insert the reduction calculation. */ 6871 1.1 mrg gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT); 6872 1.1 mrg 6873 1.1 mrg /* Build & insert the unlock sequence. */ 6874 1.1 mrg gimple_seq unlock_seq = NULL; 6875 1.1 mrg tree unlock_expr = nvptx_global_lock_addr (); 6876 1.1 mrg unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr, 6877 1.1 mrg uns_locked, uns_unlocked); 6878 1.1 mrg gimplify_and_add (unlock_expr, &unlock_seq); 6879 1.1 mrg gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT); 6880 1.1 mrg 6881 1.1 mrg return acc_out; 6882 1.1 mrg } 6883 1.1 mrg 6884 1.1 mrg /* Emit a sequence to update a reduction accumlator at *PTR with the 6885 1.1 mrg value held in VAR using operator OP. Return the updated value. 6886 1.1 mrg 6887 1.1 mrg TODO: optimize for atomic ops and indepedent complex ops. */ 6888 1.1 mrg 6889 1.1 mrg static tree 6890 1.1 mrg nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi, 6891 1.1 mrg tree ptr, tree var, tree_code op, int level) 6892 1.1 mrg { 6893 1.1 mrg tree type = TREE_TYPE (var); 6894 1.1 mrg tree size = TYPE_SIZE (type); 6895 1.1 mrg 6896 1.1 mrg if (size == TYPE_SIZE (unsigned_type_node) 6897 1.1 mrg || size == TYPE_SIZE (long_long_unsigned_type_node)) 6898 1.1 mrg return nvptx_lockless_update (loc, gsi, ptr, var, op); 6899 1.1 mrg else 6900 1.1 mrg return nvptx_lockfull_update (loc, gsi, ptr, var, op, level); 6901 1.1 mrg } 6902 1.1 mrg 6903 1.1 mrg /* NVPTX implementation of GOACC_REDUCTION_SETUP. 
*/
/* Expand the GOACC_REDUCTION internal-fn CALL (setup variant) in place.
   Call arguments (as read below): 1 = reference to the receiver object
   (or integer zero if there is none), 2 = the local reduction variable,
   3 = the partitioning level (GOMP_DIM_*), 5 = offset into the shared
   reduction buffer.  OA supplies the offload region's vector_length.  */

static void
nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level != GOMP_DIM_GANG)
    {
      /* Copy the receiver object.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
        var = build_simple_mem_ref (ref_to_res);
    }

  if (level == GOMP_DIM_WORKER
      || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
    {
      /* Store incoming value to worker reduction buffer.  Note this local
         CALL intentionally shadows the parameter: it is the address
         computation for the buffer slot.  */
      tree offset = gimple_call_arg (call, 5);
      tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
                                             level == GOMP_DIM_VECTOR);
      tree ptr = make_ssa_name (TREE_TYPE (call));

      gimplify_assign (ptr, call, &seq);
      /* Volatile store so it is not elided or moved.  */
      tree ref = build_simple_mem_ref (ptr);
      TREE_THIS_VOLATILE (ref) = 1;
      gimplify_assign (ref, var, &seq);
    }

  if (lhs)
    gimplify_assign (lhs, var, &seq);

  pop_gimplify_context (NULL);
  /* Replace the internal-fn call with the expansion just built.  */
  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX implementation of GOACC_REDUCTION_INIT.
*/
/* Expand the GOACC_REDUCTION internal-fn CALL (init variant) in place.
   For a warp-sized vector reduction, only lane 0 keeps the incoming VAR;
   the other lanes are seeded with the operator's identity value, which
   requires splitting the block and inserting a conditional plus a PHI.
   Otherwise a straight-line assignment of the init value suffices.  */

static void
nvptx_goacc_reduction_init (gcall *call, offload_attrs *oa)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  /* Arg 4 is the reduction operator code; it selects the identity
     element via omp_reduction_init_op.  */
  enum tree_code rcode
    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
  tree init = omp_reduction_init_op (gimple_location (call), rcode,
                                     TREE_TYPE (var));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
    {
      /* Initialize vector-non-zeroes to INIT_VAL (OP).  */
      tree tid = make_ssa_name (integer_type_node);
      tree dim_vector = gimple_call_arg (call, 3);
      gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
                                                     dim_vector);
      gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
                                             NULL_TREE, NULL_TREE);

      gimple_call_set_lhs (tid_call, tid);
      gimple_seq_add_stmt (&seq, tid_call);
      gimple_seq_add_stmt (&seq, cond_stmt);

      /* Split the block just after the call.  */
      edge init_edge = split_block (gsi_bb (gsi), call);
      basic_block init_bb = init_edge->dest;
      basic_block call_bb = init_edge->src;

      /* Fixup flags from call_bb to init_bb: the fallthru edge becomes
         the true edge of the condition emitted above.  */
      init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
      init_edge->probability = profile_probability::even ();

      /* Set the initialization stmts.  */
      gimple_seq init_seq = NULL;
      tree init_var = make_ssa_name (TREE_TYPE (var));
      gimplify_assign (init_var, init, &init_seq);
      gsi = gsi_start_bb (init_bb);
      gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);

      /* Split block just after the init stmt.  */
      gsi_prev (&gsi);
      edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
      basic_block dst_bb = inited_edge->dest;

      /* Create false edge from call_bb to dst_bb (lane 0 skips the
         init and keeps VAR).  */
      edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
      nop_edge->probability = profile_probability::even ();

      /* Create phi node in dst block merging INIT_VAR and VAR.  */
      gphi *phi = create_phi_node (lhs, dst_bb);
      add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
      add_phi_arg (phi, var, nop_edge, gimple_location (call));

      /* Reset dominator of dst bb.  */
      set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);

      /* Reset the gsi.  */
      gsi = gsi_for_stmt (call);
    }
  else
    {
      if (level == GOMP_DIM_GANG)
        {
          /* If there's no receiver object, propagate the incoming VAR.  */
          tree ref_to_res = gimple_call_arg (call, 1);
          if (integer_zerop (ref_to_res))
            init = var;
        }

      if (lhs != NULL_TREE)
        gimplify_assign (lhs, init, &seq);
    }

  pop_gimplify_context (NULL);
  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX implementation of GOACC_REDUCTION_FINI.
*/ 7033 1.1 mrg 7034 1.1 mrg static void 7035 1.1 mrg nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa) 7036 1.1 mrg { 7037 1.1 mrg gimple_stmt_iterator gsi = gsi_for_stmt (call); 7038 1.1 mrg tree lhs = gimple_call_lhs (call); 7039 1.1 mrg tree ref_to_res = gimple_call_arg (call, 1); 7040 1.1 mrg tree var = gimple_call_arg (call, 2); 7041 1.1 mrg int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3)); 7042 1.1 mrg enum tree_code op 7043 1.1 mrg = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4)); 7044 1.1 mrg gimple_seq seq = NULL; 7045 1.1 mrg tree r = NULL_TREE;; 7046 1.1 mrg 7047 1.1 mrg push_gimplify_context (true); 7048 1.1 mrg 7049 1.1 mrg if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE) 7050 1.1 mrg { 7051 1.1 mrg /* Emit binary shuffle tree. TODO. Emit this as an actual loop, 7052 1.1 mrg but that requires a method of emitting a unified jump at the 7053 1.1 mrg gimple level. */ 7054 1.1 mrg for (int shfl = PTX_WARP_SIZE / 2; shfl > 0; shfl = shfl >> 1) 7055 1.1 mrg { 7056 1.1 mrg tree other_var = make_ssa_name (TREE_TYPE (var)); 7057 1.1 mrg nvptx_generate_vector_shuffle (gimple_location (call), 7058 1.1 mrg other_var, var, shfl, &seq); 7059 1.1 mrg 7060 1.1 mrg r = make_ssa_name (TREE_TYPE (var)); 7061 1.1 mrg gimplify_assign (r, fold_build2 (op, TREE_TYPE (var), 7062 1.1 mrg var, other_var), &seq); 7063 1.1 mrg var = r; 7064 1.1 mrg } 7065 1.1 mrg } 7066 1.1 mrg else 7067 1.1 mrg { 7068 1.1 mrg tree accum = NULL_TREE; 7069 1.1 mrg 7070 1.1 mrg if (level == GOMP_DIM_WORKER || level == GOMP_DIM_VECTOR) 7071 1.1 mrg { 7072 1.1 mrg /* Get reduction buffer address. 
*/ 7073 1.1 mrg tree offset = gimple_call_arg (call, 5); 7074 1.1 mrg tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset, 7075 1.1 mrg level == GOMP_DIM_VECTOR); 7076 1.1 mrg tree ptr = make_ssa_name (TREE_TYPE (call)); 7077 1.1 mrg 7078 1.1 mrg gimplify_assign (ptr, call, &seq); 7079 1.1 mrg accum = ptr; 7080 1.1 mrg } 7081 1.1 mrg else if (integer_zerop (ref_to_res)) 7082 1.1 mrg r = var; 7083 1.1 mrg else 7084 1.1 mrg accum = ref_to_res; 7085 1.1 mrg 7086 1.1 mrg if (accum) 7087 1.1 mrg { 7088 1.1 mrg /* UPDATE the accumulator. */ 7089 1.1 mrg gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT); 7090 1.1 mrg seq = NULL; 7091 1.1 mrg r = nvptx_reduction_update (gimple_location (call), &gsi, 7092 1.1 mrg accum, var, op, level); 7093 1.1 mrg } 7094 1.1 mrg } 7095 1.1 mrg 7096 1.1 mrg if (lhs) 7097 1.1 mrg gimplify_assign (lhs, r, &seq); 7098 1.1 mrg pop_gimplify_context (NULL); 7099 1.1 mrg 7100 1.1 mrg gsi_replace_with_seq (&gsi, seq, true); 7101 1.1 mrg } 7102 1.1 mrg 7103 1.1 mrg /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */ 7104 1.1 mrg 7105 1.1 mrg static void 7106 1.1 mrg nvptx_goacc_reduction_teardown (gcall *call, offload_attrs *oa) 7107 1.1 mrg { 7108 1.1 mrg gimple_stmt_iterator gsi = gsi_for_stmt (call); 7109 1.1 mrg tree lhs = gimple_call_lhs (call); 7110 1.1 mrg tree var = gimple_call_arg (call, 2); 7111 1.1 mrg int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3)); 7112 1.1 mrg gimple_seq seq = NULL; 7113 1.1 mrg 7114 1.1 mrg push_gimplify_context (true); 7115 1.1 mrg if (level == GOMP_DIM_WORKER 7116 1.1 mrg || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE)) 7117 1.1 mrg { 7118 1.1 mrg /* Read the worker reduction buffer. 
*/ 7119 1.1 mrg tree offset = gimple_call_arg (call, 5); 7120 1.1 mrg tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset, 7121 1.1 mrg level == GOMP_DIM_VECTOR); 7122 1.1 mrg tree ptr = make_ssa_name (TREE_TYPE (call)); 7123 1.1 mrg 7124 1.1 mrg gimplify_assign (ptr, call, &seq); 7125 1.1 mrg var = build_simple_mem_ref (ptr); 7126 1.1 mrg TREE_THIS_VOLATILE (var) = 1; 7127 1.1 mrg } 7128 1.1 mrg 7129 1.1 mrg if (level != GOMP_DIM_GANG) 7130 1.1 mrg { 7131 1.1 mrg /* Write to the receiver object. */ 7132 1.1 mrg tree ref_to_res = gimple_call_arg (call, 1); 7133 1.1 mrg 7134 1.1 mrg if (!integer_zerop (ref_to_res)) 7135 1.1 mrg gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq); 7136 1.1 mrg } 7137 1.1 mrg 7138 1.1 mrg if (lhs) 7139 1.1 mrg gimplify_assign (lhs, var, &seq); 7140 1.1 mrg 7141 1.1 mrg pop_gimplify_context (NULL); 7142 1.1 mrg 7143 1.1 mrg gsi_replace_with_seq (&gsi, seq, true); 7144 1.1 mrg } 7145 1.1 mrg 7146 1.1 mrg /* NVPTX reduction expander. */ 7147 1.1 mrg 7148 1.1 mrg static void 7149 1.1 mrg nvptx_goacc_reduction (gcall *call) 7150 1.1 mrg { 7151 1.1 mrg unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0)); 7152 1.1 mrg offload_attrs oa; 7153 1.1 mrg 7154 1.1 mrg populate_offload_attrs (&oa); 7155 1.1 mrg 7156 1.1 mrg switch (code) 7157 1.1 mrg { 7158 1.1 mrg case IFN_GOACC_REDUCTION_SETUP: 7159 1.1 mrg nvptx_goacc_reduction_setup (call, &oa); 7160 1.1 mrg break; 7161 1.1 mrg 7162 1.1 mrg case IFN_GOACC_REDUCTION_INIT: 7163 1.1 mrg nvptx_goacc_reduction_init (call, &oa); 7164 1.1 mrg break; 7165 1.1 mrg 7166 1.1 mrg case IFN_GOACC_REDUCTION_FINI: 7167 1.1 mrg nvptx_goacc_reduction_fini (call, &oa); 7168 1.1 mrg break; 7169 1.1 mrg 7170 1.1 mrg case IFN_GOACC_REDUCTION_TEARDOWN: 7171 1.1 mrg nvptx_goacc_reduction_teardown (call, &oa); 7172 1.1 mrg break; 7173 1.1 mrg 7174 1.1 mrg default: 7175 1.1 mrg gcc_unreachable (); 7176 1.1 mrg } 7177 1.1 mrg } 7178 1.1 mrg 7179 1.1 mrg static bool 7180 1.1 mrg 
nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, 7181 1.1 mrg rtx x ATTRIBUTE_UNUSED) 7182 1.1 mrg { 7183 1.1 mrg return true; 7184 1.1 mrg } 7185 1.1 mrg 7186 1.1 mrg static bool 7187 1.1 mrg nvptx_scalar_mode_supported_p (scalar_mode mode) 7188 1.1 mrg { 7189 1.1 mrg if (nvptx_experimental && mode == HFmode && TARGET_SM53) 7190 1.1 mrg return true; 7191 1.1 mrg 7192 1.1 mrg return default_scalar_mode_supported_p (mode); 7193 1.1 mrg } 7194 1.1 mrg 7195 1.1 mrg static bool 7196 1.1 mrg nvptx_libgcc_floating_mode_supported_p (scalar_float_mode mode) 7197 1.1 mrg { 7198 1.1 mrg if (nvptx_experimental && mode == HFmode && TARGET_SM53) 7199 1.1 mrg return true; 7200 1.1 mrg 7201 1.1 mrg return default_libgcc_floating_mode_supported_p (mode); 7202 1.1 mrg } 7203 1.1 mrg 7204 1.1 mrg static bool 7205 1.1 mrg nvptx_vector_mode_supported (machine_mode mode) 7206 1.1 mrg { 7207 1.1 mrg return (mode == V2SImode 7208 1.1 mrg || mode == V2DImode); 7209 1.1 mrg } 7210 1.1 mrg 7211 1.1 mrg /* Return the preferred mode for vectorizing scalar MODE. 
*/ 7212 1.1 mrg 7213 1.1 mrg static machine_mode 7214 1.1 mrg nvptx_preferred_simd_mode (scalar_mode mode) 7215 1.1 mrg { 7216 1.1 mrg switch (mode) 7217 1.1 mrg { 7218 1.1 mrg case E_DImode: 7219 1.1 mrg return V2DImode; 7220 1.1 mrg case E_SImode: 7221 1.1 mrg return V2SImode; 7222 1.1 mrg 7223 1.1 mrg default: 7224 1.1 mrg return default_preferred_simd_mode (mode); 7225 1.1 mrg } 7226 1.1 mrg } 7227 1.1 mrg 7228 1.1 mrg unsigned int 7229 1.1 mrg nvptx_data_alignment (const_tree type, unsigned int basic_align) 7230 1.1 mrg { 7231 1.1 mrg if (TREE_CODE (type) == INTEGER_TYPE) 7232 1.1 mrg { 7233 1.1 mrg unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type)); 7234 1.1 mrg if (size == GET_MODE_SIZE (TImode)) 7235 1.1 mrg return GET_MODE_BITSIZE (maybe_split_mode (TImode)); 7236 1.1 mrg } 7237 1.1 mrg 7238 1.1 mrg return basic_align; 7239 1.1 mrg } 7240 1.1 mrg 7241 1.1 mrg /* Implement TARGET_MODES_TIEABLE_P. */ 7242 1.1 mrg 7243 1.1 mrg static bool 7244 1.1 mrg nvptx_modes_tieable_p (machine_mode, machine_mode) 7245 1.1 mrg { 7246 1.1 mrg return false; 7247 1.1 mrg } 7248 1.1 mrg 7249 1.1 mrg /* Implement TARGET_HARD_REGNO_NREGS. */ 7250 1.1 mrg 7251 1.1 mrg static unsigned int 7252 1.1 mrg nvptx_hard_regno_nregs (unsigned int, machine_mode) 7253 1.1 mrg { 7254 1.1 mrg return 1; 7255 1.1 mrg } 7256 1.1 mrg 7257 1.1 mrg /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */ 7258 1.1 mrg 7259 1.1 mrg static bool 7260 1.1 mrg nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t) 7261 1.1 mrg { 7262 1.1 mrg return false; 7263 1.1 mrg } 7264 1.1 mrg 7265 1.1 mrg /* Implement TARGET_TRULY_NOOP_TRUNCATION. */ 7266 1.1 mrg 7267 1.1 mrg static bool 7268 1.1 mrg nvptx_truly_noop_truncation (poly_uint64, poly_uint64) 7269 1.1 mrg { 7270 1.1 mrg return false; 7271 1.1 mrg } 7272 1.1 mrg 7273 1.1 mrg /* Implement TARGET_GOACC_ADJUST_PRIVATE_DECL. 
*/
/* DECL is a variable being privatized at partitioning LEVEL (GOMP_DIM_*).
   Gang-private declarations are tagged with the "oacc gang-private"
   attribute, consumed later by nvptx_goacc_expand_var_decl; LOC is stashed
   as the attribute value for diagnostics.  Returns DECL unchanged.  */

static tree
nvptx_goacc_adjust_private_decl (location_t loc, tree decl, int level)
{
  gcc_checking_assert (!lookup_attribute ("oacc gang-private",
                                          DECL_ATTRIBUTES (decl)));

  /* Set "oacc gang-private" attribute for gang-private variable
     declarations.  */
  if (level == GOMP_DIM_GANG)
    {
      tree id = get_identifier ("oacc gang-private");
      /* For later diagnostic purposes, pass LOC as VALUE (wrapped as a
         TREE).  */
      tree loc_tree = build_empty_stmt (loc);
      DECL_ATTRIBUTES (decl)
        = tree_cons (id, loc_tree, DECL_ATTRIBUTES (decl));
    }

  return decl;
}

/* Implement TARGET_GOACC_EXPAND_VAR_DECL.  For variables tagged
   "oacc gang-private", lazily carve out a slot in the shared-memory
   block anchored at gang_private_shared_sym (offsets cached in
   gang_private_shared_hmap) and return a MEM rtx addressing it.
   Returns NULL_RTX for everything else.  */

static rtx
nvptx_goacc_expand_var_decl (tree var)
{
  /* Place "oacc gang-private" variables in shared memory.  */
  if (tree attr = lookup_attribute ("oacc gang-private",
                                    DECL_ATTRIBUTES (var)))
    {
      gcc_checking_assert (VAR_P (var));

      unsigned int offset, *poffset;
      poffset = gang_private_shared_hmap.get (var);
      if (poffset)
        offset = *poffset;
      else
        {
          /* First use of VAR: round the running size up to VAR's
             alignment, record the offset, and grow the block.  */
          unsigned HOST_WIDE_INT align = DECL_ALIGN (var);
          gang_private_shared_size
            = (gang_private_shared_size + align - 1) & ~(align - 1);
          if (gang_private_shared_align < align)
            gang_private_shared_align = align;

          offset = gang_private_shared_size;
          bool existed = gang_private_shared_hmap.put (var, offset);
          gcc_checking_assert (!existed);
          gang_private_shared_size += tree_to_uhwi (DECL_SIZE_UNIT (var));

          /* LOC was stashed by nvptx_goacc_adjust_private_decl.  */
          location_t loc = EXPR_LOCATION (TREE_VALUE (attr));
#if 0 /* For some reason, this doesn't work.  */
          if (dump_enabled_p ())
            {
              dump_flags_t l_dump_flags
                = get_openacc_privatization_dump_flags ();

              const dump_user_location_t d_u_loc
                = dump_user_location_t::from_location_t (loc);
              /* PR100695 "Format decoder, quoting in 'dump_printf' etc."  */
#if __GNUC__ >= 10
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wformat"
#endif
              dump_printf_loc (l_dump_flags, d_u_loc,
                               "variable %<%T%> adjusted for OpenACC"
                               " privatization level: %qs\n",
                               var, "gang");
#if __GNUC__ >= 10
# pragma GCC diagnostic pop
#endif
            }
#else /* ..., thus emulate that, good enough for testsuite usage.  */
          if (param_openacc_privatization != OPENACC_PRIVATIZATION_QUIET)
            inform (loc,
                    "variable %qD adjusted for OpenACC privatization level:"
                    " %qs",
                    var, "gang");
          if (dump_file && (dump_flags & TDF_DETAILS))
            {
              /* 'dumpfile.cc:dump_loc' */
              fprintf (dump_file, "%s:%d:%d: ", LOCATION_FILE (loc),
                       LOCATION_LINE (loc), LOCATION_COLUMN (loc));
              fprintf (dump_file, "%s: ", "note");

              fprintf (dump_file,
                       "variable '");
              print_generic_expr (dump_file, var, TDF_SLIM);
              fprintf (dump_file,
                       "' adjusted for OpenACC privatization level: '%s'\n",
                       "gang");
            }
#endif
        }
      rtx addr = plus_constant (Pmode, gang_private_shared_sym, offset);
      return gen_rtx_MEM (TYPE_MODE (TREE_TYPE (var)), addr);
    }

  return NULL_RTX;
}

/* Last function seen by nvptx_set_current_function, to avoid redundant
   per-function resets.  */
static GTY(()) tree nvptx_previous_fndecl;

/* Implement TARGET_SET_CURRENT_FUNCTION.  Reset the per-function
   gang-private offset map and partition state when switching to a new
   FNDECL.  */

static void
nvptx_set_current_function (tree fndecl)
{
  if (!fndecl || fndecl == nvptx_previous_fndecl)
    return;

  gang_private_shared_hmap.empty ();
  nvptx_previous_fndecl = fndecl;
  vector_red_partition = 0;
  oacc_bcast_partition = 0;
}

/* Implement TARGET_LIBC_HAS_FUNCTION.  Like the default, but sincos is
   only advertised for float and double.  */

bool
nvptx_libc_has_function (enum function_class fn_class, tree type)
{
  if (fn_class == function_sincos)
    {
      if (type != NULL_TREE)
        /* Currently, newlib does not support sincosl.  */
        return type == float_type_node || type == double_type_node;
      else
        return true;
    }

  return default_libc_has_function (fn_class, type);
}

/* Return true if MEM is known to address the local frame (and therefore
   per-thread local state).  Under -msoft-stack a frame-based address may
   alias other memory, so it does not count.  */

bool
nvptx_mem_local_p (rtx mem)
{
  gcc_assert (GET_CODE (mem) == MEM);

  struct address_info info;
  decompose_mem_address (&info, mem);

  if (info.base != NULL && REG_P (*info.base)
      && REGNO_PTR_FRAME_P (REGNO (*info.base)))
    {
      if (TARGET_SOFT_STACK)
        {
          /* Frame-related doesn't mean local.  */
        }
      else
        return true;
    }

  return false;
}

/* Define locally, for use in NVPTX_ASM_OUTPUT_DEF.  */
#define SET_ASM_OP ".alias "

/* Define locally, for use in nvptx_asm_output_def_from_decls.  Add NVPTX_
   prefix to avoid clash with ASM_OUTPUT_DEF from nvptx.h.
   Copy of ASM_OUTPUT_DEF from defaults.h, with added terminating
   semicolon.  */
#define NVPTX_ASM_OUTPUT_DEF(FILE,LABEL1,LABEL2) \
  do \
    { \
      fprintf ((FILE), "%s", SET_ASM_OP); \
      assemble_name (FILE, LABEL1); \
      fprintf (FILE, ","); \
      assemble_name (FILE, LABEL2); \
      fprintf (FILE, ";\n"); \
    } \
  while (0)

/* Emit a PTX ".alias" directive aliasing NAME to VALUE on STREAM,
   diagnosing the configurations PTX cannot support (no alias support
   before PTX ISA 6.3 or with -mno-alias, no weak aliases, no
   non-function aliases).  */

void
nvptx_asm_output_def_from_decls (FILE *stream, tree name, tree value)
{
  if (nvptx_alias == 0 || !TARGET_PTX_6_3)
    {
      /* Copied from assemble_alias.  */
      error_at (DECL_SOURCE_LOCATION (name),
                "alias definitions not supported in this configuration");
      TREE_ASM_WRITTEN (name) = 1;
      return;
    }

  if (lookup_attribute ("weak", DECL_ATTRIBUTES (name)))
    {
      /* Prevent execution FAILs for gcc.dg/globalalias.c and
         gcc.dg/pr77587.c.  */
      error_at (DECL_SOURCE_LOCATION (name),
                "weak alias definitions not supported in this configuration");
      TREE_ASM_WRITTEN (name) = 1;
      return;
    }

  /* Ptx also doesn't support value having weak linkage, but we can't detect
     that here, so we'll end up with:
     "error: Function test with .weak scope cannot be aliased".
     See gcc.dg/localalias.c.  */

  if (TREE_CODE (name) != FUNCTION_DECL)
    {
      error_at (DECL_SOURCE_LOCATION (name),
                "non-function alias definitions not supported"
                " in this configuration");
      TREE_ASM_WRITTEN (name) = 1;
      return;
    }

  if (!cgraph_node::get (name)->referred_to_p ())
    /* Prevent "Internal error: reference to deleted section".  */
    return;

  /* Emit a declaration-style prototype for the alias target first.  */
  std::stringstream s;
  write_fn_proto (s, false, get_fnname_from_decl (name), name);
  fputs (s.str ().c_str (), stream);

  tree id = DECL_ASSEMBLER_NAME (name);
  NVPTX_ASM_OUTPUT_DEF (stream, IDENTIFIER_POINTER (id),
                        IDENTIFIER_POINTER (value));
}

#undef NVPTX_ASM_OUTPUT_DEF
#undef SET_ASM_OP

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE nvptx_option_override

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table

#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_false

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode

/* Calling-convention hooks.  */
#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG nvptx_function_arg
#undef TARGET_FUNCTION_INCOMING_ARG
#define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE nvptx_function_value
#undef TARGET_LIBCALL_VALUE
#define
TARGET_LIBCALL_VALUE nvptx_libcall_value
#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
#undef TARGET_SPLIT_COMPLEX_ARG
#define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
#undef TARGET_CALL_ARGS
#define TARGET_CALL_ARGS nvptx_call_args
#undef TARGET_END_CALL_ARGS
#define TARGET_END_CALL_ARGS nvptx_end_call_args

/* Assembly-output hooks.  */
#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START nvptx_file_start
#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END nvptx_file_end
#undef TARGET_ASM_GLOBALIZE_LABEL
#define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
#undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
#define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND nvptx_print_operand
#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
#define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
#undef TARGET_ASM_INTEGER
#define TARGET_ASM_INTEGER nvptx_assemble_integer
#undef TARGET_ASM_DECL_END
#define TARGET_ASM_DECL_END nvptx_assemble_decl_end
#undef TARGET_ASM_DECLARE_CONSTANT_NAME
#define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
#undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
#define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
#undef TARGET_NO_REGISTER_ALLOCATION
#define TARGET_NO_REGISTER_ALLOCATION true

#undef TARGET_ENCODE_SECTION_INFO
#define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
#undef TARGET_RECORD_OFFLOAD_SYMBOL
#define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment

#undef TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p

#undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
#define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p

/* Builtin hooks.  */
#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS nvptx_init_builtins
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL nvptx_builtin_decl

#undef TARGET_SIMT_VF
#define TARGET_SIMT_VF nvptx_simt_vf

#undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
#define TARGET_OMP_DEVICE_KIND_ARCH_ISA nvptx_omp_device_kind_arch_isa

/* OpenACC hooks.  */
#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims

#undef TARGET_GOACC_DIM_LIMIT
#define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit

#undef TARGET_GOACC_FORK_JOIN
#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join

#undef TARGET_GOACC_REDUCTION
#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P nvptx_scalar_mode_supported_p

#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
  nvptx_libgcc_floating_mode_supported_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
  nvptx_preferred_simd_mode

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class

#undef TARGET_TRULY_NOOP_TRUNCATION
#define TARGET_TRULY_NOOP_TRUNCATION nvptx_truly_noop_truncation

#undef TARGET_HAVE_SPECULATION_SAFE_VALUE
#define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed

#undef TARGET_GOACC_ADJUST_PRIVATE_DECL
#define TARGET_GOACC_ADJUST_PRIVATE_DECL nvptx_goacc_adjust_private_decl

#undef TARGET_GOACC_EXPAND_VAR_DECL
#define TARGET_GOACC_EXPAND_VAR_DECL nvptx_goacc_expand_var_decl

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function

#undef TARGET_LIBC_HAS_FUNCTION
#define TARGET_LIBC_HAS_FUNCTION nvptx_libc_has_function

/* Instantiate the target hook vector with the overrides above.  */
struct gcc_target targetm = TARGET_INITIALIZER;

/* Garbage-collector roots for this file (e.g. nvptx_previous_fndecl).  */
#include "gt-nvptx.h"