/* Target code for NVPTX.
2 1.1 mrg Copyright (C) 2014-2022 Free Software Foundation, Inc.
3 1.1 mrg Contributed by Bernd Schmidt <bernds (at) codesourcery.com>
4 1.1 mrg
5 1.1 mrg This file is part of GCC.
6 1.1 mrg
7 1.1 mrg GCC is free software; you can redistribute it and/or modify it
8 1.1 mrg under the terms of the GNU General Public License as published
9 1.1 mrg by the Free Software Foundation; either version 3, or (at your
10 1.1 mrg option) any later version.
11 1.1 mrg
12 1.1 mrg GCC is distributed in the hope that it will be useful, but WITHOUT
13 1.1 mrg ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 1.1 mrg or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 1.1 mrg License for more details.
16 1.1 mrg
17 1.1 mrg You should have received a copy of the GNU General Public License
18 1.1 mrg along with GCC; see the file COPYING3. If not see
19 1.1 mrg <http://www.gnu.org/licenses/>. */
20 1.1 mrg
21 1.1 mrg #define IN_TARGET_CODE 1
22 1.1 mrg
23 1.1 mrg #include "config.h"
24 1.1 mrg #include <sstream>
25 1.1 mrg #include "system.h"
26 1.1 mrg #include "coretypes.h"
27 1.1 mrg #include "backend.h"
28 1.1 mrg #include "target.h"
29 1.1 mrg #include "rtl.h"
30 1.1 mrg #include "tree.h"
31 1.1 mrg #include "cfghooks.h"
32 1.1 mrg #include "df.h"
33 1.1 mrg #include "memmodel.h"
34 1.1 mrg #include "tm_p.h"
35 1.1 mrg #include "expmed.h"
36 1.1 mrg #include "optabs.h"
37 1.1 mrg #include "regs.h"
38 1.1 mrg #include "emit-rtl.h"
39 1.1 mrg #include "recog.h"
40 1.1 mrg #include "diagnostic.h"
41 1.1 mrg #include "alias.h"
42 1.1 mrg #include "insn-flags.h"
43 1.1 mrg #include "output.h"
44 1.1 mrg #include "insn-attr.h"
45 1.1 mrg #include "flags.h"
46 1.1 mrg #include "dojump.h"
47 1.1 mrg #include "explow.h"
48 1.1 mrg #include "calls.h"
49 1.1 mrg #include "varasm.h"
50 1.1 mrg #include "stmt.h"
51 1.1 mrg #include "expr.h"
52 1.1 mrg #include "tm-preds.h"
53 1.1 mrg #include "tm-constrs.h"
54 1.1 mrg #include "langhooks.h"
55 1.1 mrg #include "dbxout.h"
56 1.1 mrg #include "cfgrtl.h"
57 1.1 mrg #include "gimple.h"
58 1.1 mrg #include "stor-layout.h"
59 1.1 mrg #include "builtins.h"
60 1.1 mrg #include "omp-general.h"
61 1.1 mrg #include "omp-low.h"
62 1.1 mrg #include "omp-offload.h"
63 1.1 mrg #include "gomp-constants.h"
64 1.1 mrg #include "dumpfile.h"
65 1.1 mrg #include "internal-fn.h"
66 1.1 mrg #include "gimple-iterator.h"
67 1.1 mrg #include "stringpool.h"
68 1.1 mrg #include "attribs.h"
69 1.1 mrg #include "tree-vrp.h"
70 1.1 mrg #include "tree-ssa-operands.h"
71 1.1 mrg #include "tree-ssanames.h"
72 1.1 mrg #include "gimplify.h"
73 1.1 mrg #include "tree-phinodes.h"
74 1.1 mrg #include "cfgloop.h"
75 1.1 mrg #include "fold-const.h"
76 1.1 mrg #include "intl.h"
77 1.1 mrg #include "opts.h"
78 1.1 mrg #include "tree-pretty-print.h"
79 1.1 mrg #include "rtl-iter.h"
80 1.1 mrg #include "cgraph.h"
81 1.1 mrg
82 1.1 mrg /* This file should be included last. */
83 1.1 mrg #include "target-def.h"
84 1.1 mrg
/* Enable workarounds for issues in the PTX JIT compiler; each macro
   guards corresponding code later in this file.  */
#define WORKAROUND_PTXJIT_BUG 1
#define WORKAROUND_PTXJIT_BUG_2 1
#define WORKAROUND_PTXJIT_BUG_3 1

/* The PTX concept CTA (Concurrent Thread Array) maps on the CUDA concept thread
   block, which has had a maximum number of threads of 1024 since CUDA version
   2.x.  */
#define PTX_CTA_SIZE 1024

/* A CTA provides 16 named barriers; a warp is 32 threads.  */
#define PTX_CTA_NUM_BARRIERS 16
#define PTX_WARP_SIZE 32

/* Barrier allocation: barrier 0 is the single per-CTA barrier; the
   remaining PTX_NUM_PER_WORKER_BARRIERS are handed out per worker.  */
#define PTX_PER_CTA_BARRIER 0
#define PTX_NUM_PER_CTA_BARRIERS 1
#define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS)
#define PTX_NUM_PER_WORKER_BARRIERS (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS)

/* OpenACC partitioning defaults and limits.  */
#define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
#define PTX_MAX_VECTOR_LENGTH PTX_CTA_SIZE
#define PTX_WORKER_LENGTH 32
#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime.  */

/* The various PTX memory areas an object might reside in.  */
enum nvptx_data_area
{
  DATA_AREA_GENERIC,
  DATA_AREA_GLOBAL,
  DATA_AREA_SHARED,
  DATA_AREA_LOCAL,
  DATA_AREA_CONST,
  DATA_AREA_PARAM,
  DATA_AREA_MAX
};

/* We record the data area in the target symbol flags, in the 3 bits
   above SYMBOL_FLAG_MACH_DEP_SHIFT (mask 7 covers DATA_AREA_MAX).  */
#define SYMBOL_DATA_AREA(SYM) \
  (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
		    & 7)
#define SET_SYMBOL_DATA_AREA(SYM,AREA) \
  (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
125 1.1 mrg
/* Record the function decls we've written, and the libfuncs and function
   decls corresponding to them.  PTX requires a declaration of every
   function referenced before its use, so declarations are accumulated
   here as text and emitted later.  */
static std::stringstream func_decls;
129 1.1 mrg
130 1.1 mrg struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
131 1.1 mrg {
132 1.1 mrg static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
133 1.1 mrg static bool equal (rtx a, rtx b) { return a == b; }
134 1.1 mrg };
135 1.1 mrg
136 1.1 mrg static GTY((cache))
137 1.1 mrg hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;
138 1.1 mrg
139 1.1 mrg struct tree_hasher : ggc_cache_ptr_hash<tree_node>
140 1.1 mrg {
141 1.1 mrg static hashval_t hash (tree t) { return htab_hash_pointer (t); }
142 1.1 mrg static bool equal (tree a, tree b) { return a == b; }
143 1.1 mrg };
144 1.1 mrg
145 1.1 mrg static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
146 1.1 mrg static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
147 1.1 mrg
/* Buffer needed to broadcast across workers and vectors.  This is
   used for both worker-neutering and worker broadcasting, and
   vector-neutering and broadcasting when vector_length > 32.  It is
   shared by all functions emitted.  The buffer is placed in shared
   memory.  It'd be nice if PTX supported common blocks, because then
   this could be shared across TUs (taking the largest size).  */
static unsigned oacc_bcast_size;
static unsigned oacc_bcast_partition;
static unsigned oacc_bcast_align;
/* GTY(()) makes the rtx a GC root so it survives garbage collection.  */
static GTY(()) rtx oacc_bcast_sym;

/* Buffer needed for worker reductions.  This has to be distinct from
   the worker broadcast array, as both may be live concurrently.  */
static unsigned worker_red_size;
static unsigned worker_red_align;
static GTY(()) rtx worker_red_sym;

/* Buffer needed for vector reductions, when vector_length >
   PTX_WARP_SIZE.  This has to be distinct from the worker broadcast
   array, as both may be live concurrently.  */
static unsigned vector_red_size;
static unsigned vector_red_align;
static unsigned vector_red_partition;
static GTY(()) rtx vector_red_sym;

/* Shared memory block for gang-private variables.  */
static unsigned gang_private_shared_size;
static unsigned gang_private_shared_align;
static GTY(()) rtx gang_private_shared_sym;
/* NOTE(review): presumably maps a gang-private DECL to its offset in
   the shared block — confirm against uses later in the file.  */
static hash_map<tree_decl_hash, unsigned int> gang_private_shared_hmap;

/* Global lock variable, needed for 128bit worker & gang reductions.  */
static GTY(()) tree global_lock_var;

/* True if any function references __nvptx_stacks.  */
static bool need_softstack_decl;

/* True if any function references __nvptx_uni.  */
static bool need_unisimt_decl;

/* Forward declaration; defined later in this file.  */
static int nvptx_mach_max_workers ();
189 1.1 mrg
190 1.1 mrg /* Allocate a new, cleared machine_function structure. */
191 1.1 mrg
192 1.1 mrg static struct machine_function *
193 1.1 mrg nvptx_init_machine_status (void)
194 1.1 mrg {
195 1.1 mrg struct machine_function *p = ggc_cleared_alloc<machine_function> ();
196 1.1 mrg p->return_mode = VOIDmode;
197 1.1 mrg return p;
198 1.1 mrg }
199 1.1 mrg
200 1.1 mrg /* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
201 1.1 mrg and -fopenacc is also enabled. */
202 1.1 mrg
203 1.1 mrg static void
204 1.1 mrg diagnose_openacc_conflict (bool optval, const char *optname)
205 1.1 mrg {
206 1.1 mrg if (flag_openacc && optval)
207 1.1 mrg error ("option %s is not supported together with %<-fopenacc%>", optname);
208 1.1 mrg }
209 1.1 mrg
210 1.1 mrg static enum ptx_version
211 1.1 mrg first_ptx_version_supporting_sm (enum ptx_isa sm)
212 1.1 mrg {
213 1.1 mrg switch (sm)
214 1.1 mrg {
215 1.1 mrg case PTX_ISA_SM30:
216 1.1 mrg return PTX_VERSION_3_0;
217 1.1 mrg case PTX_ISA_SM35:
218 1.1 mrg return PTX_VERSION_3_1;
219 1.1 mrg case PTX_ISA_SM53:
220 1.1 mrg return PTX_VERSION_4_2;
221 1.1 mrg case PTX_ISA_SM70:
222 1.1 mrg return PTX_VERSION_6_0;
223 1.1 mrg case PTX_ISA_SM75:
224 1.1 mrg return PTX_VERSION_6_3;
225 1.1 mrg case PTX_ISA_SM80:
226 1.1 mrg return PTX_VERSION_7_0;
227 1.1 mrg default:
228 1.1 mrg gcc_unreachable ();
229 1.1 mrg }
230 1.1 mrg }
231 1.1 mrg
232 1.1 mrg static enum ptx_version
233 1.1 mrg default_ptx_version_option (void)
234 1.1 mrg {
235 1.1 mrg enum ptx_version first
236 1.1 mrg = first_ptx_version_supporting_sm ((enum ptx_isa) ptx_isa_option);
237 1.1 mrg
238 1.1 mrg /* Pick a version that supports the sm. */
239 1.1 mrg enum ptx_version res = first;
240 1.1 mrg
241 1.1 mrg /* Pick at least 3.1. This has been the smallest version historically. */
242 1.1 mrg res = MAX (res, PTX_VERSION_3_1);
243 1.1 mrg
244 1.1 mrg /* Pick at least 6.0, to enable using bar.warp.sync to have a way to force
245 1.1 mrg warp convergence. */
246 1.1 mrg res = MAX (res, PTX_VERSION_6_0);
247 1.1 mrg
248 1.1 mrg /* Verify that we pick a version that supports the sm. */
249 1.1 mrg gcc_assert (first <= res);
250 1.1 mrg return res;
251 1.1 mrg }
252 1.1 mrg
253 1.1 mrg static const char *
254 1.1 mrg ptx_version_to_string (enum ptx_version v)
255 1.1 mrg {
256 1.1 mrg switch (v)
257 1.1 mrg {
258 1.1 mrg case PTX_VERSION_3_0:
259 1.1 mrg return "3.0";
260 1.1 mrg case PTX_VERSION_3_1:
261 1.1 mrg return "3.1";
262 1.1 mrg case PTX_VERSION_4_2:
263 1.1 mrg return "4.2";
264 1.1 mrg case PTX_VERSION_6_0:
265 1.1 mrg return "6.0";
266 1.1 mrg case PTX_VERSION_6_3:
267 1.1 mrg return "6.3";
268 1.1 mrg case PTX_VERSION_7_0:
269 1.1 mrg return "7.0";
270 1.1 mrg default:
271 1.1 mrg gcc_unreachable ();
272 1.1 mrg }
273 1.1 mrg }
274 1.1 mrg
275 1.1 mrg unsigned int
276 1.1 mrg ptx_version_to_number (enum ptx_version v, bool major_p)
277 1.1 mrg {
278 1.1 mrg switch (v)
279 1.1 mrg {
280 1.1 mrg case PTX_VERSION_3_0:
281 1.1 mrg return major_p ? 3 : 0;
282 1.1 mrg case PTX_VERSION_3_1:
283 1.1 mrg return major_p ? 3 : 1;
284 1.1 mrg case PTX_VERSION_4_2:
285 1.1 mrg return major_p ? 4 : 2;
286 1.1 mrg case PTX_VERSION_6_0:
287 1.1 mrg return major_p ? 6 : 0;
288 1.1 mrg case PTX_VERSION_6_3:
289 1.1 mrg return major_p ? 6 : 3;
290 1.1 mrg case PTX_VERSION_7_0:
291 1.1 mrg return major_p ? 7 : 0;
292 1.1 mrg default:
293 1.1 mrg gcc_unreachable ();
294 1.1 mrg }
295 1.1 mrg }
296 1.1 mrg
297 1.1 mrg static const char *
298 1.1 mrg sm_version_to_string (enum ptx_isa sm)
299 1.1 mrg {
300 1.1 mrg switch (sm)
301 1.1 mrg {
302 1.1 mrg #define NVPTX_SM(XX, SEP) \
303 1.1 mrg case PTX_ISA_SM ## XX: \
304 1.1 mrg return #XX;
305 1.1 mrg #include "nvptx-sm.def"
306 1.1 mrg #undef NVPTX_SM
307 1.1 mrg default:
308 1.1 mrg gcc_unreachable ();
309 1.1 mrg }
310 1.1 mrg }
311 1.1 mrg
312 1.1 mrg static void
313 1.1 mrg handle_ptx_version_option (void)
314 1.1 mrg {
315 1.1 mrg if (!OPTION_SET_P (ptx_version_option)
316 1.1 mrg || ptx_version_option == PTX_VERSION_default)
317 1.1 mrg {
318 1.1 mrg ptx_version_option = default_ptx_version_option ();
319 1.1 mrg return;
320 1.1 mrg }
321 1.1 mrg
322 1.1 mrg enum ptx_version first
323 1.1 mrg = first_ptx_version_supporting_sm ((enum ptx_isa) ptx_isa_option);
324 1.1 mrg
325 1.1 mrg if (ptx_version_option < first)
326 1.1 mrg error ("PTX version (%<-mptx%>) needs to be at least %s to support selected"
327 1.1 mrg " %<-misa%> (sm_%s)", ptx_version_to_string (first),
328 1.1 mrg sm_version_to_string ((enum ptx_isa)ptx_isa_option));
329 1.1 mrg }
330 1.1 mrg
/* Implement TARGET_OPTION_OVERRIDE.  Finalize option settings and set
   up the per-TU state (hash tables, shared-memory symbols) used by
   the rest of this file.  */

static void
nvptx_option_override (void)
{
  init_machine_status = nvptx_init_machine_status;

  /* Resolve -mptx against -misa (errors on an incompatible pair).  */
  handle_ptx_version_option ();

  /* Set toplevel_reorder, unless explicitly disabled.  We need
     reordering so that we emit necessary assembler decls of
     undeclared variables. */
  if (!OPTION_SET_P (flag_toplevel_reorder))
    flag_toplevel_reorder = 1;

  /* No debug nonbind markers on this target.  */
  debug_nonbind_markers_p = 0;

  /* Set flag_no_common, unless explicitly disabled.  We fake common
     using .weak, and that's not entirely accurate, so avoid it
     unless forced.  */
  if (!OPTION_SET_P (flag_no_common))
    flag_no_common = 1;

  /* The patch area requires nops, which we don't have.  */
  HOST_WIDE_INT patch_area_size, patch_area_entry;
  parse_and_check_patch_area (flag_patchable_function_entry, false,
			      &patch_area_size, &patch_area_entry);
  if (patch_area_size > 0)
    sorry ("not generating patch area, nops not supported");

  /* Assumes that it will see only hard registers.  */
  flag_var_tracking = 0;

  /* -moptimize defaults to following the host-side -O setting.  */
  if (nvptx_optimize < 0)
    nvptx_optimize = optimize > 0;

  declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  declared_libfuncs_htab
    = hash_table<declared_libfunc_hasher>::create_ggc (17);

  /* Create the symbols for the shared-memory buffers declared above,
     all placed in the .shared data area with word alignment.  */
  oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
  SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED);
  oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
  oacc_bcast_partition = 0;

  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
  SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  vector_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__vector_red");
  SET_SYMBOL_DATA_AREA (vector_red_sym, DATA_AREA_SHARED);
  vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
  vector_red_partition = 0;

  gang_private_shared_sym = gen_rtx_SYMBOL_REF (Pmode, "__gang_private_shared");
  SET_SYMBOL_DATA_AREA (gang_private_shared_sym, DATA_AREA_SHARED);
  gang_private_shared_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  /* These options cannot be combined with -fopenacc.  */
  diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
  diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
  diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");

  /* -mgomp implies soft stacks and uniform SIMT.  */
  if (TARGET_GOMP)
    target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
}
397 1.1 mrg
398 1.1 mrg /* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
399 1.1 mrg deal with ptx ideosyncracies. */
400 1.1 mrg
401 1.1 mrg const char *
402 1.1 mrg nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
403 1.1 mrg {
404 1.1 mrg switch (mode)
405 1.1 mrg {
406 1.1 mrg case E_BLKmode:
407 1.1 mrg return ".b8";
408 1.1 mrg case E_BImode:
409 1.1 mrg return ".pred";
410 1.1 mrg case E_QImode:
411 1.1 mrg if (promote)
412 1.1 mrg return ".u32";
413 1.1 mrg else
414 1.1 mrg return ".u8";
415 1.1 mrg case E_HImode:
416 1.1 mrg return ".u16";
417 1.1 mrg case E_SImode:
418 1.1 mrg return ".u32";
419 1.1 mrg case E_DImode:
420 1.1 mrg return ".u64";
421 1.1 mrg
422 1.1 mrg case E_HFmode:
423 1.1 mrg return ".f16";
424 1.1 mrg case E_SFmode:
425 1.1 mrg return ".f32";
426 1.1 mrg case E_DFmode:
427 1.1 mrg return ".f64";
428 1.1 mrg
429 1.1 mrg case E_V2SImode:
430 1.1 mrg return ".v2.u32";
431 1.1 mrg case E_V2DImode:
432 1.1 mrg return ".v2.u64";
433 1.1 mrg
434 1.1 mrg default:
435 1.1 mrg gcc_unreachable ();
436 1.1 mrg }
437 1.1 mrg }
438 1.1 mrg
439 1.1 mrg /* Encode the PTX data area that DECL (which might not actually be a
440 1.1 mrg _DECL) should reside in. */
441 1.1 mrg
442 1.1 mrg static void
443 1.1 mrg nvptx_encode_section_info (tree decl, rtx rtl, int first)
444 1.1 mrg {
445 1.1 mrg default_encode_section_info (decl, rtl, first);
446 1.1 mrg if (first && MEM_P (rtl))
447 1.1 mrg {
448 1.1 mrg nvptx_data_area area = DATA_AREA_GENERIC;
449 1.1 mrg
450 1.1 mrg if (TREE_CONSTANT (decl))
451 1.1 mrg area = DATA_AREA_CONST;
452 1.1 mrg else if (TREE_CODE (decl) == VAR_DECL)
453 1.1 mrg {
454 1.1 mrg if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
455 1.1 mrg {
456 1.1 mrg area = DATA_AREA_SHARED;
457 1.1 mrg if (DECL_INITIAL (decl))
458 1.1 mrg error ("static initialization of variable %q+D in %<.shared%>"
459 1.1 mrg " memory is not supported", decl);
460 1.1 mrg }
461 1.1 mrg else
462 1.1 mrg area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
463 1.1 mrg }
464 1.1 mrg
465 1.1 mrg SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
466 1.1 mrg }
467 1.1 mrg }
468 1.1 mrg
469 1.1 mrg /* Return the PTX name of the data area in which SYM should be
470 1.1 mrg placed. The symbol must have already been processed by
471 1.1 mrg nvptx_encode_seciton_info, or equivalent. */
472 1.1 mrg
473 1.1 mrg static const char *
474 1.1 mrg section_for_sym (rtx sym)
475 1.1 mrg {
476 1.1 mrg nvptx_data_area area = SYMBOL_DATA_AREA (sym);
477 1.1 mrg /* Same order as nvptx_data_area enum. */
478 1.1 mrg static char const *const areas[] =
479 1.1 mrg {"", ".global", ".shared", ".local", ".const", ".param"};
480 1.1 mrg
481 1.1 mrg return areas[area];
482 1.1 mrg }
483 1.1 mrg
484 1.1 mrg /* Similarly for a decl. */
485 1.1 mrg
486 1.1 mrg static const char *
487 1.1 mrg section_for_decl (const_tree decl)
488 1.1 mrg {
489 1.1 mrg return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
490 1.1 mrg }
491 1.1 mrg
/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in
   ptxas.  We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
   not active in an offload compiler -- the names are all set by the
   host-side compiler.  */

static const char *
nvptx_name_replacement (const char *name)
{
  static const char *const remap[][2] = {
    { "call", "__nvptx_call" },
    { "malloc", "__nvptx_malloc" },
    { "free", "__nvptx_free" },
    { "realloc", "__nvptx_realloc" }
  };

  for (unsigned ix = 0; ix < sizeof remap / sizeof remap[0]; ix++)
    if (strcmp (name, remap[ix][0]) == 0)
      return remap[ix][1];

  return name;
}
512 1.1 mrg
513 1.1 mrg /* Return NULL if NAME contains no dot. Otherwise return a copy of NAME
514 1.1 mrg with the dots replaced with dollar signs. */
515 1.1 mrg
516 1.1 mrg static char *
517 1.1 mrg nvptx_replace_dot (const char *name)
518 1.1 mrg {
519 1.1 mrg if (strchr (name, '.') == NULL)
520 1.1 mrg return NULL;
521 1.1 mrg
522 1.1 mrg char *p = xstrdup (name);
523 1.1 mrg for (size_t i = 0; i < strlen (p); ++i)
524 1.1 mrg if (p[i] == '.')
525 1.1 mrg p[i] = '$';
526 1.1 mrg return p;
527 1.1 mrg }
528 1.1 mrg
529 1.1 mrg /* If MODE should be treated as two registers of an inner mode, return
530 1.1 mrg that inner mode. Otherwise return VOIDmode. */
531 1.1 mrg
532 1.1 mrg static machine_mode
533 1.1 mrg maybe_split_mode (machine_mode mode)
534 1.1 mrg {
535 1.1 mrg if (COMPLEX_MODE_P (mode))
536 1.1 mrg return GET_MODE_INNER (mode);
537 1.1 mrg
538 1.1 mrg if (mode == TImode)
539 1.1 mrg return DImode;
540 1.1 mrg
541 1.1 mrg return VOIDmode;
542 1.1 mrg }
543 1.1 mrg
544 1.1 mrg /* Return true if mode should be treated as two registers. */
545 1.1 mrg
546 1.1 mrg static bool
547 1.1 mrg split_mode_p (machine_mode mode)
548 1.1 mrg {
549 1.1 mrg return maybe_split_mode (mode) != VOIDmode;
550 1.1 mrg }
551 1.1 mrg
552 1.1 mrg /* Output a register, subreg, or register pair (with optional
553 1.1 mrg enclosing braces). */
554 1.1 mrg
555 1.1 mrg static void
556 1.1 mrg output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
557 1.1 mrg int subreg_offset = -1)
558 1.1 mrg {
559 1.1 mrg if (inner_mode == VOIDmode)
560 1.1 mrg {
561 1.1 mrg if (HARD_REGISTER_NUM_P (regno))
562 1.1 mrg fprintf (file, "%s", reg_names[regno]);
563 1.1 mrg else
564 1.1 mrg fprintf (file, "%%r%d", regno);
565 1.1 mrg }
566 1.1 mrg else if (subreg_offset >= 0)
567 1.1 mrg {
568 1.1 mrg output_reg (file, regno, VOIDmode);
569 1.1 mrg fprintf (file, "$%d", subreg_offset);
570 1.1 mrg }
571 1.1 mrg else
572 1.1 mrg {
573 1.1 mrg if (subreg_offset == -1)
574 1.1 mrg fprintf (file, "{");
575 1.1 mrg output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
576 1.1 mrg fprintf (file, ",");
577 1.1 mrg output_reg (file, regno, inner_mode, 0);
578 1.1 mrg if (subreg_offset == -1)
579 1.1 mrg fprintf (file, "}");
580 1.1 mrg }
581 1.1 mrg }
582 1.1 mrg
583 1.1 mrg /* Emit forking instructions for MASK. */
584 1.1 mrg
585 1.1 mrg static void
586 1.1 mrg nvptx_emit_forking (unsigned mask, bool is_call)
587 1.1 mrg {
588 1.1 mrg mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
589 1.1 mrg | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
590 1.1 mrg if (mask)
591 1.1 mrg {
592 1.1 mrg rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
593 1.1 mrg
594 1.1 mrg /* Emit fork at all levels. This helps form SESE regions, as
595 1.1 mrg it creates a block with a single successor before entering a
596 1.1 mrg partitooned region. That is a good candidate for the end of
597 1.1 mrg an SESE region. */
598 1.1 mrg emit_insn (gen_nvptx_fork (op));
599 1.1 mrg emit_insn (gen_nvptx_forked (op));
600 1.1 mrg }
601 1.1 mrg }
602 1.1 mrg
603 1.1 mrg /* Emit joining instructions for MASK. */
604 1.1 mrg
605 1.1 mrg static void
606 1.1 mrg nvptx_emit_joining (unsigned mask, bool is_call)
607 1.1 mrg {
608 1.1 mrg mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
609 1.1 mrg | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
610 1.1 mrg if (mask)
611 1.1 mrg {
612 1.1 mrg rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
613 1.1 mrg
614 1.1 mrg /* Emit joining for all non-call pars to ensure there's a single
615 1.1 mrg predecessor for the block the join insn ends up in. This is
616 1.1 mrg needed for skipping entire loops. */
617 1.1 mrg emit_insn (gen_nvptx_joining (op));
618 1.1 mrg emit_insn (gen_nvptx_join (op));
619 1.1 mrg }
620 1.1 mrg }
621 1.1 mrg
622 1.1 mrg
623 1.1 mrg /* Determine whether MODE and TYPE (possibly NULL) should be passed or
625 1.1 mrg returned in memory. Integer and floating types supported by the
626 1.1 mrg machine are passed in registers, everything else is passed in
627 1.1 mrg memory. Complex types are split. */
628 1.1 mrg
629 1.1 mrg static bool
630 1.1 mrg pass_in_memory (machine_mode mode, const_tree type, bool for_return)
631 1.1 mrg {
632 1.1 mrg if (type)
633 1.1 mrg {
634 1.1 mrg if (AGGREGATE_TYPE_P (type))
635 1.1 mrg return true;
636 1.1 mrg if (TREE_CODE (type) == VECTOR_TYPE)
637 1.1 mrg return true;
638 1.1 mrg }
639 1.1 mrg
640 1.1 mrg if (!for_return && COMPLEX_MODE_P (mode))
641 1.1 mrg /* Complex types are passed as two underlying args. */
642 1.1 mrg mode = GET_MODE_INNER (mode);
643 1.1 mrg
644 1.1 mrg if (GET_MODE_CLASS (mode) != MODE_INT
645 1.1 mrg && GET_MODE_CLASS (mode) != MODE_FLOAT)
646 1.1 mrg return true;
647 1.1 mrg
648 1.1 mrg if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
649 1.1 mrg return true;
650 1.1 mrg
651 1.1 mrg return false;
652 1.1 mrg }
653 1.1 mrg
654 1.1 mrg /* A non-memory argument of mode MODE is being passed, determine the mode it
655 1.1 mrg should be promoted to. This is also used for determining return
656 1.1 mrg type promotion. */
657 1.1 mrg
658 1.1 mrg static machine_mode
659 1.1 mrg promote_arg (machine_mode mode, bool prototyped)
660 1.1 mrg {
661 1.1 mrg if (!prototyped && mode == SFmode)
662 1.1 mrg /* K&R float promotion for unprototyped functions. */
663 1.1 mrg mode = DFmode;
664 1.1 mrg else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
665 1.1 mrg mode = SImode;
666 1.1 mrg
667 1.1 mrg return mode;
668 1.1 mrg }
669 1.1 mrg
670 1.1 mrg /* A non-memory return type of MODE is being returned. Determine the
671 1.1 mrg mode it should be promoted to. */
672 1.1 mrg
673 1.1 mrg static machine_mode
674 1.1 mrg promote_return (machine_mode mode)
675 1.1 mrg {
676 1.1 mrg return promote_arg (mode, true);
677 1.1 mrg }
678 1.1 mrg
679 1.1 mrg /* Implement TARGET_FUNCTION_ARG. */
680 1.1 mrg
681 1.1 mrg static rtx
682 1.1 mrg nvptx_function_arg (cumulative_args_t, const function_arg_info &arg)
683 1.1 mrg {
684 1.1 mrg if (arg.end_marker_p () || !arg.named)
685 1.1 mrg return NULL_RTX;
686 1.1 mrg
687 1.1 mrg return gen_reg_rtx (arg.mode);
688 1.1 mrg }
689 1.1 mrg
690 1.1 mrg /* Implement TARGET_FUNCTION_INCOMING_ARG. */
691 1.1 mrg
692 1.1 mrg static rtx
693 1.1 mrg nvptx_function_incoming_arg (cumulative_args_t cum_v,
694 1.1 mrg const function_arg_info &arg)
695 1.1 mrg {
696 1.1 mrg CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
697 1.1 mrg
698 1.1 mrg if (arg.end_marker_p () || !arg.named)
699 1.1 mrg return NULL_RTX;
700 1.1 mrg
701 1.1 mrg /* No need to deal with split modes here, the only case that can
702 1.1 mrg happen is complex modes and those are dealt with by
703 1.1 mrg TARGET_SPLIT_COMPLEX_ARG. */
704 1.1 mrg return gen_rtx_UNSPEC (arg.mode,
705 1.1 mrg gen_rtvec (1, GEN_INT (cum->count)),
706 1.1 mrg UNSPEC_ARG_REG);
707 1.1 mrg }
708 1.1 mrg
709 1.1 mrg /* Implement TARGET_FUNCTION_ARG_ADVANCE. */
710 1.1 mrg
711 1.1 mrg static void
712 1.1 mrg nvptx_function_arg_advance (cumulative_args_t cum_v, const function_arg_info &)
713 1.1 mrg {
714 1.1 mrg CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
715 1.1 mrg
716 1.1 mrg cum->count++;
717 1.1 mrg }
718 1.1 mrg
719 1.1 mrg /* Implement TARGET_FUNCTION_ARG_BOUNDARY.
720 1.1 mrg
721 1.1 mrg For nvptx This is only used for varadic args. The type has already
722 1.1 mrg been promoted and/or converted to invisible reference. */
723 1.1 mrg
724 1.1 mrg static unsigned
725 1.1 mrg nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
726 1.1 mrg {
727 1.1 mrg return GET_MODE_ALIGNMENT (mode);
728 1.1 mrg }
729 1.1 mrg
730 1.1 mrg /* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
731 1.1 mrg
732 1.1 mrg For nvptx, we know how to handle functions declared as stdarg: by
733 1.1 mrg passing an extra pointer to the unnamed arguments. However, the
734 1.1 mrg Fortran frontend can produce a different situation, where a
735 1.1 mrg function pointer is declared with no arguments, but the actual
736 1.1 mrg function and calls to it take more arguments. In that case, we
737 1.1 mrg want to ensure the call matches the definition of the function. */
738 1.1 mrg
739 1.1 mrg static bool
740 1.1 mrg nvptx_strict_argument_naming (cumulative_args_t cum_v)
741 1.1 mrg {
742 1.1 mrg CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
743 1.1 mrg
744 1.1 mrg return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
745 1.1 mrg }
746 1.1 mrg
747 1.1 mrg /* Implement TARGET_LIBCALL_VALUE. */
748 1.1 mrg
749 1.1 mrg static rtx
750 1.1 mrg nvptx_libcall_value (machine_mode mode, const_rtx)
751 1.1 mrg {
752 1.1 mrg if (!cfun || !cfun->machine->doing_call)
753 1.1 mrg /* Pretend to return in a hard reg for early uses before pseudos can be
754 1.1 mrg generated. */
755 1.1 mrg return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
756 1.1 mrg
757 1.1 mrg return gen_reg_rtx (mode);
758 1.1 mrg }
759 1.1 mrg
760 1.1 mrg /* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
761 1.1 mrg where function FUNC returns or receives a value of data type TYPE. */
762 1.1 mrg
763 1.1 mrg static rtx
764 1.1 mrg nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
765 1.1 mrg bool outgoing)
766 1.1 mrg {
767 1.1 mrg machine_mode mode = promote_return (TYPE_MODE (type));
768 1.1 mrg
769 1.1 mrg if (outgoing)
770 1.1 mrg {
771 1.1 mrg gcc_assert (cfun);
772 1.1 mrg cfun->machine->return_mode = mode;
773 1.1 mrg return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
774 1.1 mrg }
775 1.1 mrg
776 1.1 mrg return nvptx_libcall_value (mode, NULL_RTX);
777 1.1 mrg }
778 1.1 mrg
779 1.1 mrg /* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
780 1.1 mrg
781 1.1 mrg static bool
782 1.1 mrg nvptx_function_value_regno_p (const unsigned int regno)
783 1.1 mrg {
784 1.1 mrg return regno == NVPTX_RETURN_REGNUM;
785 1.1 mrg }
786 1.1 mrg
787 1.1 mrg /* Types with a mode other than those supported by the machine are passed by
788 1.1 mrg reference in memory. */
789 1.1 mrg
790 1.1 mrg static bool
791 1.1 mrg nvptx_pass_by_reference (cumulative_args_t, const function_arg_info &arg)
792 1.1 mrg {
793 1.1 mrg return pass_in_memory (arg.mode, arg.type, false);
794 1.1 mrg }
795 1.1 mrg
796 1.1 mrg /* Implement TARGET_RETURN_IN_MEMORY. */
797 1.1 mrg
798 1.1 mrg static bool
799 1.1 mrg nvptx_return_in_memory (const_tree type, const_tree)
800 1.1 mrg {
801 1.1 mrg return pass_in_memory (TYPE_MODE (type), type, true);
802 1.1 mrg }
803 1.1 mrg
804 1.1 mrg /* Implement TARGET_PROMOTE_FUNCTION_MODE. */
805 1.1 mrg
806 1.1 mrg static machine_mode
807 1.1 mrg nvptx_promote_function_mode (const_tree type, machine_mode mode,
808 1.1 mrg int *ARG_UNUSED (punsignedp),
809 1.1 mrg const_tree funtype, int for_return)
810 1.1 mrg {
811 1.1 mrg return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
812 1.1 mrg }
813 1.1 mrg
814 1.1 mrg /* Helper for write_arg. Emit a single PTX argument of MODE, either
815 1.1 mrg in a prototype, or as copy in a function prologue. ARGNO is the
816 1.1 mrg index of this argument in the PTX function. FOR_REG is negative,
817 1.1 mrg if we're emitting the PTX prototype. It is zero if we're copying
818 1.1 mrg to an argument register and it is greater than zero if we're
819 1.1 mrg copying to a specific hard register. */
820 1.1 mrg
821 1.1 mrg static int
822 1.1 mrg write_arg_mode (std::stringstream &s, int for_reg, int argno,
823 1.1 mrg machine_mode mode)
824 1.1 mrg {
825 1.1 mrg const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
826 1.1 mrg
827 1.1 mrg if (for_reg < 0)
828 1.1 mrg {
829 1.1 mrg /* Writing PTX prototype. */
830 1.1 mrg s << (argno ? ", " : " (");
831 1.1 mrg s << ".param" << ptx_type << " %in_ar" << argno;
832 1.1 mrg }
833 1.1 mrg else
834 1.1 mrg {
835 1.1 mrg s << "\t.reg" << ptx_type << " ";
836 1.1 mrg if (for_reg)
837 1.1 mrg s << reg_names[for_reg];
838 1.1 mrg else
839 1.1 mrg s << "%ar" << argno;
840 1.1 mrg s << ";\n";
841 1.1 mrg if (argno >= 0)
842 1.1 mrg {
843 1.1 mrg s << "\tld.param" << ptx_type << " ";
844 1.1 mrg if (for_reg)
845 1.1 mrg s << reg_names[for_reg];
846 1.1 mrg else
847 1.1 mrg s << "%ar" << argno;
848 1.1 mrg s << ", [%in_ar" << argno << "];\n";
849 1.1 mrg }
850 1.1 mrg }
851 1.1 mrg return argno + 1;
852 1.1 mrg }
853 1.1 mrg
854 1.1 mrg /* Process function parameter TYPE to emit one or more PTX
855 1.1 mrg arguments. S, FOR_REG and ARGNO as for write_arg_mode. PROTOTYPED
856 1.1 mrg is true, if this is a prototyped function, rather than an old-style
857 1.1 mrg C declaration. Returns the next argument number to use.
858 1.1 mrg
859 1.1 mrg The promotion behavior here must match the regular GCC function
860 1.1 mrg parameter marshalling machinery. */
861 1.1 mrg
862 1.1 mrg static int
863 1.1 mrg write_arg_type (std::stringstream &s, int for_reg, int argno,
864 1.1 mrg tree type, bool prototyped)
865 1.1 mrg {
866 1.1 mrg machine_mode mode = TYPE_MODE (type);
867 1.1 mrg
868 1.1 mrg if (mode == VOIDmode)
869 1.1 mrg return argno;
870 1.1 mrg
871 1.1 mrg if (pass_in_memory (mode, type, false))
872 1.1 mrg mode = Pmode;
873 1.1 mrg else
874 1.1 mrg {
875 1.1 mrg bool split = TREE_CODE (type) == COMPLEX_TYPE;
876 1.1 mrg
877 1.1 mrg if (split)
878 1.1 mrg {
879 1.1 mrg /* Complex types are sent as two separate args. */
880 1.1 mrg type = TREE_TYPE (type);
881 1.1 mrg mode = TYPE_MODE (type);
882 1.1 mrg prototyped = true;
883 1.1 mrg }
884 1.1 mrg
885 1.1 mrg mode = promote_arg (mode, prototyped);
886 1.1 mrg if (split)
887 1.1 mrg argno = write_arg_mode (s, for_reg, argno, mode);
888 1.1 mrg }
889 1.1 mrg
890 1.1 mrg return write_arg_mode (s, for_reg, argno, mode);
891 1.1 mrg }
892 1.1 mrg
893 1.1 mrg /* Emit a PTX return as a prototype or function prologue declaration
894 1.1 mrg for MODE. */
895 1.1 mrg
896 1.1 mrg static void
897 1.1 mrg write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
898 1.1 mrg {
899 1.1 mrg const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
900 1.1 mrg const char *pfx = "\t.reg";
901 1.1 mrg const char *sfx = ";\n";
902 1.1 mrg
903 1.1 mrg if (for_proto)
904 1.1 mrg pfx = "(.param", sfx = "_out) ";
905 1.1 mrg
906 1.1 mrg s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
907 1.1 mrg }
908 1.1 mrg
909 1.1 mrg /* Process a function return TYPE to emit a PTX return as a prototype
910 1.1 mrg or function prologue declaration. Returns true if return is via an
911 1.1 mrg additional pointer parameter. The promotion behavior here must
   match the regular GCC function return marshalling.  */
913 1.1 mrg
static bool
write_return_type (std::stringstream &s, bool for_proto, tree type)
{
  machine_mode mode = TYPE_MODE (type);

  /* A void result needs no declaration at all.  */
  if (mode == VOIDmode)
    return false;

  bool return_in_mem = pass_in_memory (mode, type, true);

  if (return_in_mem)
    {
      /* Value returned via a hidden pointer argument; a prototype
	 declares no PTX return value for it.  */
      if (for_proto)
	return return_in_mem;

      /* Named return values can cause us to return a pointer as well
	 as expect an argument for the return location.  This is
	 optimization-level specific, so no caller can make use of
	 this data, but more importantly for us, we must ensure it
	 doesn't change the PTX prototype.  */
      mode = (machine_mode) cfun->machine->return_mode;

      if (mode == VOIDmode)
	return return_in_mem;

      /* Clear return_mode to inhibit copy of retval to non-existent
	 retval parameter.  */
      cfun->machine->return_mode = VOIDmode;
    }
  else
    mode = promote_return (mode);

  write_return_mode (s, for_proto, mode);

  return return_in_mem;
}
950 1.1 mrg
951 1.1 mrg /* Look for attributes in ATTRS that would indicate we must write a function
952 1.1 mrg as a .entry kernel rather than a .func. Return true if one is found. */
953 1.1 mrg
954 1.1 mrg static bool
955 1.1 mrg write_as_kernel (tree attrs)
956 1.1 mrg {
957 1.1 mrg return (lookup_attribute ("kernel", attrs) != NULL_TREE
958 1.1 mrg || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
959 1.1 mrg && lookup_attribute ("oacc function", attrs) != NULL_TREE));
960 1.1 mrg /* For OpenMP target regions, the corresponding kernel entry is emitted from
961 1.1 mrg write_omp_entry as a separate function. */
962 1.1 mrg }
963 1.1 mrg
964 1.1 mrg /* Emit a linker marker for a function decl or defn. */
965 1.1 mrg
static void
write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
		 const char *name)
{
  /* Markers are scanned by the nvptx linker to stitch together
     translation units; keep the format stable.  */
  const char *vis = globalize ? " GLOBAL" : "";
  const char *what = is_defn ? "DEF: " : "DECL: ";

  s << "\n// BEGIN" << vis << " FUNCTION " << what << name << "\n";
}
976 1.1 mrg
977 1.1 mrg /* Emit a linker marker for a variable decl or defn. */
978 1.1 mrg
979 1.1 mrg static void
980 1.1 mrg write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
981 1.1 mrg {
982 1.1 mrg fprintf (file, "\n// BEGIN%s VAR %s: ",
983 1.1 mrg globalize ? " GLOBAL" : "",
984 1.1 mrg is_defn ? "DEF" : "DECL");
985 1.1 mrg assemble_name_raw (file, name);
986 1.1 mrg fputs ("\n", file);
987 1.1 mrg }
988 1.1 mrg
989 1.1 mrg /* Helper function for write_fn_proto. */
990 1.1 mrg
static void
write_fn_proto_1 (std::stringstream &s, bool is_defn,
		  const char *name, const_tree decl)
{
  /* Aliases are handled elsewhere; everything else gets a linker
     marker comment.  */
  if (lookup_attribute ("alias", DECL_ATTRIBUTES (decl)) == NULL)
    write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);

  /* PTX declaration: linkage directive, then .entry vs .func.  */
  if (DECL_EXTERNAL (decl))
    s << ".extern ";
  else if (TREE_PUBLIC (decl))
    s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
  s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");

  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);

  /* atomic_compare_exchange_$n builtins have an exceptional calling
     convention.  */
  int not_atomic_weak_arg = -1;
  if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
    switch (DECL_FUNCTION_CODE (decl))
      {
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
	/* These atomics skip the 'weak' parm in an actual library
	   call.  We must skip it in the prototype too.  */
	not_atomic_weak_arg = 3;
	break;

      default:
	break;
      }

  /* Declare the result.  */
  bool return_in_mem = write_return_type (s, true, result_type);

  s << name;

  int argno = 0;

  /* Emit argument list.  A memory return travels as a hidden first
     pointer argument.  */
  if (return_in_mem)
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  /* We get:
     NULL in TYPE_ARG_TYPES, for old-style functions
     NULL in DECL_ARGUMENTS, for builtin functions without another
     declaration.
     So we have to pick the best one we have.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  /* NOT_ATOMIC_WEAK_ARG counts down to zero exactly at the 'weak'
     parameter of the atomic builtins above, which is skipped.  */
  for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      if (not_atomic_weak_arg)
	argno = write_arg_type (s, -1, argno, type, prototyped);
      else
	gcc_assert (TREE_CODE (type) == BOOLEAN_TYPE);
    }

  /* Variadic functions get a trailing pointer to the stacked
     varargs.  */
  if (stdarg_p (fntype))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (DECL_STATIC_CHAIN (decl))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  /* Force 'main' to its conventional (int, char **) shape at the PTX
     level, padding missing arguments.  */
  if (argno < 2 && strcmp (name, "main") == 0)
    {
      if (argno == 0)
	argno = write_arg_type (s, -1, argno, integer_type_node, true);

      if (argno == 1)
	argno = write_arg_type (s, -1, argno, ptr_type_node, true);
    }

  /* write_arg_mode opened the list with '(' at the first argument;
     close it iff at least one argument was emitted.  */
  if (argno)
    s << ")";

  s << (is_defn ? "\n" : ";\n");
}
1082 1.1 mrg
1083 1.1 mrg /* Write a .func or .kernel declaration or definition along with
1084 1.1 mrg a helper comment for use by ld. S is the stream to write to, DECL
1085 1.1 mrg the decl for the function with name NAME. For definitions, emit
1086 1.1 mrg a declaration too. */
1087 1.1 mrg
1088 1.1 mrg static void
1089 1.1 mrg write_fn_proto (std::stringstream &s, bool is_defn,
1090 1.1 mrg const char *name, const_tree decl)
1091 1.1 mrg {
1092 1.1 mrg const char *replacement = nvptx_name_replacement (name);
1093 1.1 mrg char *replaced_dots = NULL;
1094 1.1 mrg if (replacement != name)
1095 1.1 mrg name = replacement;
1096 1.1 mrg else
1097 1.1 mrg {
1098 1.1 mrg replaced_dots = nvptx_replace_dot (name);
1099 1.1 mrg if (replaced_dots)
1100 1.1 mrg name = replaced_dots;
1101 1.1 mrg }
1102 1.1 mrg if (name[0] == '*')
1103 1.1 mrg name++;
1104 1.1 mrg
1105 1.1 mrg if (is_defn)
1106 1.1 mrg /* Emit a declaration. The PTX assembler gets upset without it. */
1107 1.1 mrg write_fn_proto_1 (s, false, name, decl);
1108 1.1 mrg
1109 1.1 mrg write_fn_proto_1 (s, is_defn, name, decl);
1110 1.1 mrg
1111 1.1 mrg if (replaced_dots)
1112 1.1 mrg XDELETE (replaced_dots);
1113 1.1 mrg }
1114 1.1 mrg
1115 1.1 mrg /* Construct a function declaration from a call insn. This can be
1116 1.1 mrg necessary for two reasons - either we have an indirect call which
1117 1.1 mrg requires a .callprototype declaration, or we have a libcall
1118 1.1 mrg generated by emit_library_call for which no decl exists. */
1119 1.1 mrg
static void
write_fn_proto_from_insn (std::stringstream &s, const char *name,
			  rtx result, rtx pat)
{
  char *replaced_dots = NULL;

  if (!name)
    {
      /* Indirect call: emit an anonymous .callprototype.  */
      s << "\t.callprototype ";
      name = "_";
    }
  else
    {
      /* Named libcall: sanitize the name the same way
	 write_fn_proto does.  */
      const char *replacement = nvptx_name_replacement (name);
      if (replacement != name)
	name = replacement;
      else
	{
	  replaced_dots = nvptx_replace_dot (name);
	  if (replaced_dots)
	    name = replaced_dots;
	}
      write_fn_marker (s, false, true, name);
      s << "\t.extern .func ";
    }

  if (result != NULL_RTX)
    write_return_mode (s, true, GET_MODE (result));

  s << name;
  if (replaced_dots)
    XDELETE (replaced_dots);

  /* Element 0 of PAT is the call itself; the remaining vector
     elements describe the arguments.  */
  int arg_end = XVECLEN (pat, 0);
  for (int i = 1; i < arg_end; i++)
    {
      /* We don't have to deal with mode splitting & promotion here,
	 as that was already done when generating the call
	 sequence.  */
      machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));

      write_arg_mode (s, -1, i - 1, mode);
    }
  /* write_arg_mode opened the argument list with '('; close it iff
     any argument was emitted.  */
  if (arg_end != 1)
    s << ")";
  s << ";\n";
}
1167 1.1 mrg
/* DECL is an external FUNCTION_DECL: make sure it's in the fndecl hash
   table and write a ptx prototype.  These are emitted at end of
   compilation.  */
1171 1.1 mrg
1172 1.1 mrg static void
1173 1.1 mrg nvptx_record_fndecl (tree decl)
1174 1.1 mrg {
1175 1.1 mrg tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
1176 1.1 mrg if (*slot == NULL)
1177 1.1 mrg {
1178 1.1 mrg *slot = decl;
1179 1.1 mrg const char *name = get_fnname_from_decl (decl);
1180 1.1 mrg write_fn_proto (func_decls, false, name, decl);
1181 1.1 mrg }
1182 1.1 mrg }
1183 1.1 mrg
1184 1.1 mrg /* Record a libcall or unprototyped external function. CALLEE is the
1185 1.1 mrg SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
1186 1.1 mrg declaration for it. */
1187 1.1 mrg
1188 1.1 mrg static void
1189 1.1 mrg nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
1190 1.1 mrg {
1191 1.1 mrg rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
1192 1.1 mrg if (*slot == NULL)
1193 1.1 mrg {
1194 1.1 mrg *slot = callee;
1195 1.1 mrg
1196 1.1 mrg const char *name = XSTR (callee, 0);
1197 1.1 mrg write_fn_proto_from_insn (func_decls, name, retval, pat);
1198 1.1 mrg }
1199 1.1 mrg }
1200 1.1 mrg
1201 1.1 mrg /* DECL is an external FUNCTION_DECL, that we're referencing. If it
1202 1.1 mrg is prototyped, record it now. Otherwise record it as needed at end
1203 1.1 mrg of compilation, when we might have more information about it. */
1204 1.1 mrg
1205 1.1 mrg void
1206 1.1 mrg nvptx_record_needed_fndecl (tree decl)
1207 1.1 mrg {
1208 1.1 mrg if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
1209 1.1 mrg {
1210 1.1 mrg tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
1211 1.1 mrg if (*slot == NULL)
1212 1.1 mrg *slot = decl;
1213 1.1 mrg }
1214 1.1 mrg else
1215 1.1 mrg nvptx_record_fndecl (decl);
1216 1.1 mrg }
1217 1.1 mrg
1218 1.1 mrg /* SYM is a SYMBOL_REF. If it refers to an external function, record
1219 1.1 mrg it as needed. */
1220 1.1 mrg
1221 1.1 mrg static void
1222 1.1 mrg nvptx_maybe_record_fnsym (rtx sym)
1223 1.1 mrg {
1224 1.1 mrg tree decl = SYMBOL_REF_DECL (sym);
1225 1.1 mrg
1226 1.1 mrg if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
1227 1.1 mrg nvptx_record_needed_fndecl (decl);
1228 1.1 mrg }
1229 1.1 mrg
1230 1.1 mrg /* Emit a local array to hold some part of a conventional stack frame
1231 1.1 mrg and initialize REGNO to point to it. If the size is zero, it'll
1232 1.1 mrg never be valid to dereference, so we can simply initialize to
1233 1.1 mrg zero. */
1234 1.1 mrg
1235 1.1 mrg static void
1236 1.1 mrg init_frame (FILE *file, int regno, unsigned align, unsigned size)
1237 1.1 mrg {
1238 1.1 mrg if (size)
1239 1.1 mrg fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
1240 1.1 mrg align, reg_names[regno], size);
1241 1.1 mrg fprintf (file, "\t.reg.u%d %s;\n",
1242 1.1 mrg POINTER_SIZE, reg_names[regno]);
1243 1.1 mrg fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
1244 1.1 mrg : "\tmov.u%d %s, 0;\n"),
1245 1.1 mrg POINTER_SIZE, reg_names[regno], reg_names[regno]);
1246 1.1 mrg }
1247 1.1 mrg
1248 1.1 mrg /* Emit soft stack frame setup sequence. */
1249 1.1 mrg
static void
init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
{
  /* Maintain 64-bit stack alignment.  */
  unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
  size = ROUND_UP (size, keep_align);
  int bits = POINTER_SIZE;
  const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
  const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
  const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
  const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
  /* Declare the four soft-stack registers, then compute them inside a
     scope so the temporaries don't leak out.  */
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
  fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
  /* fstmp1 = tid.y * sizeof (void *): byte offset of this warp's slot
     in the __nvptx_stacks array.  */
  fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
	   bits == 64 ? ".wide" : ".lo", bits / 8);
  fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);

  /* Initialize %sspslot = &__nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);

  /* Initialize %sspprev = __nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
	   bits, reg_sspprev, reg_sspslot);

  /* Initialize %frame = %sspprev - size.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
	   bits, reg_frame, reg_sspprev, size);

  /* Apply alignment, if larger than 64.  */
  if (alignment > keep_align)
    fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
	     bits, reg_frame, reg_frame, -alignment);

  size = crtl->outgoing_args_size;
  gcc_assert (size % keep_align == 0);

  /* Initialize %stack, below the frame by the outgoing-args area.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
	   bits, reg_stack, reg_frame, size);

  /* Publish the new stack top for callees; a leaf function makes no
     calls, so the store can be omitted.  */
  if (!crtl->is_leaf)
    fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
	     bits, reg_sspslot, reg_stack);
  fprintf (file, "\t}\n");
  cfun->machine->has_softstack = true;
  need_softstack_decl = true;
}
1304 1.1 mrg
1305 1.1 mrg /* Emit code to initialize the REGNO predicate register to indicate
1306 1.1 mrg whether we are not lane zero on the NAME axis. */
1307 1.1 mrg
static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
  /* On the vector (x) axis we may additionally need to compute the
     per-vector reduction buffer address.  */
  if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
    {
      fprintf (file, "\t\t.reg.u64\t%%t_red;\n");
      fprintf (file, "\t\t.reg.u64\t%%y64;\n");
    }
  /* REGNO := (tid.NAME != 0), i.e. true for every lane but lane 0.  */
  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
  if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
    {
      /* red_partition := __vector_red + tid.y * vector_red_partition.  */
      fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n");
      fprintf (file, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n");
      fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; "
	       "// vector reduction buffer\n",
	       REGNO (cfun->machine->red_partition),
	       vector_red_partition);
    }
  /* Verify vector_red_size.  */
  gcc_assert (vector_red_partition * nvptx_mach_max_workers ()
	      <= vector_red_size);
  fprintf (file, "\t}\n");
}
1334 1.1 mrg
1335 1.1 mrg /* Emit code to initialize OpenACC worker broadcast and synchronization
1336 1.1 mrg registers. */
1337 1.1 mrg
static void
nvptx_init_oacc_workers (FILE *file)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%tidy;\n");
  if (cfun->machine->bcast_partition)
    {
      fprintf (file, "\t\t.reg.u64\t%%t_bcast;\n");
      fprintf (file, "\t\t.reg.u64\t%%y64;\n");
    }
  fprintf (file, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n");
  if (cfun->machine->bcast_partition)
    {
      /* bcast_partition := __oacc_bcast
	   + (tid.y + 1) * oacc_bcast_partition.
	 Slot 0 is reserved; each vector uses slot tid.y + 1.  */
      fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n");
      fprintf (file, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n");
      fprintf (file, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n");
      fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; "
	       "// vector broadcast offset\n",
	       REGNO (cfun->machine->bcast_partition),
	       oacc_bcast_partition);
    }
  /* Verify oacc_bcast_size.  */
  gcc_assert (oacc_bcast_partition * (nvptx_mach_max_workers () + 1)
	      <= oacc_bcast_size);
  /* Each vector synchronizes on its own barrier, numbered tid.y + 1
     (barrier 0 is left for CTA-wide synchronization).  */
  if (cfun->machine->sync_bar)
    fprintf (file, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; "
	     "// vector synchronization barrier\n",
	     REGNO (cfun->machine->sync_bar));
  fprintf (file, "\t}\n");
}
1368 1.1 mrg
1369 1.1 mrg /* Emit code to initialize predicate and master lane index registers for
1370 1.1 mrg -muniform-simt code generation variant. */
1371 1.1 mrg
static void
nvptx_init_unisimt_predicate (FILE *file)
{
  cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
  int loc = REGNO (cfun->machine->unisimt_location);
  int bits = POINTER_SIZE;
  fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
  /* unisimt_location := &__nvptx_uni[tid.y] (4-byte entries).  */
  fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
	   bits == 64 ? ".wide" : ".lo");
  fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
  fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
      /* The outside-SIMT predicate is true while __nvptx_uni[tid.y]
	 is zero, i.e. while execution is uniform.  */
      if (cfun->machine->unisimt_outside_simt_predicate)
	{
	  int pred_outside_simt
	    = REGNO (cfun->machine->unisimt_outside_simt_predicate);
	  fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, 0;\n",
		   pred_outside_simt, master);
	}
      fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
      /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'.  */
      fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
      /* Compute predicate as 'tid.x == master'.  */
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
    }
  fprintf (file, "\t}\n");
  need_unisimt_decl = true;
}
1408 1.1 mrg
1409 1.1 mrg /* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
1410 1.1 mrg
1411 1.1 mrg extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
1412 1.1 mrg void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
1413 1.1 mrg {
1414 1.1 mrg __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
1415 1.1 mrg __nvptx_uni[tid.y] = 0;
1416 1.1 mrg gomp_nvptx_main (ORIG, arg);
1417 1.1 mrg }
1418 1.1 mrg ORIG itself should not be emitted as a PTX .entry function. */
1419 1.1 mrg
static void
write_omp_entry (FILE *file, const char *name, const char *orig)
{
  /* Declare gomp_nvptx_main once per translation unit.  */
  static bool gomp_nvptx_main_declared;
  if (!gomp_nvptx_main_declared)
    {
      gomp_nvptx_main_declared = true;
      write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
      func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
	<< " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
    }
  /* PR79332.  Single out this string; it confuses gcc.pot generation.  */
#define NTID_Y "%ntid.y"
  /* The template below is emitted in two halves, split at the
     embedded '\0' just before the callee name (ORIG is printed in
     between).  PS is the pointer size in bits, PS_BYTES the same in
     bytes, MAD_PS_32 the multiply-add instruction matching PS.  */
#define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
{\n\
	.reg.u32 %r<3>;\n\
	.reg.u" PS " %R<4>;\n\
	mov.u32 %r0, %tid.y;\n\
	mov.u32 %r1, " NTID_Y ";\n\
	mov.u32 %r2, %ctaid.x;\n\
	cvt.u" PS ".u32 %R1, %r0;\n\
	" MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
	mov.u" PS " %R0, __nvptx_stacks;\n\
	" MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
	ld.param.u" PS " %R2, [%stack];\n\
	ld.param.u" PS " %R3, [%sz];\n\
	add.u" PS " %R2, %R2, %R3;\n\
	mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
	st.shared.u" PS " [%R0], %R2;\n\
	mov.u" PS " %R0, __nvptx_uni;\n\
	" MAD_PS_32 " %R0, %r0, 4, %R0;\n\
	mov.u32 %r0, 0;\n\
	st.shared.u32 [%R0], %r0;\n\
	mov.u" PS " %R0, \0;\n\
	ld.param.u" PS " %R1, [%arg];\n\
	{\n\
		.param.u" PS " %P<2>;\n\
		st.param.u" PS " [%P0], %R0;\n\
		st.param.u" PS " [%P1], %R1;\n\
		call.uni gomp_nvptx_main, (%P0, %P1);\n\
	}\n\
	ret.uni;\n\
}\n"
  static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
  static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
#undef ENTRY_TEMPLATE
#undef NTID_Y
  const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
  /* Position ENTRY_2 after the embedded nul using strlen of the prefix.  */
  const char *entry_2 = entry_1 + strlen (entry64) + 1;
  fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
  need_softstack_decl = need_unisimt_decl = true;
}
1474 1.1 mrg
1475 1.1 mrg /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
1476 1.1 mrg function, including local var decls and copies from the arguments to
1477 1.1 mrg local regs. */
1478 1.1 mrg
void
nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);
  int argno = 0;

  /* An OpenMP target entrypoint gets a wrapper kernel emitted under
     NAME; the real body is emitted under "NAME$impl", which only the
     wrapper calls (see write_omp_entry).  */
  if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
      && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
    {
      char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
      sprintf (buf, "%s$impl", name);
      write_omp_entry (file, name, buf);
      name = buf;
    }
  /* We construct the initial part of the function into a string
     stream, in order to share the prototype writing code.  */
  std::stringstream s;
  write_fn_proto (s, true, name, decl);
  s << "{\n";

  /* A memory return arrives as a hidden first pointer argument.  */
  bool return_in_mem = write_return_type (s, false, result_type);
  if (return_in_mem)
    argno = write_arg_type (s, 0, argno, ptr_type_node, true);

  /* Declare and initialize incoming arguments.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args != NULL_TREE; args = TREE_CHAIN (args))
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      argno = write_arg_type (s, 0, argno, type, prototyped);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
			    true);

  /* A static chain register may also be needed when DECL itself takes
     none but contains calls that do (has_chain); in that case there is
     no incoming .param for it (ARGNO of -1).  */
  if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
    write_arg_type (s, STATIC_CHAIN_REGNUM,
		    DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
		    true);

  fprintf (file, "%s", s.str().c_str());

  /* Usually 'crtl->is_leaf' is computed during register allocator
     initialization (which is not done on NVPTX) or for pressure-sensitive
     optimizations.  Initialize it here, except if already set.  */
  if (!crtl->is_leaf)
    crtl->is_leaf = leaf_function_p ();

  HOST_WIDE_INT sz = get_frame_size ();
  bool need_frameptr = sz || cfun->machine->has_chain;
  int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
  if (!TARGET_SOFT_STACK)
    {
      /* Declare a local var for outgoing varargs.  */
      if (cfun->machine->has_varadic)
	init_frame (file, STACK_POINTER_REGNUM,
		    UNITS_PER_WORD, crtl->outgoing_args_size);

      /* Declare a local variable for the frame.  Force its size to be
	 DImode-compatible.  */
      if (need_frameptr)
	init_frame (file, FRAME_POINTER_REGNUM, alignment,
		    ROUND_UP (sz, GET_MODE_SIZE (DImode)));
    }
  else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
	   || (cfun->machine->has_simtreg && !crtl->is_leaf))
    init_softstack_frame (file, alignment, sz);

  /* Reserve the per-lane SIMT stack backing store; size it for the
     worst case when the exact requirement is unknown.  */
  if (cfun->machine->has_simtreg)
    {
      unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
      unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
      align = MAX (align, GET_MODE_SIZE (DImode));
      if (!crtl->is_leaf || cfun->calls_alloca)
	simtsz = HOST_WIDE_INT_M1U;
      if (simtsz == HOST_WIDE_INT_M1U)
	simtsz = nvptx_softstack_size;
      if (cfun->machine->has_softstack)
	simtsz += POINTER_SIZE / 8;
      simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
      if (align > GET_MODE_SIZE (DImode))
	simtsz += align - GET_MODE_SIZE (DImode);
      if (simtsz)
	fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
		 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
    }

  /* Restore the vector reduction partition register, if necessary.
     FIXME: Find out when and why this is necessary, and fix it.  */
  if (cfun->machine->red_partition)
    regno_reg_rtx[REGNO (cfun->machine->red_partition)]
      = cfun->machine->red_partition;

  /* Declare the pseudos we have as ptx registers.  */
  int maxregs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
    {
      if (regno_reg_rtx[i] != const0_rtx)
	{
	  machine_mode mode = PSEUDO_REGNO_MODE (i);
	  machine_mode split = maybe_split_mode (mode);

	  if (split_mode_p (mode))
	    mode = split;
	  fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
	  output_reg (file, i, split, -2);
	  fprintf (file, ";\n");
	}
    }

  /* Emit axis predicates.  */
  if (cfun->machine->axis_predicate[0])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[0]), "y");
  if (cfun->machine->axis_predicate[1])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[1]), "x");
  if (cfun->machine->unisimt_predicate
      || (cfun->machine->has_simtreg && !crtl->is_leaf))
    nvptx_init_unisimt_predicate (file);
  if (cfun->machine->bcast_partition || cfun->machine->sync_bar)
    nvptx_init_oacc_workers (file);
}
1612 1.1 mrg
1613 1.1 mrg /* Output code for switching uniform-simt state. ENTERING indicates whether
1614 1.1 mrg we are entering or leaving non-uniform execution region. */
1615 1.1 mrg
static void
nvptx_output_unisimt_switch (FILE *file, bool entering)
{
  /* A leaf function with no uniform-SIMT predicate has no state to
     switch.  */
  if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
    return;
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
  /* The per-warp uniformity mask: all-ones inside a non-uniform
     region, zero outside.  */
  fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
  if (cfun->machine->unisimt_outside_simt_predicate)
    {
      int pred_outside_simt
	= REGNO (cfun->machine->unisimt_outside_simt_predicate);
      fprintf (file, "\t\tmov.pred %%r%d, %d;\n", pred_outside_simt,
	       entering ? 0 : 1);
    }
  /* Publish the new mask for callees, unless there are none.  */
  if (!crtl->is_leaf)
    {
      int loc = REGNO (cfun->machine->unisimt_location);
      fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
    }
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      /* Recompute master lane (laneid on entry, 0 on exit) and the
	 'this lane is the master' predicate.  */
      fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
      fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
	       master, entering ? "%ustmp2" : "0");
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
    }
  fprintf (file, "\t}\n");
}
1647 1.1 mrg
/* Output code for allocating per-lane storage and switching soft-stack pointer.
   ENTERING indicates whether we are entering or leaving non-uniform execution.
   PTR is the register pointing to allocated storage, it is assigned to on
   entering and used to restore state on leaving.  SIZE and ALIGN are used only
   on entering.  */

static void
nvptx_output_softstack_switch (FILE *file, bool entering,
			       rtx ptr, rtx size, rtx align)
{
  gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
  /* A leaf with no per-lane SIMT storage needs no switching code.  */
  if (crtl->is_leaf && !cfun->machine->simt_stack_size)
    return;
  int bits = POINTER_SIZE, regno = REGNO (ptr);
  fprintf (file, "\t{\n");
  if (entering)
    {
      /* Point PTR at the top of this lane's slice of %simtstack_ar,
	 then subtract the (DImode-rounded) allocation size.  */
      fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
	       HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
	       cfun->machine->simt_stack_size);
      fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
      if (CONST_INT_P (size))
	fprintf (file, HOST_WIDE_INT_PRINT_DEC,
		 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
      else
	output_reg (file, REGNO (size), VOIDmode);
      fputs (";\n", file);
      /* Mask off low bits if the requested alignment exceeds what the
	 rounding above guarantees, or if SIZE was not constant.  */
      if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
	fprintf (file,
		 "\t\tand.b%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
		 bits, regno, regno, UINTVAL (align));
    }
  if (cfun->machine->has_softstack)
    {
      const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
      if (entering)
	{
	  /* Save the old soft-stack pointer in the word just below PTR,
	     then continue the soft stack below the saved slot.  */
	  fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
		   bits, regno, bits / 8, reg_stack);
	  fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
		   bits, reg_stack, regno, bits / 8);
	}
      else
	{
	  /* Restore the soft-stack pointer saved on entry.  */
	  fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
		   bits, reg_stack, regno, bits / 8);
	}
      nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
    }
  fprintf (file, "\t}\n");
}
1699 1.1 mrg
1700 1.1 mrg /* Output code to enter non-uniform execution region. DEST is a register
1701 1.1 mrg to hold a per-lane allocation given by SIZE and ALIGN. */
1702 1.1 mrg
1703 1.1 mrg const char *
1704 1.1 mrg nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
1705 1.1 mrg {
1706 1.1 mrg nvptx_output_unisimt_switch (asm_out_file, true);
1707 1.1 mrg nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
1708 1.1 mrg return "";
1709 1.1 mrg }
1710 1.1 mrg
1711 1.1 mrg /* Output code to leave non-uniform execution region. SRC is the register
1712 1.1 mrg holding per-lane storage previously allocated by omp_simt_enter insn. */
1713 1.1 mrg
1714 1.1 mrg const char *
1715 1.1 mrg nvptx_output_simt_exit (rtx src)
1716 1.1 mrg {
1717 1.1 mrg nvptx_output_unisimt_switch (asm_out_file, false);
1718 1.1 mrg nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
1719 1.1 mrg return "";
1720 1.1 mrg }
1721 1.1 mrg
1722 1.1 mrg /* Output instruction that sets soft stack pointer in shared memory to the
1723 1.1 mrg value in register given by SRC_REGNO. */
1724 1.1 mrg
1725 1.1 mrg const char *
1726 1.1 mrg nvptx_output_set_softstack (unsigned src_regno)
1727 1.1 mrg {
1728 1.1 mrg if (cfun->machine->has_softstack && !crtl->is_leaf)
1729 1.1 mrg {
1730 1.1 mrg fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
1731 1.1 mrg POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
1732 1.1 mrg output_reg (asm_out_file, src_regno, VOIDmode);
1733 1.1 mrg fprintf (asm_out_file, ";\n");
1734 1.1 mrg }
1735 1.1 mrg return "";
1736 1.1 mrg }
1737 1.1 mrg /* Output a return instruction. Also copy the return value to its outgoing
1738 1.1 mrg location. */
1739 1.1 mrg
1740 1.1 mrg const char *
1741 1.1 mrg nvptx_output_return (void)
1742 1.1 mrg {
1743 1.1 mrg machine_mode mode = (machine_mode)cfun->machine->return_mode;
1744 1.1 mrg
1745 1.1 mrg if (mode != VOIDmode)
1746 1.1 mrg fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
1747 1.1 mrg nvptx_ptx_type_from_mode (mode, false),
1748 1.1 mrg reg_names[NVPTX_RETURN_REGNUM],
1749 1.1 mrg reg_names[NVPTX_RETURN_REGNUM]);
1750 1.1 mrg
1751 1.1 mrg return "ret;";
1752 1.1 mrg }
1753 1.1 mrg
/* Terminate a function by writing a closing brace to FILE.  */

void
nvptx_function_end (FILE *file)
{
  fputs ("}\n", file);
}
1761 1.1 mrg
/* Decide whether we can make a sibling call to a function.  For ptx, we
   can't.  */

static bool
nvptx_function_ok_for_sibcall (tree, tree)
{
  /* PTX has no way to express a tail call; every call is a full call.  */
  return false;
}
1771 1.1 mrg
1772 1.1 mrg /* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1773 1.1 mrg
1774 1.1 mrg static rtx
1775 1.1 mrg nvptx_get_drap_rtx (void)
1776 1.1 mrg {
1777 1.1 mrg if (TARGET_SOFT_STACK && stack_realign_drap)
1778 1.1 mrg return arg_pointer_rtx;
1779 1.1 mrg return NULL_RTX;
1780 1.1 mrg }
1781 1.1 mrg
/* Implement the TARGET_CALL_ARGS hook.  Record information about one
   argument to the next call.  */

static void
nvptx_call_args (rtx arg, tree fntype)
{
  /* First argument of a new call: reset the per-call recording state.  */
  if (!cfun->machine->doing_call)
    {
      cfun->machine->doing_call = true;
      cfun->machine->is_varadic = false;
      cfun->machine->num_args = 0;

      if (fntype && stdarg_p (fntype))
	{
	  /* Varadic calls carry one extra (varargs pointer) argument;
	     account for it up front.  */
	  cfun->machine->is_varadic = true;
	  cfun->machine->has_varadic = true;
	  cfun->machine->num_args++;
	}
    }

  /* Count and chain real register arguments; pc_rtx is skipped
     (presumably a marker rather than a real argument -- confirm against
     the TARGET_CALL_ARGS callers).  */
  if (REG_P (arg) && arg != pc_rtx)
    {
      cfun->machine->num_args++;
      cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
						  cfun->machine->call_args);
    }
}
1809 1.1 mrg
1810 1.1 mrg /* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1811 1.1 mrg information we recorded. */
1812 1.1 mrg
1813 1.1 mrg static void
1814 1.1 mrg nvptx_end_call_args (void)
1815 1.1 mrg {
1816 1.1 mrg cfun->machine->doing_call = false;
1817 1.1 mrg free_EXPR_LIST_list (&cfun->machine->call_args);
1818 1.1 mrg }
1819 1.1 mrg
/* Emit the sequence for a call to ADDRESS, setting RETVAL.  Keep
   track of whether calls involving static chains or varargs were seen
   in the current function.
   For libcalls, maintain a hash table of decls we have seen, and
   record a function decl for later when encountering a new one.  */

void
nvptx_expand_call (rtx retval, rtx address)
{
  rtx callee = XEXP (address, 0);
  rtx varargs = NULL_RTX;
  unsigned parallel = 0;

  /* Indirect calls need the callee address in a register.  */
  if (!call_insn_operand (callee, Pmode))
    {
      callee = force_reg (Pmode, callee);
      address = change_address (address, QImode, callee);
    }

  if (GET_CODE (callee) == SYMBOL_REF)
    {
      tree decl = SYMBOL_REF_DECL (callee);
      if (decl != NULL_TREE)
	{
	  if (DECL_STATIC_CHAIN (decl))
	    cfun->machine->has_chain = true;

	  /* For an OpenACC routine, compute the mask of axes on which
	     the callee executes partitioned, so fork/join markers can
	     be emitted around the call.  */
	  tree attr = oacc_get_fn_attrib (decl);
	  if (attr)
	    {
	      tree dims = TREE_VALUE (attr);

	      /* Start with all axes set, then clear each axis the
		 callee does not use (TREE_PURPOSE zero/absent).  */
	      parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
	      for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
		{
		  if (TREE_PURPOSE (dims)
		      && !integer_zerop (TREE_PURPOSE (dims)))
		    break;
		  /* Not on this axis.  */
		  parallel ^= GOMP_DIM_MASK (ix);
		  dims = TREE_CHAIN (dims);
		}
	    }
	}
    }

  unsigned nargs = cfun->machine->num_args;
  if (cfun->machine->is_varadic)
    {
      /* Pass the current stack pointer as the trailing varargs
	 pointer.  */
      varargs = gen_reg_rtx (Pmode);
      emit_move_insn (varargs, stack_pointer_rtx);
    }

  /* One slot for the call itself plus one per recorded argument
     (nargs already counts the varargs pointer, if any).  */
  rtvec vec = rtvec_alloc (nargs + 1);
  rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
  int vec_pos = 0;

  rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
  rtx tmp_retval = retval;
  if (retval)
    {
      /* If the requested destination isn't directly usable, return
	 into a fresh pseudo and copy into RETVAL at the end.  */
      if (!nvptx_register_operand (retval, GET_MODE (retval)))
	tmp_retval = gen_reg_rtx (GET_MODE (retval));
      call = gen_rtx_SET (tmp_retval, call);
    }
  XVECEXP (pat, 0, vec_pos++) = call;

  /* Construct the call insn, including a USE for each argument pseudo
     register.  These will be used when printing the insn.  */
  for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
    XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));

  if (varargs)
    XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);

  gcc_assert (vec_pos == XVECLEN (pat, 0));

  /* Bracket the call with fork/join markers for the axes computed
     above.  */
  nvptx_emit_forking (parallel, true);
  emit_call_insn (pat);
  nvptx_emit_joining (parallel, true);

  if (tmp_retval != retval)
    emit_move_insn (retval, tmp_retval);
}
1904 1.1 mrg
1905 1.1 mrg /* Emit a comparison COMPARE, and return the new test to be used in the
1906 1.1 mrg jump. */
1907 1.1 mrg
1908 1.1 mrg rtx
1909 1.1 mrg nvptx_expand_compare (rtx compare)
1910 1.1 mrg {
1911 1.1 mrg rtx pred = gen_reg_rtx (BImode);
1912 1.1 mrg rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1913 1.1 mrg XEXP (compare, 0), XEXP (compare, 1));
1914 1.1 mrg emit_insn (gen_rtx_SET (pred, cmp));
1915 1.1 mrg return gen_rtx_NE (BImode, pred, const0_rtx);
1916 1.1 mrg }
1917 1.1 mrg
1918 1.1 mrg /* Expand the oacc fork & join primitive into ptx-required unspecs. */
1919 1.1 mrg
1920 1.1 mrg void
1921 1.1 mrg nvptx_expand_oacc_fork (unsigned mode)
1922 1.1 mrg {
1923 1.1 mrg nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1924 1.1 mrg }
1925 1.1 mrg
1926 1.1 mrg void
1927 1.1 mrg nvptx_expand_oacc_join (unsigned mode)
1928 1.1 mrg {
1929 1.1 mrg nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1930 1.1 mrg }
1931 1.1 mrg
1932 1.1 mrg /* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1933 1.1 mrg objects. */
1934 1.1 mrg
1935 1.1 mrg static rtx
1936 1.1 mrg nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1937 1.1 mrg {
1938 1.1 mrg rtx res;
1939 1.1 mrg
1940 1.1 mrg switch (GET_MODE (src))
1941 1.1 mrg {
1942 1.1 mrg case E_DImode:
1943 1.1 mrg res = gen_unpackdisi2 (dst0, dst1, src);
1944 1.1 mrg break;
1945 1.1 mrg case E_DFmode:
1946 1.1 mrg res = gen_unpackdfsi2 (dst0, dst1, src);
1947 1.1 mrg break;
1948 1.1 mrg default: gcc_unreachable ();
1949 1.1 mrg }
1950 1.1 mrg return res;
1951 1.1 mrg }
1952 1.1 mrg
1953 1.1 mrg /* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1954 1.1 mrg object. */
1955 1.1 mrg
1956 1.1 mrg static rtx
1957 1.1 mrg nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1958 1.1 mrg {
1959 1.1 mrg rtx res;
1960 1.1 mrg
1961 1.1 mrg switch (GET_MODE (dst))
1962 1.1 mrg {
1963 1.1 mrg case E_DImode:
1964 1.1 mrg res = gen_packsidi2 (dst, src0, src1);
1965 1.1 mrg break;
1966 1.1 mrg case E_DFmode:
1967 1.1 mrg res = gen_packsidf2 (dst, src0, src1);
1968 1.1 mrg break;
1969 1.1 mrg default: gcc_unreachable ();
1970 1.1 mrg }
1971 1.1 mrg return res;
1972 1.1 mrg }
1973 1.1 mrg
/* Generate an instruction or sequence shuffling SRC into DST across the
   lanes of a warp, using shuffle kind KIND with lane operand IDX.
   Modes without a direct shuffle insn are split, widened or copied
   through SImode/DImode temporaries.  */

rtx
nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case E_DCmode:
    case E_CDImode:
      {
	/* Complex values: shuffle the real and imaginary parts
	   independently.  */
	gcc_assert (GET_CODE (dst) == CONCAT);
	gcc_assert (GET_CODE (src) == CONCAT);
	rtx dst_real = XEXP (dst, 0);
	rtx dst_imag = XEXP (dst, 1);
	rtx src_real = XEXP (src, 0);
	rtx src_imag = XEXP (src, 1);

	start_sequence ();
	emit_insn (nvptx_gen_shuffle (dst_real, src_real, idx, kind));
	emit_insn (nvptx_gen_shuffle (dst_imag, src_imag, idx, kind));
	res = get_insns ();
	end_sequence ();
      }
      break;
    case E_SImode:
      /* Direct 32-bit shuffle.  */
      res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
      break;
    case E_SFmode:
      res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
      break;
    case E_DImode:
    case E_DFmode:
      {
	/* 64-bit values: unpack into two SImode halves, shuffle each,
	   then pack the halves back together.  */
	rtx tmp0 = gen_reg_rtx (SImode);
	rtx tmp1 = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
	emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
	res = get_insns ();
	end_sequence ();
      }
      break;
    case E_V2SImode:
      {
	/* Vector of two SImode elements: shuffle each element through a
	   fresh pseudo, accessing the elements as subregs at byte
	   offsets 0 and 4.  */
	rtx src0 = gen_rtx_SUBREG (SImode, src, 0);
	rtx src1 = gen_rtx_SUBREG (SImode, src, 4);
	rtx dst0 = gen_rtx_SUBREG (SImode, dst, 0);
	rtx dst1 = gen_rtx_SUBREG (SImode, dst, 4);
	rtx tmp0 = gen_reg_rtx (SImode);
	rtx tmp1 = gen_reg_rtx (SImode);
	start_sequence ();
	emit_insn (gen_movsi (tmp0, src0));
	emit_insn (gen_movsi (tmp1, src1));
	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
	emit_insn (gen_movsi (dst0, tmp0));
	emit_insn (gen_movsi (dst1, tmp1));
	res = get_insns ();
	end_sequence ();
      }
      break;
    case E_V2DImode:
      {
	/* Vector of two DImode elements, analogous to the V2SImode
	   case with 8-byte elements.  */
	rtx src0 = gen_rtx_SUBREG (DImode, src, 0);
	rtx src1 = gen_rtx_SUBREG (DImode, src, 8);
	rtx dst0 = gen_rtx_SUBREG (DImode, dst, 0);
	rtx dst1 = gen_rtx_SUBREG (DImode, dst, 8);
	rtx tmp0 = gen_reg_rtx (DImode);
	rtx tmp1 = gen_reg_rtx (DImode);
	start_sequence ();
	emit_insn (gen_movdi (tmp0, src0));
	emit_insn (gen_movdi (tmp1, src1));
	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
	emit_insn (gen_movdi (dst0, tmp0));
	emit_insn (gen_movdi (dst1, tmp1));
	res = get_insns ();
	end_sequence ();
      }
      break;
    case E_BImode:
      {
	/* Predicates can't be shuffled directly: widen to 0/1 in
	   SImode, shuffle, and compare back to a predicate.  */
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
	emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
	res = get_insns ();
	end_sequence ();
      }
      break;
    case E_QImode:
    case E_HImode:
      {
	/* Sub-word values: zero-extend to SImode, shuffle, truncate
	   back to the original mode.  */
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
	emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
						    tmp)));
	res = get_insns ();
	end_sequence ();
      }
      break;

    default:
      gcc_unreachable ();
    }
  return res;
}
2092 1.1 mrg
2093 1.1 mrg /* Generate an instruction or sequence to broadcast register REG
2094 1.1 mrg across the vectors of a single warp. */
2095 1.1 mrg
2096 1.1 mrg static rtx
2097 1.1 mrg nvptx_gen_warp_bcast (rtx reg)
2098 1.1 mrg {
2099 1.1 mrg return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
2100 1.1 mrg }
2101 1.1 mrg
/* Structure used when generating a worker-level spill or fill.
   Threaded through nvptx_gen_shared_bcast to track where in the
   broadcast buffer the next value goes.  */

struct broadcast_data_t
{
  rtx base; /* Register holding base addr of buffer.  */
  rtx ptr; /* Iteration var, if needed.  */
  unsigned offset; /* Offset into worker buffer, in bytes.  */
};
2110 1.1 mrg
/* Direction of the spill/fill and looping setup/teardown indicator.  */

enum propagate_mask
  {
    PM_read = 1 << 0,		/* Read the register (spill to buffer).  */
    PM_write = 1 << 1,		/* Write the register (fill from buffer).  */
    PM_loop_begin = 1 << 2,	/* Set up a spill/fill loop.  */
    PM_loop_end = 1 << 3,	/* Tear down a spill/fill loop.  */

    PM_read_write = PM_read | PM_write
  };
2122 1.1 mrg
/* Generate instruction(s) to spill or fill register REG to/from the
   worker broadcast array.  PM indicates what is to be done, REP
   how many loop iterations will be executed (0 for not a loop).  */

static rtx
nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep,
			broadcast_data_t *data, bool vector)
{
  rtx res;
  machine_mode mode = GET_MODE (reg);

  switch (mode)
    {
    case E_BImode:
      {
	/* Predicates can't be transferred directly: widen to 0/1 in
	   SImode, recurse to transfer that, then compare back.  */
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	if (pm & PM_read)
	  emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
	emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector));
	if (pm & PM_write)
	  emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
	res = get_insns ();
	end_sequence ();
      }
      break;

    default:
      {
	rtx addr = data->ptr;

	if (!addr)
	  {
	    /* No iteration pointer: address the buffer directly at the
	       current offset, rounded up to the mode's alignment.  */
	    unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;

	    oacc_bcast_align = MAX (oacc_bcast_align, align);
	    data->offset = ROUND_UP (data->offset, align);
	    addr = data->base;
	    gcc_assert (data->base != NULL);
	    if (data->offset)
	      addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
	  }

	addr = gen_rtx_MEM (mode, addr);
	if (pm == PM_read)
	  res = gen_rtx_SET (addr, reg);	/* Spill.  */
	else if (pm == PM_write)
	  res = gen_rtx_SET (reg, addr);	/* Fill.  */
	else
	  gcc_unreachable ();

	if (data->ptr)
	  {
	    /* We're using a ptr, increment it.  */
	    start_sequence ();

	    emit_insn (res);
	    emit_insn (gen_adddi3 (data->ptr, data->ptr,
				   GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
	    res = get_insns ();
	    end_sequence ();
	  }
	else
	  rep = 1;
	/* Account for all iterations' worth of data in the buffer.  */
	data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
      }
      break;
    }
  return res;
}
2194 1.1 mrg
2195 1.1 mrg /* Returns true if X is a valid address for use in a memory reference. */
2197 1.1 mrg
2198 1.1 mrg static bool
2199 1.1 mrg nvptx_legitimate_address_p (machine_mode, rtx x, bool)
2200 1.1 mrg {
2201 1.1 mrg enum rtx_code code = GET_CODE (x);
2202 1.1 mrg
2203 1.1 mrg switch (code)
2204 1.1 mrg {
2205 1.1 mrg case REG:
2206 1.1 mrg return true;
2207 1.1 mrg
2208 1.1 mrg case PLUS:
2209 1.1 mrg if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
2210 1.1 mrg return true;
2211 1.1 mrg return false;
2212 1.1 mrg
2213 1.1 mrg case CONST:
2214 1.1 mrg case SYMBOL_REF:
2215 1.1 mrg case LABEL_REF:
2216 1.1 mrg return true;
2217 1.1 mrg
2218 1.1 mrg default:
2219 1.1 mrg return false;
2220 1.1 mrg }
2221 1.1 mrg }
2222 1.1 mrg
/* Machinery to output constant initializers.  When beginning an
   initializer, we decide on a fragment size (which is visible in ptx
   in the type used), and then all initializer data is buffered until
   a fragment is filled and ready to be written out.  Shared state
   between nvptx_assemble_decl_begin/value/end, nvptx_output_skip and
   output_init_frag.  */

static struct
{
  unsigned HOST_WIDE_INT mask; /* Mask for storing fragment.  */
  unsigned HOST_WIDE_INT val; /* Current fragment value.  */
  unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
				       out.  */
  unsigned size; /* Fragment size to accumulate.  */
  unsigned offset; /* Offset within current fragment.  */
  bool started; /* Whether we've output any initializer.  */
} init_frag;
2239 1.1 mrg
/* The current fragment is full, write it out.  SYM may provide a
   symbolic reference we should output, in which case the fragment
   value is the addend.  */

static void
output_init_frag (rtx sym)
{
  /* Open the initializer list on the first fragment, separate
     subsequent ones with commas.  */
  fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
  unsigned HOST_WIDE_INT val = init_frag.val;

  /* Reset the fragment state for the next accumulation.  */
  init_frag.started = true;
  init_frag.val = 0;
  init_frag.offset = 0;
  init_frag.remaining--;

  if (sym)
    {
      bool function = (SYMBOL_REF_DECL (sym)
		       && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
      /* Data symbols are wrapped in generic() to convert to the
	 generic address space; function symbols are emitted as-is.  */
      if (!function)
	fprintf (asm_out_file, "generic(");
      output_address (VOIDmode, sym);
      if (!function)
	fprintf (asm_out_file, ")");
      if (val)
	fprintf (asm_out_file, " + ");
    }

  /* Emit the numeric value, or the nonzero addend after a symbol.  */
  if (!sym || val)
    fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
}
2271 1.1 mrg
/* Add value VAL of size SIZE to the data we're emitting, and keep
   writing out chunks as they fill up.  */

static void
nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
{
  /* Remember the sign bit so we can sign-extend once VAL has been
     shifted out entirely (SIZE > host-wide-int width).  */
  bool negative_p
    = val & (HOST_WIDE_INT_1U << (HOST_BITS_PER_WIDE_INT - 1));

  /* Avoid undefined behaviour.  */
  if (size * BITS_PER_UNIT < HOST_BITS_PER_WIDE_INT)
    val &= (HOST_WIDE_INT_1U << (size * BITS_PER_UNIT)) - 1;

  /* PART is the number of bytes consumed in the previous iteration
     (zero on the first pass, so the shift below is a no-op then).  */
  for (unsigned part = 0; size; size -= part)
    {
      if (part * BITS_PER_UNIT == HOST_BITS_PER_WIDE_INT)
	/* Avoid undefined behaviour.  */
	val = negative_p ? -1 : 0;
      else
	val >>= (part * BITS_PER_UNIT);
      /* Consume no more than the space left in the current fragment.  */
      part = init_frag.size - init_frag.offset;
      part = MIN (part, size);

      /* Merge the low bytes of VAL into the fragment at its current
	 offset.  */
      unsigned HOST_WIDE_INT partial
	= val << (init_frag.offset * BITS_PER_UNIT);
      init_frag.val |= partial & init_frag.mask;
      init_frag.offset += part;

      if (init_frag.offset == init_frag.size)
	output_init_frag (NULL);
    }
}
2304 1.1 mrg
/* Target hook for assembling integer object X of size SIZE.  Returns
   true when the object was handled here, false to defer to the generic
   machinery.  */

static bool
nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
{
  HOST_WIDE_INT val = 0;

  switch (GET_CODE (x))
    {
    default:
      /* Let the generic machinery figure it out, usually for a
	 CONST_WIDE_INT.  */
      return false;

    case CONST_INT:
      nvptx_assemble_value (INTVAL (x), size);
      break;

    case CONST:
      /* Symbol plus constant addend: peel off the addend, then handle
	 the symbol below.  */
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == PLUS);
      val = INTVAL (XEXP (x, 1));
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == SYMBOL_REF);
      gcc_fallthrough (); /* FALLTHROUGH */

    case SYMBOL_REF:
      /* A symbol must occupy exactly one aligned fragment.  */
      gcc_assert (size == init_frag.size);
      if (init_frag.offset)
	sorry ("cannot emit unaligned pointers in ptx assembly");

      nvptx_maybe_record_fnsym (x);
      init_frag.val = val;
      output_init_frag (x);
      break;
    }

  return true;
}
2344 1.1 mrg
/* Output SIZE zero bytes.  We ignore the FILE argument since the
   functions we're calling to perform the output just use
   asm_out_file.  */

void
nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
{
  /* Finish the current fragment, if it's started.  */
  if (init_frag.offset)
    {
      unsigned part = init_frag.size - init_frag.offset;
      part = MIN (part, (unsigned)size);
      size -= part;
      nvptx_assemble_value (0, part);
    }

  /* If this skip doesn't terminate the initializer, write as many
     remaining pieces as possible directly.  (A terminating skip is
     left implicit; nvptx_assemble_decl_end closes the list.)  */
  if (size < init_frag.remaining * init_frag.size)
    {
      /* Whole zero fragments first...  */
      while (size >= init_frag.size)
	{
	  size -= init_frag.size;
	  output_init_frag (NULL_RTX);
	}
      /* ...then any partial trailing fragment.  */
      if (size)
	nvptx_assemble_value (0, size);
    }
}
2374 1.1 mrg
2375 1.1 mrg /* Output a string STR with length SIZE. As in nvptx_output_skip we
2376 1.1 mrg ignore the FILE arg. */
2377 1.1 mrg
2378 1.1 mrg void
2379 1.1 mrg nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2380 1.1 mrg {
2381 1.1 mrg for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2382 1.1 mrg nvptx_assemble_value (str[i], 1);
2383 1.1 mrg }
2384 1.1 mrg
2385 1.1 mrg /* Return true if TYPE is a record type where the last field is an array without
2386 1.1 mrg given dimension. */
2387 1.1 mrg
2388 1.1 mrg static bool
2389 1.1 mrg flexible_array_member_type_p (const_tree type)
2390 1.1 mrg {
2391 1.1 mrg if (TREE_CODE (type) != RECORD_TYPE)
2392 1.1 mrg return false;
2393 1.1 mrg
2394 1.1 mrg const_tree last_field = NULL_TREE;
2395 1.1 mrg for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
2396 1.1 mrg last_field = f;
2397 1.1 mrg
2398 1.1 mrg if (!last_field)
2399 1.1 mrg return false;
2400 1.1 mrg
2401 1.1 mrg const_tree last_field_type = TREE_TYPE (last_field);
2402 1.1 mrg if (TREE_CODE (last_field_type) != ARRAY_TYPE)
2403 1.1 mrg return false;
2404 1.1 mrg
2405 1.1 mrg return (! TYPE_DOMAIN (last_field_type)
2406 1.1 mrg || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
2407 1.1 mrg }
2408 1.1 mrg
/* Emit a PTX variable decl and prepare for emission of its
   initializer.  NAME is the symbol name and SECTION the PTX data
   area.  The type is TYPE, object size SIZE and alignment is ALIGN.
   The caller has already emitted any indentation and linkage
   specifier.  It is responsible for any initializer, terminating ;
   and newline.  SIZE is in bytes, ALIGN is in bits -- confusingly
   this is the opposite way round that PTX wants them!  */

static void
nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
			   const_tree type, HOST_WIDE_INT size, unsigned align,
			   bool undefined = false)
{
  /* An array type without a domain is emitted with an empty
     dimension, "[]".  */
  bool atype = (TREE_CODE (type) == ARRAY_TYPE)
    && (TYPE_DOMAIN (type) == NULL_TREE);

  /* An undefined decl ending in a flexible array member is likewise
     emitted with "[]" and no size.  */
  if (undefined && flexible_array_member_type_p (type))
    {
      size = 0;
      atype = true;
    }

  /* Strip array dimensions to reach the element type.  */
  while (TREE_CODE (type) == ARRAY_TYPE)
    type = TREE_TYPE (type);

  if (TREE_CODE (type) == VECTOR_TYPE
      || TREE_CODE (type) == COMPLEX_TYPE)
    /* Neither vector nor complex types can contain the other.  */
    type = TREE_TYPE (type);

  unsigned HOST_WIDE_INT elt_size = int_size_in_bytes (type);

  /* Largest mode we're prepared to accept.  For BLKmode types we
     don't know if it'll contain pointer constants, so have to choose
     pointer size, otherwise we can choose DImode.  */
  machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;

  /* Fragment size: the lowest set bit of the element size, capped by
     ELT_MODE's size -- a power of two dividing both.  */
  elt_size |= GET_MODE_SIZE (elt_mode);
  elt_size &= -elt_size; /* Extract LSB set.  */

  init_frag.size = elt_size;
  /* Avoid undefined shift behavior by using '2'.  */
  init_frag.mask = ((unsigned HOST_WIDE_INT)2
		    << (elt_size * BITS_PER_UNIT - 1)) - 1;
  init_frag.val = 0;
  init_frag.offset = 0;
  init_frag.started = false;
  /* Size might not be a multiple of elt size, if there's an
     initialized trailing struct array with smaller type than
     elt_size.  */
  init_frag.remaining = (size + elt_size - 1) / elt_size;

  fprintf (file, "%s .align %d .u" HOST_WIDE_INT_PRINT_UNSIGNED " ",
	   section, align / BITS_PER_UNIT,
	   elt_size * BITS_PER_UNIT);
  assemble_name (file, name);

  if (size)
    /* We make everything an array, to simplify any initialization
       emission.  */
    fprintf (file, "[" HOST_WIDE_INT_PRINT_UNSIGNED "]", init_frag.remaining);
  else if (atype)
    fprintf (file, "[]");
}
2473 1.1 mrg
2474 1.1 mrg /* Called when the initializer for a decl has been completely output through
2475 1.1 mrg combinations of the three functions above. */
2476 1.1 mrg
2477 1.1 mrg static void
2478 1.1 mrg nvptx_assemble_decl_end (void)
2479 1.1 mrg {
2480 1.1 mrg if (init_frag.offset)
2481 1.1 mrg /* This can happen with a packed struct with trailing array member. */
2482 1.1 mrg nvptx_assemble_value (0, init_frag.size - init_frag.offset);
2483 1.1 mrg fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
2484 1.1 mrg }
2485 1.1 mrg
2486 1.1 mrg /* Output an uninitialized common or file-scope variable. */
2487 1.1 mrg
2488 1.1 mrg void
2489 1.1 mrg nvptx_output_aligned_decl (FILE *file, const char *name,
2490 1.1 mrg const_tree decl, HOST_WIDE_INT size, unsigned align)
2491 1.1 mrg {
2492 1.1 mrg write_var_marker (file, true, TREE_PUBLIC (decl), name);
2493 1.1 mrg
2494 1.1 mrg /* If this is public, it is common. The nearest thing we have to
2495 1.1 mrg common is weak. */
2496 1.1 mrg fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
2497 1.1 mrg
2498 1.1 mrg nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2499 1.1 mrg TREE_TYPE (decl), size, align);
2500 1.1 mrg nvptx_assemble_decl_end ();
2501 1.1 mrg }
2502 1.1 mrg
2503 1.1 mrg /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
2504 1.1 mrg writing a constant variable EXP with NAME and SIZE and its
2505 1.1 mrg initializer to FILE. */
2506 1.1 mrg
2507 1.1 mrg static void
2508 1.1 mrg nvptx_asm_declare_constant_name (FILE *file, const char *name,
2509 1.1 mrg const_tree exp, HOST_WIDE_INT obj_size)
2510 1.1 mrg {
2511 1.1 mrg write_var_marker (file, true, false, name);
2512 1.1 mrg
2513 1.1 mrg fprintf (file, "\t");
2514 1.1 mrg
2515 1.1 mrg tree type = TREE_TYPE (exp);
2516 1.1 mrg nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
2517 1.1 mrg TYPE_ALIGN (type));
2518 1.1 mrg }
2519 1.1 mrg
2520 1.1 mrg /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
2521 1.1 mrg a variable DECL with NAME to FILE. */
2522 1.1 mrg
2523 1.1 mrg void
2524 1.1 mrg nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
2525 1.1 mrg {
2526 1.1 mrg write_var_marker (file, true, TREE_PUBLIC (decl), name);
2527 1.1 mrg
2528 1.1 mrg fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
2529 1.1 mrg : DECL_WEAK (decl) ? ".weak " : ".visible "));
2530 1.1 mrg
2531 1.1 mrg tree type = TREE_TYPE (decl);
2532 1.1 mrg HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
2533 1.1 mrg nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2534 1.1 mrg type, obj_size, DECL_ALIGN (decl));
2535 1.1 mrg }
2536 1.1 mrg
/* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing.  */

static void
nvptx_globalize_label (FILE *, const char *)
{
  /* Intentionally empty: visibility is emitted together with the
     declaration itself (see nvptx_declare_object_name's .visible/.weak
     handling), so there is no separate globalize directive in PTX.  */
}
2543 1.1 mrg
/* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL.  Write an extern
   declaration only for variable DECL with NAME to FILE.  */

static void
nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
{
  /* The middle end can place constant pool decls into the varpool as
     undefined.  Until that is fixed, catch the problem here.  */
  if (DECL_IN_CONSTANT_POOL (decl))
    return;

  /* We support weak definitions, and hence have the right
     ASM_WEAKEN_DECL definition.  Diagnose the problem here.  */
  if (DECL_WEAK (decl))
    error_at (DECL_SOURCE_LOCATION (decl),
	      "PTX does not support weak declarations"
	      " (only weak definitions)");
  write_var_marker (file, false, TREE_PUBLIC (decl), name);

  fprintf (file, "\t.extern ");
  tree size = DECL_SIZE_UNIT (decl);
  /* An incomplete type has no DECL_SIZE_UNIT; pass 0 in that case.  */
  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
			     TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
			     DECL_ALIGN (decl), true);
  nvptx_assemble_decl_end ();
}
2570 1.1 mrg
/* Output a pattern for a move instruction from SRC to DST.  Returns
   the asm template string; the %-escapes are expanded by
   nvptx_print_operand.  */

const char *
nvptx_output_mov_insn (rtx dst, rtx src)
{
  machine_mode dst_mode = GET_MODE (dst);
  machine_mode src_mode = GET_MODE (src);
  /* For SUBREGs, look through to the mode of the underlying register.
     NOTE(review): the non-SUBREG fallback for SRC_INNER uses dst_mode,
     not src_mode -- looks deliberate but worth confirming.  */
  machine_mode dst_inner = (GET_CODE (dst) == SUBREG
			    ? GET_MODE (XEXP (dst, 0)) : dst_mode);
  machine_mode src_inner = (GET_CODE (src) == SUBREG
			    ? GET_MODE (XEXP (src, 0)) : dst_mode);

  /* A symbol in a non-generic data area needs cvta to convert its
     address into the generic address space.  */
  rtx sym = src;
  if (GET_CODE (sym) == CONST)
    sym = XEXP (XEXP (sym, 0), 0);
  if (SYMBOL_REF_P (sym))
    {
      if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
	return "%.\tcvta%D1%t0\t%0, %1;";
      nvptx_maybe_record_fnsym (sym);
    }

  /* Same inner mode on both sides: a plain typed move.  */
  if (src_inner == dst_inner)
    return "%.\tmov%t0\t%0, %1;";

  if (CONSTANT_P (src))
    return (GET_MODE_CLASS (dst_inner) == MODE_INT
	    && GET_MODE_CLASS (src_inner) != MODE_FLOAT
	    ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");

  /* Same size but different modes: emit a bit-cast move (mov.b).  */
  if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
    {
      if (GET_MODE_BITSIZE (dst_mode) == 128
	  && GET_MODE_BITSIZE (src_mode) == 128)
	{
	  /* mov.b128 is not supported.  */
	  if (dst_inner == V2DImode && src_inner == TImode)
	    return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
	  else if (dst_inner == TImode && src_inner == V2DImode)
	    return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";

	  gcc_unreachable ();
	}
      return "%.\tmov.b%T0\t%0, %1;";
    }

  /* A 64-bit reference into a 128-bit register: also a bit-cast.  */
  if (GET_MODE_BITSIZE (src_inner) == 128
      && GET_MODE_BITSIZE (src_mode) == 64)
    return "%.\tmov.b%T0\t%0, %1;";

  /* Otherwise a genuine conversion is required.  */
  return "%.\tcvt%t0%t1\t%0, %1;";
}
2623 1.1 mrg
/* Output a pre/post barrier for MEM_OPERAND according to MEMMODEL.
   PRE_P selects which position is being emitted; the acquire/release
   semantics decide whether a membar is needed there.  */

static void
nvptx_output_barrier (rtx *mem_operand, int memmodel, bool pre_p)
{
  bool post_p = !pre_p;

  /* 'break' falls out of the switch and emits the membar; 'return'
     means no barrier is required at this position.  */
  switch (memmodel)
    {
    case MEMMODEL_RELAXED:
      return;
    case MEMMODEL_CONSUME:
    case MEMMODEL_ACQUIRE:
    case MEMMODEL_SYNC_ACQUIRE:
      /* Acquire: barrier only after the access.  */
      if (post_p)
	break;
      return;
    case MEMMODEL_RELEASE:
    case MEMMODEL_SYNC_RELEASE:
      /* Release: barrier only before the access.  */
      if (pre_p)
	break;
      return;
    case MEMMODEL_ACQ_REL:
    case MEMMODEL_SEQ_CST:
    case MEMMODEL_SYNC_SEQ_CST:
      /* Fully ordered: barrier on both sides.  */
      if (pre_p || post_p)
	break;
      return;
    default:
      gcc_unreachable ();
    }

  output_asm_insn ("%.\tmembar%B0;", mem_operand);
}
2658 1.1 mrg
/* Output the atomic insn ASM_TEMPLATE for OPERANDS, bracketed by
   whatever memory barriers the memory model (the CONST_INT operand at
   index MEMMODEL_POS) requires; MEM_POS indexes the memory operand
   used for the membar's address-space modifier.  */

const char *
nvptx_output_atomic_insn (const char *asm_template, rtx *operands, int mem_pos,
			  int memmodel_pos)
{
  nvptx_output_barrier (&operands[mem_pos], INTVAL (operands[memmodel_pos]),
			true);
  output_asm_insn (asm_template, operands);
  nvptx_output_barrier (&operands[mem_pos], INTVAL (operands[memmodel_pos]),
			false);
  /* Everything was emitted via output_asm_insn; nothing remains for
     the caller to print.  */
  return "";
}
2670 1.1 mrg
2671 1.1 mrg static void nvptx_print_operand (FILE *, rtx, int);
2672 1.1 mrg
/* Output INSN, which is a call to CALLEE with result RESULT.  For ptx, this
   involves writing .param declarations and in/out copies into them.  For
   indirect calls, also write the .callprototype.  */

const char *
nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
{
  char buf[16];
  static int labelno;
  /* An indirect call (callee in a register) needs a prototype label.  */
  bool needs_tgt = register_operand (callee, Pmode);
  rtx pat = PATTERN (insn);
  if (GET_CODE (pat) == COND_EXEC)
    pat = COND_EXEC_CODE (pat);
  int arg_end = XVECLEN (pat, 0);
  tree decl = NULL_TREE;

  /* Open a brace block scoping the .param temporaries.  */
  fprintf (asm_out_file, "\t{\n");
  if (result != NULL)
    /* Declare the .param temporary receiving the return value.  */
    fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
	     nvptx_ptx_type_from_mode (GET_MODE (result), false),
	     reg_names[NVPTX_RETURN_REGNUM]);

  /* Ensure we have a ptx declaration in the output if necessary.  */
  if (GET_CODE (callee) == SYMBOL_REF)
    {
      decl = SYMBOL_REF_DECL (callee);
      if (!decl
	  || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
	nvptx_record_libfunc (callee, result, pat);
      else if (DECL_EXTERNAL (decl))
	nvptx_record_fndecl (decl);
    }

  if (needs_tgt)
    {
      /* Emit a local label and the .callprototype describing the
	 indirect call's signature.  */
      ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
      labelno++;
      ASM_OUTPUT_LABEL (asm_out_file, buf);
      std::stringstream s;
      write_fn_proto_from_insn (s, NULL, result, pat);
      fputs (s.str().c_str(), asm_out_file);
    }

  /* Copy each argument into its own .param temporary.  */
  for (int argno = 1; argno < arg_end; argno++)
    {
      rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
      machine_mode mode = GET_MODE (t);
      const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);

      /* Mode splitting has already been done.  */
      fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
	       "\t\tst.param%s [%%out_arg%d], ",
	       ptx_type, argno, ptx_type, argno);
      output_reg (asm_out_file, REGNO (t), VOIDmode);
      fprintf (asm_out_file, ";\n");
    }

  /* The '.' stands for the call's predicate, if any.  */
  nvptx_print_operand (asm_out_file, NULL_RTX, '.');
  fprintf (asm_out_file, "\t\tcall ");
  if (result != NULL_RTX)
    fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);

  if (decl)
    {
      /* Direct call: emit the (possibly replaced) function name.  */
      char *replaced_dots = NULL;
      const char *name = get_fnname_from_decl (decl);
      const char *replacement = nvptx_name_replacement (name);
      if (replacement != name)
	name = replacement;
      else
	{
	  /* Rewrite '.' in the name -- presumably because '.' is not
	     valid in a PTX identifier.  */
	  replaced_dots = nvptx_replace_dot (name);
	  if (replaced_dots)
	    name = replaced_dots;
	}
      assemble_name (asm_out_file, name);
      if (replaced_dots)
	XDELETE (replaced_dots);
    }
  else
    output_address (VOIDmode, callee);

  /* Emit the argument list; OPEN tracks whether '(' is still owed.  */
  const char *open = "(";
  for (int argno = 1; argno < arg_end; argno++)
    {
      fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
      open = "";
    }
  if (decl && DECL_STATIC_CHAIN (decl))
    {
      /* Pass the static chain as a trailing argument.  */
      fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
      open = "";
    }
  if (!open[0])
    fprintf (asm_out_file, ")");

  if (needs_tgt)
    {
      /* Reference the .callprototype label emitted above.  */
      fprintf (asm_out_file, ", ");
      assemble_name (asm_out_file, buf);
    }
  fprintf (asm_out_file, ";\n");

  if (find_reg_note (insn, REG_NORETURN, NULL))
    {
      /* No return functions confuse the PTX JIT, as it doesn't realize
	 the flow control barrier they imply.  It can seg fault if it
	 encounters what looks like an unexitable loop.  Emit a trailing
	 trap and exit, which it does grok.  */
      fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
      fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
    }

  if (result)
    {
      /* Return the template that loads the result back out of the
	 .param temporary and closes the brace block; it is built once
	 and cached in a static buffer.  */
      static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];

      if (!rval[0])
	/* We must escape the '%' that starts RETURN_REGNUM.  */
	sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
		 reg_names[NVPTX_RETURN_REGNUM]);
      return rval;
    }

  return "}";
}
2800 1.1 mrg
/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.  Only the '.'
   (predicate) and '#' (rounding mode) punctuation codes are handled
   by nvptx_print_operand.  */

static bool
nvptx_print_operand_punct_valid_p (unsigned char c)
{
  switch (c)
    {
    case '.':
    case '#':
      return true;
    default:
      return false;
    }
}
2808 1.1 mrg
/* Subroutine of nvptx_print_operand; used to print a memory reference X to
   FILE.  */

static void
nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
{
  rtx off;
  if (GET_CODE (x) == CONST)
    x = XEXP (x, 0);
  switch (GET_CODE (x))
    {
    case PLUS:
      /* Emit base '+' offset explicitly, so a negative offset comes
	 out as "x+-8" rather than "x-8" (which ptxas rejects -- see
	 nvptx_print_operand).  */
      off = XEXP (x, 1);
      output_address (VOIDmode, XEXP (x, 0));
      fprintf (file, "+");
      output_address (VOIDmode, off);
      break;

    case SYMBOL_REF:
    case LABEL_REF:
      output_addr_const (file, x);
      break;

    default:
      /* Anything else (registers, constants) prints as an operand;
	 a nested MEM must never appear inside an address.  */
      gcc_assert (GET_CODE (x) != MEM);
      nvptx_print_operand (file, x, 0);
      break;
    }
}
2837 1.1 mrg
/* Write assembly language output for the address ADDR to FILE.  */

static void
nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
{
  /* Thin wrapper: all the work happens in nvptx_print_address_operand.  */
  nvptx_print_address_operand (file, addr, mode);
}
2845 1.1 mrg
/* Return the PTX data area referenced by the MEM rtx X, determined by
   scanning its address for a SYMBOL_REF carrying a recorded data
   area.  An address containing no symbol is treated as generic.  */

static nvptx_data_area
nvptx_mem_data_area (const_rtx x)
{
  gcc_assert (GET_CODE (x) == MEM);

  const_rtx addr = XEXP (x, 0);
  /* Walk every sub-rtx of the address; the first SYMBOL_REF found
     decides the data area.  */
  subrtx_iterator::array_type array;
  FOR_EACH_SUBRTX (iter, array, addr, ALL)
    if (SYMBOL_REF_P (*iter))
      return SYMBOL_DATA_AREA (*iter);

  return DATA_AREA_GENERIC;
}
2859 1.1 mrg
2860 1.1 mrg bool
2861 1.1 mrg nvptx_mem_maybe_shared_p (const_rtx x)
2862 1.1 mrg {
2863 1.1 mrg nvptx_data_area area = nvptx_mem_data_area (x);
2864 1.1 mrg return area == DATA_AREA_SHARED || area == DATA_AREA_GENERIC;
2865 1.1 mrg }
2866 1.1 mrg
2867 1.1 mrg /* Print an operand, X, to FILE, with an optional modifier in CODE.
2868 1.1 mrg
2869 1.1 mrg Meaning of CODE:
   . -- print the predicate for the instruction or an empty string for an
   unconditional one.
2872 1.1 mrg # -- print a rounding mode for the instruction
2873 1.1 mrg
2874 1.1 mrg A -- print a data area for a MEM
2875 1.1 mrg c -- print an opcode suffix for a comparison operator, including a type code
2876 1.1 mrg D -- print a data area for a MEM operand
2877 1.1 mrg S -- print a shuffle kind specified by CONST_INT
2878 1.1 mrg t -- print a type opcode suffix, promoting QImode to 32 bits
2879 1.1 mrg T -- print a type size in bits
2880 1.1 mrg u -- print a type opcode suffix without promotions.
2881 1.1 mrg x -- print a destination operand that may also be a bit bucket. */
2882 1.1 mrg
static void
nvptx_print_operand (FILE *file, rtx x, int code)
{
  if (code == '.')
    {
      /* Print the insn's predicate (if any); X is ignored for this
	 code.  An EQ predicate prints negated ('@!').  */
      x = current_insn_predicate;
      if (x)
	{
	  fputs ("@", file);
	  if (GET_CODE (x) == EQ)
	    fputs ("!", file);
	  output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
	}
      return;
    }
  else if (code == '#')
    {
      /* Rounding-mode suffix.  */
      fputs (".rn", file);
      return;
    }

  enum rtx_code x_code = GET_CODE (x);
  machine_mode mode = GET_MODE (x);

  switch (code)
    {
    case 'x':
      /* A destination whose value is unused prints as '_', PTX's bit
	 bucket.  */
      if (current_output_insn != NULL
	  && find_reg_note (current_output_insn, REG_UNUSED, x) != NULL_RTX)
	{
	  fputs ("_", file);
	  return;
	}
      goto common;
    case 'B':
      /* Scope suffix for a membar, chosen from the MEM's data area.  */
      if (SYMBOL_REF_P (XEXP (x, 0)))
	switch (SYMBOL_DATA_AREA (XEXP (x, 0)))
	  {
	  case DATA_AREA_GENERIC:
	    /* Assume worst-case: global.  */
	    gcc_fallthrough (); /* FALLTHROUGH.  */
	  case DATA_AREA_GLOBAL:
	    break;
	  case DATA_AREA_SHARED:
	    fputs (".cta", file);
	    return;
	  case DATA_AREA_LOCAL:
	  case DATA_AREA_CONST:
	  case DATA_AREA_PARAM:
	  default:
	    gcc_unreachable ();
	  }

      /* There are 2 cases where membar.sys differs from membar.gl:
	 - host accesses global memory (f.i. systemwide atomics)
	 - 2 or more devices are setup in peer-to-peer mode, and one
	 peer can access global memory of other peer.
	 Neither are currently supported by openMP/OpenACC on nvptx, but
	 that could change, so we default to membar.sys.  We could support
	 this more optimally by adding DATA_AREA_SYS and then emitting
	 .gl for DATA_AREA_GLOBAL and .sys for DATA_AREA_SYS.  */
      fputs (".sys", file);
      return;

    case 'A':
      /* 'A' is like 'D' but for a MEM: start from its address.  */
      x = XEXP (x, 0);
      gcc_fallthrough (); /* FALLTHROUGH.  */

    case 'D':
      /* Strip CONST/PLUS wrappers to reach the underlying symbol.  */
      if (GET_CODE (x) == CONST)
	x = XEXP (x, 0);
      if (GET_CODE (x) == PLUS)
	x = XEXP (x, 0);

      if (GET_CODE (x) == SYMBOL_REF)
	fputs (section_for_sym (x), file);
      break;

    case 't':
    case 'u':
      /* Type suffix ('t' promotes QImode; 'u' does not).  For a
	 SUBREG, pick the mode the underlying register is held in.  */
      if (x_code == SUBREG)
	{
	  machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
	  if (VECTOR_MODE_P (inner_mode)
	      && (GET_MODE_SIZE (mode)
		  <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
	    mode = GET_MODE_INNER (inner_mode);
	  else if (split_mode_p (inner_mode))
	    mode = maybe_split_mode (inner_mode);
	  else
	    mode = inner_mode;
	}
      fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
      break;

    case 'H':
    case 'L':
      {
	/* Print the high ('H') or low ('L') half of a split register.  */
	rtx inner_x = SUBREG_REG (x);
	machine_mode inner_mode = GET_MODE (inner_x);
	machine_mode split = maybe_split_mode (inner_mode);

	output_reg (file, REGNO (inner_x), split,
		    (code == 'H'
		     ? GET_MODE_SIZE (inner_mode) / 2
		     : 0));
      }
      break;

    case 'S':
      {
	nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
	/* Same order as nvptx_shuffle_kind.  */
	static const char *const kinds[] =
	  {".up", ".down", ".bfly", ".idx"};
	fputs (kinds[kind], file);
      }
      break;

    case 'T':
      fprintf (file, "%d", GET_MODE_BITSIZE (mode));
      break;

    case 'j':
      /* Positive branch predicate prefix.  */
      fprintf (file, "@");
      goto common;

    case 'J':
      /* Negated branch predicate prefix.  */
      fprintf (file, "@!");
      goto common;

    case 'c':
      /* Comparison opcode suffix, typed by the first operand's mode.  */
      mode = GET_MODE (XEXP (x, 0));
      switch (x_code)
	{
	case EQ:
	  fputs (".eq", file);
	  break;
	case NE:
	  if (FLOAT_MODE_P (mode))
	    fputs (".neu", file);
	  else
	    fputs (".ne", file);
	  break;
	case LE:
	case LEU:
	  fputs (".le", file);
	  break;
	case GE:
	case GEU:
	  fputs (".ge", file);
	  break;
	case LT:
	case LTU:
	  fputs (".lt", file);
	  break;
	case GT:
	case GTU:
	  fputs (".gt", file);
	  break;
	case LTGT:
	  fputs (".ne", file);
	  break;
	case UNEQ:
	  fputs (".equ", file);
	  break;
	case UNLE:
	  fputs (".leu", file);
	  break;
	case UNGE:
	  fputs (".geu", file);
	  break;
	case UNLT:
	  fputs (".ltu", file);
	  break;
	case UNGT:
	  fputs (".gtu", file);
	  break;
	case UNORDERED:
	  fputs (".nan", file);
	  break;
	case ORDERED:
	  fputs (".num", file);
	  break;
	default:
	  gcc_unreachable ();
	}
      if (FLOAT_MODE_P (mode)
	  || x_code == EQ || x_code == NE
	  || x_code == GEU || x_code == GTU
	  || x_code == LEU || x_code == LTU)
	fputs (nvptx_ptx_type_from_mode (mode, true), file);
      else
	fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
      break;
    default:
    common:
      switch (x_code)
	{
	case SUBREG:
	  {
	    rtx inner_x = SUBREG_REG (x);
	    machine_mode inner_mode = GET_MODE (inner_x);
	    machine_mode split = maybe_split_mode (inner_mode);

	    if (VECTOR_MODE_P (inner_mode)
		&& (GET_MODE_SIZE (mode)
		    <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
	      {
		/* Element of a two-element vector register: .x or .y.  */
		output_reg (file, REGNO (inner_x), VOIDmode);
		fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
	      }
	    else if (split_mode_p (inner_mode)
		     && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
	      output_reg (file, REGNO (inner_x), split);
	    else
	      output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
	  }
	  break;

	case REG:
	  output_reg (file, REGNO (x), maybe_split_mode (mode));
	  break;

	case MEM:
	  fputc ('[', file);
	  nvptx_print_address_operand (file, XEXP (x, 0), mode);
	  fputc (']', file);
	  break;

	case CONST_INT:
	  output_addr_const (file, x);
	  break;

	case CONST:
	case SYMBOL_REF:
	case LABEL_REF:
	  /* We could use output_addr_const, but that can print things like
	     "x-8", which breaks ptxas.  Need to ensure it is output as
	     "x+-8".  */
	  nvptx_print_address_operand (file, x, VOIDmode);
	  break;

	case CONST_DOUBLE:
	  /* Print the bit pattern as a PTX 0f/0d hex float literal.  */
	  long vals[2];
	  real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
	  vals[0] &= 0xffffffff;
	  vals[1] &= 0xffffffff;
	  if (mode == SFmode)
	    fprintf (file, "0f%08lx", vals[0]);
	  else
	    fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
	  break;

	case CONST_VECTOR:
	  {
	    unsigned n = CONST_VECTOR_NUNITS (x);
	    fprintf (file, "{ ");
	    for (unsigned i = 0; i < n; ++i)
	      {
		if (i != 0)
		  fprintf (file, ", ");

		rtx elem = CONST_VECTOR_ELT (x, i);
		output_addr_const (file, elem);
	      }
	    fprintf (file, " }");
	  }
	  break;

	default:
	  output_addr_const (file, x);
	}
    }
}
3158 1.1 mrg
/* Record replacement regs used to deal with subreg operands.  */
struct reg_replace
{
  rtx replacement[MAX_RECOG_OPERANDS];	/* Pool of replacement pseudos.  */
  machine_mode mode;			/* Mode shared by all regs in the pool.  */
  int n_allocated;			/* Pseudos created so far.  */
  int n_in_use;				/* Pseudos handed out for the current insn.  */
};
3168 1.1 mrg
3169 1.1 mrg /* Allocate or reuse a replacement in R and return the rtx. */
3170 1.1 mrg
3171 1.1 mrg static rtx
3172 1.1 mrg get_replacement (struct reg_replace *r)
3173 1.1 mrg {
3174 1.1 mrg if (r->n_allocated == r->n_in_use)
3175 1.1 mrg r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
3176 1.1 mrg return r->replacement[r->n_in_use++];
3177 1.1 mrg }
3178 1.1 mrg
/* Clean up subreg operands.  In ptx assembly, everything is typed, and
   the presence of subregs would break the rules for most instructions.
   Replace them with a suitable new register of the right size, plus
   conversion copyin/copyout instructions.  */

static void
nvptx_reorg_subreg (void)
{
  /* One replacement pool per integer mode a subreg may need.  */
  struct reg_replace qiregs, hiregs, siregs, diregs;
  rtx_insn *insn, *next;

  qiregs.n_allocated = 0;
  hiregs.n_allocated = 0;
  siregs.n_allocated = 0;
  diregs.n_allocated = 0;
  qiregs.mode = QImode;
  hiregs.mode = HImode;
  siregs.mode = SImode;
  diregs.mode = DImode;

  for (insn = get_insns (); insn; insn = next)
    {
      next = NEXT_INSN (insn);
      /* Skip debug insns, inline asm, and USE/CLOBBER markers.  */
      if (!NONDEBUG_INSN_P (insn)
	  || asm_noperands (PATTERN (insn)) >= 0
	  || GET_CODE (PATTERN (insn)) == USE
	  || GET_CODE (PATTERN (insn)) == CLOBBER)
	continue;

      /* The replacement regs are reused from insn to insn.  */
      qiregs.n_in_use = 0;
      hiregs.n_in_use = 0;
      siregs.n_in_use = 0;
      diregs.n_in_use = 0;
      extract_insn (insn);
      enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);

      for (int i = 0; i < recog_data.n_operands; i++)
	{
	  rtx op = recog_data.operand[i];
	  if (GET_CODE (op) != SUBREG)
	    continue;

	  rtx inner = SUBREG_REG (op);

	  machine_mode outer_mode = GET_MODE (op);
	  machine_mode inner_mode = GET_MODE (inner);
	  gcc_assert (s_ok);
	  /* Subregs whose inner mode is at least as wide as the outer
	     mode are left alone.  */
	  if (s_ok
	      && (GET_MODE_PRECISION (inner_mode)
		  >= GET_MODE_PRECISION (outer_mode)))
	    continue;
	  gcc_assert (SCALAR_INT_MODE_P (outer_mode));
	  struct reg_replace *r = (outer_mode == QImode ? &qiregs
				   : outer_mode == HImode ? &hiregs
				   : outer_mode == SImode ? &siregs
				   : &diregs);
	  rtx new_reg = get_replacement (r);

	  if (recog_data.operand_type[i] != OP_OUT)
	    {
	      /* Input (or in/out) operand: copy the subreg's value
		 into the replacement before the insn.  */
	      enum rtx_code code;
	      if (GET_MODE_PRECISION (inner_mode)
		  < GET_MODE_PRECISION (outer_mode))
		code = ZERO_EXTEND;
	      else
		code = TRUNCATE;

	      rtx pat = gen_rtx_SET (new_reg,
				     gen_rtx_fmt_e (code, outer_mode, inner));
	      emit_insn_before (pat, insn);
	    }

	  if (recog_data.operand_type[i] != OP_IN)
	    {
	      /* Output (or in/out) operand: copy the result from the
		 replacement back to the original reg after the insn.  */
	      enum rtx_code code;
	      if (GET_MODE_PRECISION (inner_mode)
		  < GET_MODE_PRECISION (outer_mode))
		code = TRUNCATE;
	      else
		code = ZERO_EXTEND;

	      rtx pat = gen_rtx_SET (inner,
				     gen_rtx_fmt_e (code, inner_mode, new_reg));
	      emit_insn_after (pat, insn);
	    }
	  /* Finally, substitute the replacement into the operand.  */
	  validate_change (insn, recog_data.operand_loc[i], new_reg, false);
	}
    }
}
3268 1.1 mrg
3269 1.1 mrg /* Return a SImode "master lane index" register for uniform-simt, allocating on
3270 1.1 mrg first use. */
3271 1.1 mrg
3272 1.1 mrg static rtx
3273 1.1 mrg nvptx_get_unisimt_master ()
3274 1.1 mrg {
3275 1.1 mrg rtx &master = cfun->machine->unisimt_master;
3276 1.1 mrg return master ? master : master = gen_reg_rtx (SImode);
3277 1.1 mrg }
3278 1.1 mrg
3279 1.1 mrg /* Return a BImode "predicate" register for uniform-simt, similar to above. */
3280 1.1 mrg
3281 1.1 mrg static rtx
3282 1.1 mrg nvptx_get_unisimt_predicate ()
3283 1.1 mrg {
3284 1.1 mrg rtx &pred = cfun->machine->unisimt_predicate;
3285 1.1 mrg return pred ? pred : pred = gen_reg_rtx (BImode);
3286 1.1 mrg }
3287 1.1 mrg
3288 1.1 mrg static rtx
3289 1.1 mrg nvptx_get_unisimt_outside_simt_predicate ()
3290 1.1 mrg {
3291 1.1 mrg rtx &pred = cfun->machine->unisimt_outside_simt_predicate;
3292 1.1 mrg return pred ? pred : pred = gen_reg_rtx (BImode);
3293 1.1 mrg }
3294 1.1 mrg
/* Return true if given call insn references one of the functions provided by
   the CUDA runtime: malloc, free, vprintf.  */

static bool
nvptx_call_insn_is_syscall_p (rtx_insn *insn)
{
  /* Dig the CALL rtx out of the (possibly SET-wrapped) PARALLEL.  */
  rtx pat = PATTERN (insn);
  gcc_checking_assert (GET_CODE (pat) == PARALLEL);
  pat = XVECEXP (pat, 0, 0);
  if (GET_CODE (pat) == SET)
    pat = SET_SRC (pat);
  gcc_checking_assert (GET_CODE (pat) == CALL
		       && GET_CODE (XEXP (pat, 0)) == MEM);
  rtx addr = XEXP (XEXP (pat, 0), 0);
  /* Only direct (symbolic) calls can be syscalls.  */
  if (GET_CODE (addr) != SYMBOL_REF)
    return false;
  const char *name = XSTR (addr, 0);
  /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
     references with forced assembler name (leading '*') refer to PTX
     syscalls.  For vprintf, accept both normal and forced-assembler-name
     references.  */
  return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
	  || !strcmp (name, "*malloc")
	  || !strcmp (name, "*free"));
}
3319 1.1 mrg
3320 1.1 mrg /* If SET subexpression of INSN sets a register, emit a shuffle instruction to
3321 1.1 mrg propagate its value from lane MASTER to current lane. */
3322 1.1 mrg
3323 1.1 mrg static bool
3324 1.1 mrg nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
3325 1.1 mrg {
3326 1.1 mrg rtx reg;
3327 1.1 mrg if (GET_CODE (set) == SET
3328 1.1 mrg && REG_P (reg = SET_DEST (set))
3329 1.1 mrg && find_reg_note (insn, REG_UNUSED, reg) == NULL_RTX)
3330 1.1 mrg {
3331 1.1 mrg emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX),
3332 1.1 mrg insn);
3333 1.1 mrg return true;
3334 1.1 mrg }
3335 1.1 mrg
3336 1.1 mrg return false;
3337 1.1 mrg }
3338 1.1 mrg
/* Wrap the pattern of INSN in a COND_EXEC so that it executes only
   when PRED is nonzero.  Asserts that the change is accepted.  */

static void
predicate_insn (rtx_insn *insn, rtx pred)
{
  rtx pat = PATTERN (insn);
  pred = gen_rtx_NE (BImode, pred, const0_rtx);
  pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
  bool changed_p = validate_change (insn, &PATTERN (insn), pat, false);
  gcc_assert (changed_p);
}
3348 1.1 mrg
/* Adjust code for uniform-simt code generation variant by making atomics and
   "syscalls" conditionally executed, and inserting shuffle-based propagation
   for registers being set.  */

static void
nvptx_reorg_uniform_simt ()
{
  rtx_insn *insn, *next;

  for (insn = get_insns (); insn; insn = next)
    {
      /* Remember the successor up front: predication below emits new
	 insns directly after INSN, and those must be visited too.  */
      next = NEXT_INSN (insn);

      /* Skip NOTE, USE, etc.  */
      if (!INSN_P (insn) || recog_memoized (insn) == -1)
	continue;

      if (CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
	{
	  /* Handle syscall.  */
	}
      else if (get_attr_atomic (insn))
	{
	  /* Handle atomic insn.  */
	}
      else
	continue;

      /* Emit a shuffle after INSN for every live register it sets, so
	 that all lanes agree on the values computed by the master.  */
      rtx pat = PATTERN (insn);
      rtx master = nvptx_get_unisimt_master ();
      bool shuffle_p = false;
      switch (GET_CODE (pat))
	{
	case PARALLEL:
	  for (int i = 0; i < XVECLEN (pat, 0); i++)
	    shuffle_p
	      |= nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
	  break;
	case SET:
	  shuffle_p |= nvptx_unisimt_handle_set (pat, insn, master);
	  break;
	default:
	  gcc_unreachable ();
	}

      if (shuffle_p && TARGET_PTX_6_0)
	{
	  /* The shuffle is a sync, so uniformity is guaranteed.  */
	}
      else
	{
	  if (TARGET_PTX_6_0)
	    {
	      gcc_assert (!shuffle_p);
	      /* Emit after the insn, to guarantee uniformity.  */
	      emit_insn_after (gen_nvptx_warpsync (), insn);
	    }
	  else
	    {
	      /* Emit after the insn (and before the shuffle, if there are any)
		 to check uniformity.  */
	      emit_insn_after (gen_nvptx_uniform_warp_check (), insn);
	    }
	}

      /* Restrict INSN itself to the master lane.  */
      rtx pred = nvptx_get_unisimt_predicate ();
      predicate_insn (insn, pred);

      /* Predicate the insns emitted above (shuffles, warp sync/check)
	 with the outside-simt predicate, fetched lazily on first use.  */
      pred = NULL_RTX;
      for (rtx_insn *post = NEXT_INSN (insn); post != next;
	   post = NEXT_INSN (post))
	{
	  if (pred == NULL_RTX)
	    pred = nvptx_get_unisimt_outside_simt_predicate ();
	  predicate_insn (post, pred);
	}
    }
}
3427 1.1 mrg
/* Offloading function attributes.  */

struct offload_attrs
{
  /* Mask of partitioned GOMP_DIM axes.  */
  unsigned mask;
  /* Launch dimensions.  num_workers == 0 means "not specified"; see
     init_axis_dim, which then derives a maximum from PTX_CTA_SIZE.  */
  int num_gangs;
  int num_workers;
  int vector_length;
};
3437 1.1 mrg
3438 1.1 mrg /* Define entries for cfun->machine->axis_dim. */
3439 1.1 mrg
3440 1.1 mrg #define MACH_VECTOR_LENGTH 0
3441 1.1 mrg #define MACH_MAX_WORKERS 1
3442 1.1 mrg
3443 1.1 mrg static void populate_offload_attrs (offload_attrs *oa);
3444 1.1 mrg
3445 1.1 mrg static void
3446 1.1 mrg init_axis_dim (void)
3447 1.1 mrg {
3448 1.1 mrg offload_attrs oa;
3449 1.1 mrg int max_workers;
3450 1.1 mrg
3451 1.1 mrg populate_offload_attrs (&oa);
3452 1.1 mrg
3453 1.1 mrg if (oa.num_workers == 0)
3454 1.1 mrg max_workers = PTX_CTA_SIZE / oa.vector_length;
3455 1.1 mrg else
3456 1.1 mrg max_workers = oa.num_workers;
3457 1.1 mrg
3458 1.1 mrg cfun->machine->axis_dim[MACH_VECTOR_LENGTH] = oa.vector_length;
3459 1.1 mrg cfun->machine->axis_dim[MACH_MAX_WORKERS] = max_workers;
3460 1.1 mrg cfun->machine->axis_dim_init_p = true;
3461 1.1 mrg }
3462 1.1 mrg
/* Return the maximum number of workers for the current function,
   computing and caching the axis dimensions on first use.  */

static int ATTRIBUTE_UNUSED
nvptx_mach_max_workers ()
{
  if (!cfun->machine->axis_dim_init_p)
    init_axis_dim ();
  return cfun->machine->axis_dim[MACH_MAX_WORKERS];
}
3470 1.1 mrg
/* Return the vector length for the current function, computing and
   caching the axis dimensions on first use.  */

static int ATTRIBUTE_UNUSED
nvptx_mach_vector_length ()
{
  if (!cfun->machine->axis_dim_init_p)
    init_axis_dim ();
  return cfun->machine->axis_dim[MACH_VECTOR_LENGTH];
}
3478 1.1 mrg
/* Loop structure of the function.  The entire function is described as
   a NULL loop.  */
/* See also 'gcc/omp-oacc-neuter-broadcast.cc:struct parallel_g'.  */

struct parallel
{
  /* Parent parallel.  */
  parallel *parent;

  /* Next sibling parallel.  */
  parallel *next;

  /* First child parallel.  */
  parallel *inner;

  /* Partitioning mask of the parallel.  */
  unsigned mask;

  /* Partitioning used within inner parallels.  */
  unsigned inner_mask;

  /* Location of parallel forked and join.  The forked is the first
     block in the parallel and the join is the first block after the
     partition.  */
  basic_block forked_block;
  basic_block join_block;

  /* The forked/join instructions themselves, and the corresponding
     fork/joining instructions in the predecessor blocks -- the latter
     are only set when shared-memory broadcast is needed (see
     nvptx_find_par).  */
  rtx_insn *forked_insn;
  rtx_insn *join_insn;

  rtx_insn *fork_insn;
  rtx_insn *joining_insn;

  /* Basic blocks in this parallel, but not in child parallels.  The
     FORKED and JOINING blocks are in the partition.  The FORK and JOIN
     blocks are not.  */
  auto_vec<basic_block> blocks;

public:
  parallel (parallel *parent, unsigned mode);
  ~parallel ();
};
3521 1.1 mrg
3522 1.1 mrg /* Constructor links the new parallel into it's parent's chain of
3523 1.1 mrg children. */
3524 1.1 mrg
3525 1.1 mrg parallel::parallel (parallel *parent_, unsigned mask_)
3526 1.1 mrg :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
3527 1.1 mrg {
3528 1.1 mrg forked_block = join_block = 0;
3529 1.1 mrg forked_insn = join_insn = 0;
3530 1.1 mrg fork_insn = joining_insn = 0;
3531 1.1 mrg
3532 1.1 mrg if (parent)
3533 1.1 mrg {
3534 1.1 mrg next = parent->inner;
3535 1.1 mrg parent->inner = this;
3536 1.1 mrg }
3537 1.1 mrg }
3538 1.1 mrg
/* Destructor recursively frees the entire child chain and the rest of
   the sibling chain.  */

parallel::~parallel ()
{
  delete inner;
  delete next;
}
3544 1.1 mrg
3545 1.1 mrg /* Map of basic blocks to insns */
3546 1.1 mrg typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
3547 1.1 mrg
3548 1.1 mrg /* A tuple of an insn of interest and the BB in which it resides. */
3549 1.1 mrg typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
3550 1.1 mrg typedef auto_vec<insn_bb_t> insn_bb_vec_t;
3551 1.1 mrg
/* Split basic blocks such that each forked and join unspecs are at
   the start of their basic blocks.  Thus afterwards each block will
   have a single partitioning mode.  We also do the same for return
   insns, as they are executed by every thread.  Return the
   partitioning mode of the function as a whole.  Populate MAP with
   head and tail blocks.  We also clear the BB visited flag, which is
   used when finding partitions.  */
/* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_split_blocks'.  */

static void
nvptx_split_blocks (bb_insn_map_t *map)
{
  insn_bb_vec_t worklist;
  basic_block block;
  rtx_insn *insn;

  /* Locate all the reorg instructions of interest.  */
  FOR_ALL_BB_FN (block, cfun)
    {
      bool seen_insn = false;

      /* Clear visited flag, for use by parallel locator.  */
      block->flags &= ~BB_VISITED;

      FOR_BB_INSNS (block, insn)
	{
	  if (!INSN_P (insn))
	    continue;
	  switch (recog_memoized (insn))
	    {
	    default:
	      /* Not a marker insn; it merely means any later marker in
		 this block is not at the block's start.  */
	      seen_insn = true;
	      continue;
	    case CODE_FOR_nvptx_forked:
	    case CODE_FOR_nvptx_join:
	      break;

	    case CODE_FOR_return:
	      /* We also need to split just before return insns, as
		 that insn needs executing by all threads, but the
		 block it is in probably does not.  */
	      break;
	    }

	  if (seen_insn)
	    /* We've found an instruction that must be at the start of
	       a block, but isn't.  Add it to the worklist.  */
	    worklist.safe_push (insn_bb_t (insn, block));
	  else
	    /* It was already the first instruction.  Just add it to
	       the map.  */
	    map->get_or_insert (block) = insn;
	  seen_insn = true;
	}
    }

  /* Split blocks on the worklist.  */
  unsigned ix;
  insn_bb_t *elt;
  basic_block remap = 0;
  for (ix = 0; worklist.iterate (ix, &elt); ix++)
    {
      /* Consecutive worklist entries from the same original block must
	 keep splitting the current tail block, not the original.  */
      if (remap != elt->second)
	{
	  block = elt->second;
	  remap = block;
	}

      /* Split block before insn.  The insn is in the new block.  */
      edge e = split_block (block, PREV_INSN (elt->first));

      block = e->dest;
      map->get_or_insert (block) = elt->first;
    }
}
3627 1.1 mrg
3628 1.1 mrg /* Return true if MASK contains parallelism that requires shared
3629 1.1 mrg memory to broadcast. */
3630 1.1 mrg
3631 1.1 mrg static bool
3632 1.1 mrg nvptx_needs_shared_bcast (unsigned mask)
3633 1.1 mrg {
3634 1.1 mrg bool worker = mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
3635 1.1 mrg bool large_vector = (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3636 1.1 mrg && nvptx_mach_vector_length () != PTX_WARP_SIZE;
3637 1.1 mrg
3638 1.1 mrg return worker || large_vector;
3639 1.1 mrg }
3640 1.1 mrg
3641 1.1 mrg /* BLOCK is a basic block containing a head or tail instruction.
3642 1.1 mrg Locate the associated prehead or pretail instruction, which must be
3643 1.1 mrg in the single predecessor block. */
3644 1.1 mrg
3645 1.1 mrg static rtx_insn *
3646 1.1 mrg nvptx_discover_pre (basic_block block, int expected)
3647 1.1 mrg {
3648 1.1 mrg gcc_assert (block->preds->length () == 1);
3649 1.1 mrg basic_block pre_block = (*block->preds)[0]->src;
3650 1.1 mrg rtx_insn *pre_insn;
3651 1.1 mrg
3652 1.1 mrg for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
3653 1.1 mrg pre_insn = PREV_INSN (pre_insn))
3654 1.1 mrg gcc_assert (pre_insn != BB_HEAD (pre_block));
3655 1.1 mrg
3656 1.1 mrg gcc_assert (recog_memoized (pre_insn) == expected);
3657 1.1 mrg return pre_insn;
3658 1.1 mrg }
3659 1.1 mrg
3660 1.1 mrg /* Dump this parallel and all its inner parallels. */
3661 1.1 mrg /* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_dump_pars'. */
3662 1.1 mrg
3663 1.1 mrg static void
3664 1.1 mrg nvptx_dump_pars (parallel *par, unsigned depth)
3665 1.1 mrg {
3666 1.1 mrg fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
3667 1.1 mrg depth, par->mask,
3668 1.1 mrg par->forked_block ? par->forked_block->index : -1,
3669 1.1 mrg par->join_block ? par->join_block->index : -1);
3670 1.1 mrg
3671 1.1 mrg fprintf (dump_file, " blocks:");
3672 1.1 mrg
3673 1.1 mrg basic_block block;
3674 1.1 mrg for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
3675 1.1 mrg fprintf (dump_file, " %d", block->index);
3676 1.1 mrg fprintf (dump_file, "\n");
3677 1.1 mrg if (par->inner)
3678 1.1 mrg nvptx_dump_pars (par->inner, depth + 1);
3679 1.1 mrg
3680 1.1 mrg if (par->next)
3681 1.1 mrg nvptx_dump_pars (par->next, depth);
3682 1.1 mrg }
3683 1.1 mrg
/* If BLOCK contains a fork/join marker, process it to create or
   terminate a loop structure.  Add this block to the current loop,
   and then walk successor blocks.  */
/* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_find_par'.  */

static parallel *
nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
{
  if (block->flags & BB_VISITED)
    return par;
  block->flags |= BB_VISITED;

  if (rtx_insn **endp = map->get (block))
    {
      rtx_insn *end = *endp;

      /* This is a block head or tail, or return instruction.  */
      switch (recog_memoized (end))
	{
	case CODE_FOR_return:
	  /* Return instructions are in their own block, and we
	     don't need to do anything more.  */
	  return par;

	case CODE_FOR_nvptx_forked:
	  /* Loop head, create a new inner loop and add it into
	     our parent's child list.  */
	  {
	    /* The partitioning mask is the first element of the
	       marker's unspec vector.  */
	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));

	    gcc_assert (mask);
	    par = new parallel (par, mask);
	    par->forked_block = block;
	    par->forked_insn = end;
	    if (nvptx_needs_shared_bcast (mask))
	      par->fork_insn
		= nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
	  }
	  break;

	case CODE_FOR_nvptx_join:
	  /* A loop tail.  Finish the current loop and return to
	     parent.  */
	  {
	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));

	    /* The join must match the parallel we are closing.  */
	    gcc_assert (par->mask == mask);
	    gcc_assert (par->join_block == NULL);
	    par->join_block = block;
	    par->join_insn = end;
	    if (nvptx_needs_shared_bcast (mask))
	      par->joining_insn
		= nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
	    par = par->parent;
	  }
	  break;

	default:
	  gcc_unreachable ();
	}
    }

  if (par)
    /* Add this block onto the current loop's list of blocks.  */
    par->blocks.safe_push (block);
  else
    /* This must be the entry block.  Create a NULL parallel.  */
    par = new parallel (0, 0);

  /* Walk successor blocks.  */
  edge e;
  edge_iterator ei;

  FOR_EACH_EDGE (e, ei, block->succs)
    nvptx_find_par (map, par, e->dest);

  return par;
}
3762 1.1 mrg
3763 1.1 mrg /* DFS walk the CFG looking for fork & join markers. Construct
3764 1.1 mrg loop structures as we go. MAP is a mapping of basic blocks
3765 1.1 mrg to head & tail markers, discovered when splitting blocks. This
3766 1.1 mrg speeds up the discovery. We rely on the BB visited flag having
3767 1.1 mrg been cleared when splitting blocks. */
3768 1.1 mrg /* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_discover_pars'. */
3769 1.1 mrg
3770 1.1 mrg static parallel *
3771 1.1 mrg nvptx_discover_pars (bb_insn_map_t *map)
3772 1.1 mrg {
3773 1.1 mrg basic_block block;
3774 1.1 mrg
3775 1.1 mrg /* Mark exit blocks as visited. */
3776 1.1 mrg block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3777 1.1 mrg block->flags |= BB_VISITED;
3778 1.1 mrg
3779 1.1 mrg /* And entry block as not. */
3780 1.1 mrg block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3781 1.1 mrg block->flags &= ~BB_VISITED;
3782 1.1 mrg
3783 1.1 mrg parallel *par = nvptx_find_par (map, 0, block);
3784 1.1 mrg
3785 1.1 mrg if (dump_file)
3786 1.1 mrg {
3787 1.1 mrg fprintf (dump_file, "\nLoops\n");
3788 1.1 mrg nvptx_dump_pars (par, 0);
3789 1.1 mrg fprintf (dump_file, "\n");
3790 1.1 mrg }
3791 1.1 mrg
3792 1.1 mrg return par;
3793 1.1 mrg }
3794 1.1 mrg
3795 1.1 mrg /* Analyse a group of BBs within a partitioned region and create N
3796 1.1 mrg Single-Entry-Single-Exit regions. Some of those regions will be
3797 1.1 mrg trivial ones consisting of a single BB. The blocks of a
3798 1.1 mrg partitioned region might form a set of disjoint graphs -- because
   the region encloses a differently partitioned sub region.
3800 1.1 mrg
3801 1.1 mrg We use the linear time algorithm described in 'Finding Regions Fast:
3802 1.1 mrg Single Entry Single Exit and control Regions in Linear Time'
3803 1.1 mrg Johnson, Pearson & Pingali. That algorithm deals with complete
3804 1.1 mrg CFGs, where a back edge is inserted from END to START, and thus the
3805 1.1 mrg problem becomes one of finding equivalent loops.
3806 1.1 mrg
3807 1.1 mrg In this case we have a partial CFG. We complete it by redirecting
3808 1.1 mrg any incoming edge to the graph to be from an arbitrary external BB,
3809 1.1 mrg and similarly redirecting any outgoing edge to be to that BB.
3810 1.1 mrg Thus we end up with a closed graph.
3811 1.1 mrg
3812 1.1 mrg The algorithm works by building a spanning tree of an undirected
3813 1.1 mrg graph and keeping track of back edges from nodes further from the
3814 1.1 mrg root in the tree to nodes nearer to the root in the tree. In the
3815 1.1 mrg description below, the root is up and the tree grows downwards.
3816 1.1 mrg
3817 1.1 mrg We avoid having to deal with degenerate back-edges to the same
3818 1.1 mrg block, by splitting each BB into 3 -- one for input edges, one for
3819 1.1 mrg the node itself and one for the output edges. Such back edges are
3820 1.1 mrg referred to as 'Brackets'. Cycle equivalent nodes will have the
3821 1.1 mrg same set of brackets.
3822 1.1 mrg
3823 1.1 mrg Determining bracket equivalency is done by maintaining a list of
3824 1.1 mrg brackets in such a manner that the list length and final bracket
3825 1.1 mrg uniquely identify the set.
3826 1.1 mrg
3827 1.1 mrg We use coloring to mark all BBs with cycle equivalency with the
3828 1.1 mrg same color. This is the output of the 'Finding Regions Fast'
3829 1.1 mrg algorithm. Notice it doesn't actually find the set of nodes within
   a particular region, just unordered sets of nodes that are the
3831 1.1 mrg entries and exits of SESE regions.
3832 1.1 mrg
3833 1.1 mrg After determining cycle equivalency, we need to find the minimal
3834 1.1 mrg set of SESE regions. Do this with a DFS coloring walk of the
3835 1.1 mrg complete graph. We're either 'looking' or 'coloring'. When
3836 1.1 mrg looking, and we're in the subgraph, we start coloring the color of
3837 1.1 mrg the current node, and remember that node as the start of the
3838 1.1 mrg current color's SESE region. Every time we go to a new node, we
   decrement the count of nodes with that color.  If it reaches zero,
3840 1.1 mrg we remember that node as the end of the current color's SESE region
3841 1.1 mrg and return to 'looking'. Otherwise we color the node the current
3842 1.1 mrg color.
3843 1.1 mrg
3844 1.1 mrg This way we end up with coloring the inside of non-trivial SESE
3845 1.1 mrg regions with the color of that region. */
3846 1.1 mrg
3847 1.1 mrg /* A pair of BBs. We use this to represent SESE regions. */
3848 1.1 mrg typedef std::pair<basic_block, basic_block> bb_pair_t;
3849 1.1 mrg typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3850 1.1 mrg
3851 1.1 mrg /* A node in the undirected CFG. The discriminator SECOND indicates just
   above or just below the BB indicated by FIRST.  */
3853 1.1 mrg typedef std::pair<basic_block, int> pseudo_node_t;
3854 1.1 mrg
/* A bracket indicates an edge towards the root of the spanning tree of the
   undirected graph.  Each bracket has a color, determined
   from the current set of brackets.  */
struct bracket
{
  pseudo_node_t back;  /* Back target.  */

  /* Current color and size of set.  */
  unsigned color;
  unsigned size;

  bracket (pseudo_node_t back_)
  : back (back_), color (~0u), size (~0u)
  {
  }

  /* Return this bracket's color for a bracket set of LENGTH elements.
     A set is identified by its length together with its final bracket;
     when queried with a new LENGTH, allocate a fresh color and a new
     counter slot in COLOR_COUNTS.  Each call bumps the count of nodes
     carrying the returned color.  */
  unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
  {
    if (length != size)
      {
	size = length;
	color = color_counts.length ();
	color_counts.quick_push (0);
      }
    color_counts[color]++;
    return color;
  }
};
3883 1.1 mrg
3884 1.1 mrg typedef auto_vec<bracket> bracket_vec_t;
3885 1.1 mrg
/* Basic block info for finding SESE regions.  */

struct bb_sese
{
  int node;  /* Node number in spanning tree.  */
  int parent;  /* Parent node number.  */

  /* The algorithm splits each node A into Ai, A', Ao.  The incoming
     edges arrive at pseudo-node Ai and the outgoing edges leave at
     pseudo-node Ao.  We have to remember which way we arrived at a
     particular node when generating the spanning tree.  dir > 0 means
     we arrived at Ai, dir < 0 means we arrived at Ao.  */
  int dir;

  /* Lowest numbered pseudo-node reached via a backedge from this
     node, or any descendant.  */
  pseudo_node_t high;

  int color;  /* Cycle-equivalence color.  */

  /* Stack of brackets for this node.  */
  bracket_vec_t brackets;

  bb_sese (unsigned node_, unsigned p, int dir_)
  :node (node_), parent (p), dir (dir_)
  {
  }
  ~bb_sese ();

  /* Push a bracket ending at BACK.  */
  void push (const pseudo_node_t &back)
  {
    if (dump_file)
      fprintf (dump_file, "Pushing backedge %d:%+d\n",
	       back.first ? back.first->index : 0, back.second);
    brackets.safe_push (bracket (back));
  }

  void append (bb_sese *child);
  void remove (const pseudo_node_t &);

  /* Set node's color from the topmost bracket and the current bracket
     stack depth.  */
  void set_color (auto_vec<unsigned> &color_counts)
  {
    color = brackets.last ().get_color (color_counts, brackets.length ());
  }
};
3933 1.1 mrg
/* Out-of-line destructor; the bracket stack is released by the
   auto_vec member itself.  */

bb_sese::~bb_sese ()
{
}
3937 1.1 mrg
3938 1.1 mrg /* Destructively append CHILD's brackets. */
3939 1.1 mrg
3940 1.1 mrg void
3941 1.1 mrg bb_sese::append (bb_sese *child)
3942 1.1 mrg {
3943 1.1 mrg if (int len = child->brackets.length ())
3944 1.1 mrg {
3945 1.1 mrg int ix;
3946 1.1 mrg
3947 1.1 mrg if (dump_file)
3948 1.1 mrg {
3949 1.1 mrg for (ix = 0; ix < len; ix++)
3950 1.1 mrg {
3951 1.1 mrg const pseudo_node_t &pseudo = child->brackets[ix].back;
3952 1.1 mrg fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3953 1.1 mrg child->node, pseudo.first ? pseudo.first->index : 0,
3954 1.1 mrg pseudo.second);
3955 1.1 mrg }
3956 1.1 mrg }
3957 1.1 mrg if (!brackets.length ())
3958 1.1 mrg std::swap (brackets, child->brackets);
3959 1.1 mrg else
3960 1.1 mrg {
3961 1.1 mrg brackets.reserve (len);
3962 1.1 mrg for (ix = 0; ix < len; ix++)
3963 1.1 mrg brackets.quick_push (child->brackets[ix]);
3964 1.1 mrg }
3965 1.1 mrg }
3966 1.1 mrg }
3967 1.1 mrg
3968 1.1 mrg /* Remove brackets that terminate at PSEUDO. */
3969 1.1 mrg
3970 1.1 mrg void
3971 1.1 mrg bb_sese::remove (const pseudo_node_t &pseudo)
3972 1.1 mrg {
3973 1.1 mrg unsigned removed = 0;
3974 1.1 mrg int len = brackets.length ();
3975 1.1 mrg
3976 1.1 mrg for (int ix = 0; ix < len; ix++)
3977 1.1 mrg {
3978 1.1 mrg if (brackets[ix].back == pseudo)
3979 1.1 mrg {
3980 1.1 mrg if (dump_file)
3981 1.1 mrg fprintf (dump_file, "Removing backedge %d:%+d\n",
3982 1.1 mrg pseudo.first ? pseudo.first->index : 0, pseudo.second);
3983 1.1 mrg removed++;
3984 1.1 mrg }
3985 1.1 mrg else if (removed)
3986 1.1 mrg brackets[ix-removed] = brackets[ix];
3987 1.1 mrg }
3988 1.1 mrg while (removed--)
3989 1.1 mrg brackets.pop ();
3990 1.1 mrg }
3991 1.1 mrg
3992 1.1 mrg /* Accessors for BB's aux pointer. */
3993 1.1 mrg #define BB_SET_SESE(B, S) ((B)->aux = (S))
3994 1.1 mrg #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3995 1.1 mrg
/* DFS walk creating SESE data structures.  Only cover nodes with
   BB_VISITED set.  Append discovered blocks to LIST.  We number in
   increments of 3 so that the above and below pseudo nodes can be
   implicitly numbered too.  */

static int
nvptx_sese_number (int n, int p, int dir, basic_block b,
		   auto_vec<basic_block> *list)
{
  if (BB_GET_SESE (b))
    return n;

  if (dump_file)
    fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
	     b->index, n, p, dir);

  BB_SET_SESE (b, new bb_sese (n, p, dir));
  p = n;

  n += 3;
  list->quick_push (b);

  /* First walk the nodes on the 'other side' of this node, then walk
     the nodes on the same side.  */
  for (unsigned ix = 2; ix; ix--)
    {
      /* DIR > 0 means we arrived via the incoming pseudo-node, so we
	 walk successors; DIR < 0 walks predecessors.  OFFSET selects
	 the edge_def field (dest or src) holding the far-end block.  */
      vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
      size_t offset = (dir > 0 ? offsetof (edge_def, dest)
		       : offsetof (edge_def, src));
      edge e;
      edge_iterator ei;

      FOR_EACH_EDGE (e, ei, edges)
	{
	  basic_block target = *(basic_block *)((char *)e + offset);

	  if (target->flags & BB_VISITED)
	    n = nvptx_sese_number (n, p, dir, target, list);
	}
      /* Flip orientation for the second pass.  */
      dir = -dir;
    }
  return n;
}
4039 1.1 mrg
/* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
   EDGES are the outgoing edges and OFFSET is the offset to the src
   or dst block on the edges.  */

static void
nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
		   vec<edge, va_gc> *edges, size_t offset)
{
  edge e;
  edge_iterator ei;
  /* Highest (lowest-numbered) ancestor pseudo-node reached by a
     backlink from this pseudo node, and that node itself.  */
  int hi_back = depth;
  pseudo_node_t node_back (nullptr, depth);
  /* Likewise for the highest pseudo-node reached through a child.  */
  int hi_child = depth;
  pseudo_node_t node_child (nullptr, depth);
  basic_block child = NULL;
  unsigned num_children = 0;
  /* Expected orientation of child nodes relative to this pseudo node.  */
  int usd = -dir * sese->dir;

  if (dump_file)
    fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
	     me->index, sese->node, dir);

  if (dir < 0)
    {
      /* This is the above pseudo-child.  It has the BB itself as an
	 additional child node.  */
      node_child = sese->high;
      hi_child = node_child.second;
      if (node_child.first)
	hi_child += BB_GET_SESE (node_child.first)->node;
      num_children++;
    }

  /* Examine each edge.
     - if it is a child (a) append its bracket list and (b) record
       whether it is the child with the highest reaching bracket.
     - if it is an edge to ancestor, record whether it's the highest
       reaching backlink.  */
  FOR_EACH_EDGE (e, ei, edges)
    {
      basic_block target = *(basic_block *)((char *)e + offset);

      if (bb_sese *t_sese = BB_GET_SESE (target))
	{
	  if (t_sese->parent == sese->node && !(t_sese->dir + usd))
	    {
	      /* Child node.  Append its bracket list.  */
	      num_children++;
	      sese->append (t_sese);

	      /* Compare its hi value.  */
	      int t_hi = t_sese->high.second;

	      if (basic_block child_hi_block = t_sese->high.first)
		t_hi += BB_GET_SESE (child_hi_block)->node;

	      if (hi_child > t_hi)
		{
		  hi_child = t_hi;
		  node_child = t_sese->high;
		  child = target;
		}
	    }
	  else if (t_sese->node < sese->node + dir
		   && !(dir < 0 && sese->parent == t_sese->node))
	    {
	      /* Non-parental ancestor node -- a backlink.  */
	      int d = usd * t_sese->dir;
	      int back = t_sese->node + d;

	      if (hi_back > back)
		{
		  hi_back = back;
		  node_back = pseudo_node_t (target, d);
		}
	    }
	}
      else
	{ /* Fallen off graph, backlink to entry node.  */
	  hi_back = 0;
	  node_back = pseudo_node_t (nullptr, 0);
	}
    }

  /* Remove any brackets that terminate at this pseudo node.  */
  sese->remove (pseudo_node_t (me, dir));

  /* Now push any backlinks from this pseudo node.  */
  FOR_EACH_EDGE (e, ei, edges)
    {
      basic_block target = *(basic_block *)((char *)e + offset);
      if (bb_sese *t_sese = BB_GET_SESE (target))
	{
	  if (t_sese->node < sese->node + dir
	      && !(dir < 0 && sese->parent == t_sese->node))
	    /* Non-parental ancestor node - backedge from me.  */
	    sese->push (pseudo_node_t (target, usd * t_sese->dir));
	}
      else
	{
	  /* Back edge to entry node.  */
	  sese->push (pseudo_node_t (nullptr, 0));
	}
    }

  /* If this node leads directly or indirectly to a no-return region of
     the graph, then fake a backedge to entry node.  */
  if (!sese->brackets.length () || !edges || !edges->length ())
    {
      hi_back = 0;
      node_back = pseudo_node_t (nullptr, 0);
      sese->push (node_back);
    }

  /* Record the highest reaching backedge from us or a descendant.  */
  sese->high = hi_back < hi_child ? node_back : node_child;

  if (num_children > 1)
    {
      /* There is more than one child -- this is a Y shaped piece of
	 spanning tree.  We have to insert a fake backedge from this
	 node to the highest ancestor reached by not-the-highest
	 reaching child.  Note that there may be multiple children
	 with backedges to the same highest node.  That's ok and we
	 insert the edge to that highest node.  */
      hi_child = depth;
      if (dir < 0 && child)
	{
	  node_child = sese->high;
	  hi_child = node_child.second;
	  if (node_child.first)
	    hi_child += BB_GET_SESE (node_child.first)->node;
	}

      FOR_EACH_EDGE (e, ei, edges)
	{
	  basic_block target = *(basic_block *)((char *)e + offset);

	  if (target == child)
	    /* Ignore the highest child.  */
	    continue;

	  bb_sese *t_sese = BB_GET_SESE (target);
	  if (!t_sese)
	    continue;
	  if (t_sese->parent != sese->node)
	    /* Not a child.  */
	    continue;

	  /* Compare its hi value.  */
	  int t_hi = t_sese->high.second;

	  if (basic_block child_hi_block = t_sese->high.first)
	    t_hi += BB_GET_SESE (child_hi_block)->node;

	  if (hi_child > t_hi)
	    {
	      hi_child = t_hi;
	      node_child = t_sese->high;
	    }
	}

      sese->push (node_child);
    }
}
4205 1.1 mrg
4206 1.1 mrg
/* DFS walk of BB graph.  Color node BLOCK according to COLORING then
   proceed to successors.  Set SESE entry and exit nodes of
   REGIONS.  COLOR_COUNTS holds the number of not-yet-visited blocks
   of each color; COLORING is the color currently being painted, or
   negative when we are merely looking for the next region entry.  */

static void
nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
		  basic_block block, int coloring)
{
  bb_sese *sese = BB_GET_SESE (block);

  if (block->flags & BB_VISITED)
    {
      /* If we've already encountered this block, either we must not
	 be coloring, or it must have been colored the current color.  */
      gcc_assert (coloring < 0 || (sese && coloring == sese->color));
      return;
    }

  block->flags |= BB_VISITED;

  if (sese)
    {
      if (coloring < 0)
	{
	  /* Start coloring a region.  This block is the region's entry.  */
	  regions[sese->color].first = block;
	  coloring = sese->color;
	}

      /* Decrement the outstanding count of this block's color; if this
	 was the last block of the color we are painting, it closes the
	 region and we revert to 'looking' mode.  */
      if (!--color_counts[sese->color] && sese->color == coloring)
	{
	  /* Found final block of SESE region.  */
	  regions[sese->color].second = block;
	  coloring = -1;
	}
      else
	/* Color the node, so we can assert on revisiting the node
	   that the graph is indeed SESE.  */
	sese->color = coloring;
    }
  else
    /* Fallen off the subgraph, we cannot be coloring.  */
    gcc_assert (coloring < 0);

  /* Walk each successor block.  */
  if (block->succs && block->succs->length ())
    {
      edge e;
      edge_iterator ei;

      FOR_EACH_EDGE (e, ei, block->succs)
	nvptx_sese_color (color_counts, regions, e->dest, coloring);
    }
  else
    /* A dead-end block cannot be in the middle of a region.  */
    gcc_assert (coloring < 0);
}
4263 1.1 mrg
/* Find minimal set of SESE regions covering BLOCKS.  REGIONS might
   end up with NULL entries in it.  Uses the cycle-equivalence
   machinery (nvptx_sese_number / nvptx_sese_pseudo) to assign each
   block a color, then a whole-function DFS (nvptx_sese_color) to turn
   colors into entry/exit block pairs.  */

static void
nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
{
  basic_block block;
  int ix;

  /* First clear each BB of the whole function.  */
  FOR_ALL_BB_FN (block, cfun)
    {
      block->flags &= ~BB_VISITED;
      BB_SET_SESE (block, 0);
    }

  /* Mark blocks in the function that are in this graph.  */
  for (ix = 0; blocks.iterate (ix, &block); ix++)
    block->flags |= BB_VISITED;

  /* Counts of nodes assigned to each color.  There cannot be more
     colors than blocks (and hopefully there will be fewer).  */
  auto_vec<unsigned> color_counts;
  color_counts.reserve (blocks.length ());

  /* Worklist of nodes in the spanning tree.  Again, there cannot be
     more nodes in the tree than blocks (there will be fewer if the
     CFG of blocks is disjoint).  */
  auto_vec<basic_block> spanlist;
  spanlist.reserve (blocks.length ());

  /* Make sure every block has its cycle class determined.  */
  for (ix = 0; blocks.iterate (ix, &block); ix++)
    {
      if (BB_GET_SESE (block))
	/* We already met this block in an earlier graph solve.  */
	continue;

      if (dump_file)
	fprintf (dump_file, "Searching graph starting at %d\n", block->index);

      /* Number the nodes reachable from BLOCK in initial DFS order.  */
      int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);

      /* Now walk in reverse DFS order to find cycle equivalents.  */
      while (spanlist.length ())
	{
	  block = spanlist.pop ();
	  bb_sese *sese = BB_GET_SESE (block);

	  /* Do the pseudo node below.  */
	  nvptx_sese_pseudo (block, sese, depth, +1,
			     sese->dir > 0 ? block->succs : block->preds,
			     (sese->dir > 0 ? offsetof (edge_def, dest)
			      : offsetof (edge_def, src)));
	  sese->set_color (color_counts);
	  /* Do the pseudo node above.  */
	  nvptx_sese_pseudo (block, sese, depth, -1,
			     sese->dir < 0 ? block->succs : block->preds,
			     (sese->dir < 0 ? offsetof (edge_def, dest)
			      : offsetof (edge_def, src)));
	}
      if (dump_file)
	fprintf (dump_file, "\n");
    }

  if (dump_file)
    {
      unsigned count;
      const char *comma = "";

      fprintf (dump_file, "Found %d cycle equivalents\n",
	       color_counts.length ());
      for (ix = 0; color_counts.iterate (ix, &count); ix++)
	{
	  fprintf (dump_file, "%s%d[%d]={", comma, ix, count);

	  comma = "";
	  for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
	    if (BB_GET_SESE (block)->color == ix)
	      {
		block->flags |= BB_VISITED;
		fprintf (dump_file, "%s%d", comma, block->index);
		comma=",";
	      }
	  fprintf (dump_file, "}");
	  comma = ", ";
	}
      fprintf (dump_file, "\n");
    }

  /* Now we've colored every block in the subgraph.  We now need to
     determine the minimal set of SESE regions that cover that
     subgraph.  Do this with a DFS walk of the complete function.
     During the walk we're either 'looking' or 'coloring'.  When we
     reach the last node of a particular color, we stop coloring and
     return to looking.  */

  /* There cannot be more SESE regions than colors.  */
  regions.reserve (color_counts.length ());
  for (ix = color_counts.length (); ix--;)
    regions.quick_push (bb_pair_t (0, 0));

  /* Clear the visited marks again; nvptx_sese_color uses BB_VISITED
     to detect revisits during its own walk.  */
  for (ix = 0; blocks.iterate (ix, &block); ix++)
    block->flags &= ~BB_VISITED;

  nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);

  if (dump_file)
    {
      const char *comma = "";
      int len = regions.length ();

      fprintf (dump_file, "SESE regions:");
      for (ix = 0; ix != len; ix++)
	{
	  basic_block from = regions[ix].first;
	  basic_block to = regions[ix].second;

	  if (from)
	    {
	      fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
	      if (to != from)
		fprintf (dump_file, "->%d", to->index);

	      int color = BB_GET_SESE (from)->color;

	      /* Print the blocks within the region (excluding ends).  */
	      FOR_EACH_BB_FN (block, cfun)
		{
		  bb_sese *sese = BB_GET_SESE (block);

		  if (sese && sese->color == color
		      && block != from && block != to)
		    fprintf (dump_file, ".%d", block->index);
		}
	      fprintf (dump_file, "}");
	    }
	  comma = ",";
	}
      fprintf (dump_file, "\n\n");
    }

  /* Release the per-block SESE data allocated during numbering.  */
  for (ix = 0; blocks.iterate (ix, &block); ix++)
    delete BB_GET_SESE (block);
}
4410 1.1 mrg
4411 1.1 mrg #undef BB_SET_SESE
4412 1.1 mrg #undef BB_GET_SESE
4413 1.1 mrg
/* Propagate live state at the start of a partitioned region.  IS_CALL
   indicates whether the propagation is for a (partitioned) call
   instruction.  BLOCK provides the live register information, and
   might not contain INSN.  Propagation is inserted just after INSN.  RW
   indicates whether we are reading and/or writing state.  This
   separation is needed for worker-level propagation where we
   essentially do a spill & fill.  FN is the underlying worker
   function to generate the propagation instructions for single
   register.  DATA is user data.

   Returns true if we didn't emit any instructions.

   We propagate the live register set for non-calls and the entire
   frame for calls and non-calls.  We could do better by (a)
   propagating just the live set that is used within the partitioned
   regions and (b) only propagating stack entries that are used.  The
   latter might be quite hard to determine.  */

typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool);

static bool
nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
		 propagate_mask rw, propagator_fn fn, void *data, bool vector)
{
  bitmap live = DF_LIVE_IN (block);
  bitmap_iterator iterator;
  unsigned ix;
  bool empty = true;

  /* Copy the frame array.  */
  HOST_WIDE_INT fs = get_frame_size ();
  if (fs)
    {
      rtx tmp = gen_reg_rtx (DImode);
      rtx idx = NULL_RTX;
      rtx ptr = gen_reg_rtx (Pmode);
      rtx pred = NULL_RTX;
      rtx_code_label *label = NULL;

      empty = false;
      /* The frame size might not be DImode compatible, but the frame
	 array's declaration will be.  So it's ok to round up here.  */
      fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
      /* Detect single iteration loop.  */
      if (fs == 1)
	fs = 0;

      /* Emit a (possibly looping) copy of the frame, one DImode word
	 at a time, as a separate sequence spliced in after INSN.  */
      start_sequence ();
      emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
      if (fs)
	{
	  /* Multi-word frame: build a countdown loop over IDX.  */
	  idx = gen_reg_rtx (SImode);
	  pred = gen_reg_rtx (BImode);
	  label = gen_label_rtx ();

	  emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
	  /* Allow worker function to initialize anything needed.  */
	  rtx init = fn (tmp, PM_loop_begin, fs, data, vector);
	  if (init)
	    emit_insn (init);
	  emit_label (label);
	  LABEL_NUSES (label)++;
	  emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
	}
      if (rw & PM_read)
	emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
      emit_insn (fn (tmp, rw, fs, data, vector));
      if (rw & PM_write)
	emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
      if (fs)
	{
	  /* Loop back-edge: advance PTR and branch while IDX != 0.  */
	  emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
	  emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
	  emit_insn (gen_br_true_uni (pred, label));
	  rtx fini = fn (tmp, PM_loop_end, fs, data, vector);
	  if (fini)
	    emit_insn (fini);
	  emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
	}
      /* Clobber the temporaries so later passes know their values are
	 dead after the copy.  */
      emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
      emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
      rtx cpy = get_insns ();
      end_sequence ();
      insn = emit_insn_after (cpy, insn);
    }

  if (!is_call)
    /* Copy live registers.  */
    EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
      {
	rtx reg = regno_reg_rtx[ix];

	if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
	  {
	    rtx bcast = fn (reg, rw, 0, data, vector);

	    insn = emit_insn_after (bcast, insn);
	    empty = false;
	  }
      }
  return empty;
}
4516 1.1 mrg
4517 1.1 mrg /* Worker for nvptx_warp_propagate. */
4518 1.1 mrg
4519 1.1 mrg static rtx
4520 1.1 mrg warp_prop_gen (rtx reg, propagate_mask pm,
4521 1.1 mrg unsigned ARG_UNUSED (count), void *ARG_UNUSED (data),
4522 1.1 mrg bool ARG_UNUSED (vector))
4523 1.1 mrg {
4524 1.1 mrg if (!(pm & PM_read_write))
4525 1.1 mrg return 0;
4526 1.1 mrg
4527 1.1 mrg return nvptx_gen_warp_bcast (reg);
4528 1.1 mrg }
4529 1.1 mrg
4530 1.1 mrg /* Propagate state that is live at start of BLOCK across the vectors
4531 1.1 mrg of a single warp. Propagation is inserted just after INSN.
4532 1.1 mrg IS_CALL and return as for nvptx_propagate. */
4533 1.1 mrg
4534 1.1 mrg static bool
4535 1.1 mrg nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn)
4536 1.1 mrg {
4537 1.1 mrg return nvptx_propagate (is_call, block, insn, PM_read_write,
4538 1.1 mrg warp_prop_gen, 0, false);
4539 1.1 mrg }
4540 1.1 mrg
4541 1.1 mrg /* Worker for nvptx_shared_propagate. */
4542 1.1 mrg
4543 1.1 mrg static rtx
4544 1.1 mrg shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_,
4545 1.1 mrg bool vector)
4546 1.1 mrg {
4547 1.1 mrg broadcast_data_t *data = (broadcast_data_t *)data_;
4548 1.1 mrg
4549 1.1 mrg if (pm & PM_loop_begin)
4550 1.1 mrg {
4551 1.1 mrg /* Starting a loop, initialize pointer. */
4552 1.1 mrg unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
4553 1.1 mrg
4554 1.1 mrg oacc_bcast_align = MAX (oacc_bcast_align, align);
4555 1.1 mrg data->offset = ROUND_UP (data->offset, align);
4556 1.1 mrg
4557 1.1 mrg data->ptr = gen_reg_rtx (Pmode);
4558 1.1 mrg
4559 1.1 mrg return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
4560 1.1 mrg }
4561 1.1 mrg else if (pm & PM_loop_end)
4562 1.1 mrg {
4563 1.1 mrg rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
4564 1.1 mrg data->ptr = NULL_RTX;
4565 1.1 mrg return clobber;
4566 1.1 mrg }
4567 1.1 mrg else
4568 1.1 mrg return nvptx_gen_shared_bcast (reg, pm, rep, data, vector);
4569 1.1 mrg }
4570 1.1 mrg
/* Spill or fill live state that is live at start of BLOCK.  PRE_P
   indicates if this is just before partitioned mode (do spill), or
   just after it starts (do fill).  Sequence is inserted just after
   INSN.  IS_CALL and return as for nvptx_propagate.  */

static bool
nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
			rtx_insn *insn, bool vector)
{
  broadcast_data_t data;

  data.base = gen_reg_rtx (Pmode);
  data.offset = 0;
  data.ptr = NULL_RTX;

  /* PM_read spills into the buffer, PM_write fills from it.  */
  bool empty = nvptx_propagate (is_call, block, insn,
				pre_p ? PM_read : PM_write, shared_prop_gen,
				&data, vector);
  /* Instructions were emitted iff buffer space was consumed.  */
  gcc_assert (empty == !data.offset);
  if (data.offset)
    {
      rtx bcast_sym = oacc_bcast_sym;

      /* Stuff was emitted, initialize the base pointer now.  */
      if (vector && nvptx_mach_max_workers () > 1)
	{
	  /* Vector-level propagation with multiple workers uses a
	     per-worker partition of the buffer instead of the shared
	     symbol, plus a dedicated sync barrier.  */
	  if (!cfun->machine->bcast_partition)
	    {
	      /* It would be nice to place this register in
		 DATA_AREA_SHARED.  */
	      cfun->machine->bcast_partition = gen_reg_rtx (DImode);
	    }
	  if (!cfun->machine->sync_bar)
	    cfun->machine->sync_bar = gen_reg_rtx (SImode);

	  bcast_sym = cfun->machine->bcast_partition;
	}

      rtx init = gen_rtx_SET (data.base, bcast_sym);
      emit_insn_after (init, insn);

      /* Grow the global buffer-size bookkeeping: PSIZE bytes per
	 partition, PNUM partitions when vectors are wider than a
	 warp.  */
      unsigned int psize = ROUND_UP (data.offset, oacc_bcast_align);
      unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
			   ? nvptx_mach_max_workers () + 1
			   : 1);

      oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
      oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
    }
  return empty;
}
4622 1.1 mrg
4623 1.1 mrg /* Emit a CTA-level synchronization barrier. LOCK is the barrier number,
4624 1.1 mrg which is an integer or a register. THREADS is the number of threads
4625 1.1 mrg controlled by the barrier. */
4626 1.1 mrg
4627 1.1 mrg static rtx
4628 1.1 mrg nvptx_cta_sync (rtx lock, int threads)
4629 1.1 mrg {
4630 1.1 mrg return gen_nvptx_barsync (lock, GEN_INT (threads));
4631 1.1 mrg }
4632 1.1 mrg
4633 1.1 mrg #if WORKAROUND_PTXJIT_BUG
4634 1.1 mrg /* Return first real insn in BB, or return NULL_RTX if BB does not contain
4635 1.1 mrg real insns. */
4636 1.1 mrg
4637 1.1 mrg static rtx_insn *
4638 1.1 mrg bb_first_real_insn (basic_block bb)
4639 1.1 mrg {
4640 1.1 mrg rtx_insn *insn;
4641 1.1 mrg
4642 1.1 mrg /* Find first insn of from block. */
4643 1.1 mrg FOR_BB_INSNS (bb, insn)
4644 1.1 mrg if (INSN_P (insn))
4645 1.1 mrg return insn;
4646 1.1 mrg
4647 1.1 mrg return 0;
4648 1.1 mrg }
4649 1.1 mrg #endif
4650 1.1 mrg
4651 1.1 mrg /* Return true if INSN needs neutering. */
4652 1.1 mrg
4653 1.1 mrg static bool
4654 1.1 mrg needs_neutering_p (rtx_insn *insn)
4655 1.1 mrg {
4656 1.1 mrg if (!INSN_P (insn))
4657 1.1 mrg return false;
4658 1.1 mrg
4659 1.1 mrg switch (recog_memoized (insn))
4660 1.1 mrg {
4661 1.1 mrg case CODE_FOR_nvptx_fork:
4662 1.1 mrg case CODE_FOR_nvptx_forked:
4663 1.1 mrg case CODE_FOR_nvptx_joining:
4664 1.1 mrg case CODE_FOR_nvptx_join:
4665 1.1 mrg case CODE_FOR_nvptx_barsync:
4666 1.1 mrg return false;
4667 1.1 mrg default:
4668 1.1 mrg return true;
4669 1.1 mrg }
4670 1.1 mrg }
4671 1.1 mrg
/* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM.
   Walk forward from the head of FROM checking that the neutering
   jumps open before their matching labels close, and that no barrier
   sync occurs inside a neutered range.  Returns true if either label
   was seen during the walk (i.e. the neutered range closes within
   FROM or its fall-through successor).  */

static bool
verify_neutering_jumps (basic_block from,
			rtx_insn *vector_jump, rtx_insn *worker_jump,
			rtx_insn *vector_label, rtx_insn *worker_label)
{
  basic_block bb = from;
  rtx_insn *insn = BB_HEAD (bb);
  bool seen_worker_jump = false;
  bool seen_vector_jump = false;
  bool seen_worker_label = false;
  bool seen_vector_label = false;
  bool worker_neutered = false;
  bool vector_neutered = false;
  while (true)
    {
      if (insn == worker_jump)
	{
	  seen_worker_jump = true;
	  worker_neutered = true;
	  /* Worker neutering must open before vector neutering.  */
	  gcc_assert (!vector_neutered);
	}
      else if (insn == vector_jump)
	{
	  seen_vector_jump = true;
	  vector_neutered = true;
	}
      else if (insn == worker_label)
	{
	  seen_worker_label = true;
	  gcc_assert (worker_neutered);
	  worker_neutered = false;
	}
      else if (insn == vector_label)
	{
	  seen_vector_label = true;
	  gcc_assert (vector_neutered);
	  vector_neutered = false;
	}
      else if (INSN_P (insn))
	switch (recog_memoized (insn))
	  {
	  case CODE_FOR_nvptx_barsync:
	    /* A barrier inside a neutered range would deadlock the
	       skipped threads.  */
	    gcc_assert (!vector_neutered && !worker_neutered);
	    break;
	  default:
	    break;
	  }

      if (insn != BB_END (bb))
	insn = NEXT_INSN (insn);
      else if (JUMP_P (insn) && single_succ_p (bb)
	       && !seen_vector_jump && !seen_worker_jump)
	{
	  /* Haven't found the jumps yet: follow the unconditional
	     branch into the single successor and keep scanning.  */
	  bb = single_succ (bb);
	  insn = BB_HEAD (bb);
	}
      else
	break;
    }

  gcc_assert (!(vector_jump && !seen_vector_jump));
  gcc_assert (!(worker_jump && !seen_worker_jump));

  if (seen_vector_label || seen_worker_label)
    {
      gcc_assert (!(vector_label && !seen_vector_label));
      gcc_assert (!(worker_label && !seen_worker_label));

      return true;
    }

  return false;
}
4747 1.1 mrg
4748 1.1 mrg /* Verify position of VECTOR_LABEL and WORKER_LABEL in TO. */
4749 1.1 mrg
4750 1.1 mrg static void
4751 1.1 mrg verify_neutering_labels (basic_block to, rtx_insn *vector_label,
4752 1.1 mrg rtx_insn *worker_label)
4753 1.1 mrg {
4754 1.1 mrg basic_block bb = to;
4755 1.1 mrg rtx_insn *insn = BB_END (bb);
4756 1.1 mrg bool seen_worker_label = false;
4757 1.1 mrg bool seen_vector_label = false;
4758 1.1 mrg while (true)
4759 1.1 mrg {
4760 1.1 mrg if (insn == worker_label)
4761 1.1 mrg {
4762 1.1 mrg seen_worker_label = true;
4763 1.1 mrg gcc_assert (!seen_vector_label);
4764 1.1 mrg }
4765 1.1 mrg else if (insn == vector_label)
4766 1.1 mrg seen_vector_label = true;
4767 1.1 mrg else if (INSN_P (insn))
4768 1.1 mrg switch (recog_memoized (insn))
4769 1.1 mrg {
4770 1.1 mrg case CODE_FOR_nvptx_barsync:
4771 1.1 mrg gcc_assert (!seen_vector_label && !seen_worker_label);
4772 1.1 mrg break;
4773 1.1 mrg }
4774 1.1 mrg
4775 1.1 mrg if (insn != BB_HEAD (bb))
4776 1.1 mrg insn = PREV_INSN (insn);
4777 1.1 mrg else
4778 1.1 mrg break;
4779 1.1 mrg }
4780 1.1 mrg
4781 1.1 mrg gcc_assert (!(vector_label && !seen_vector_label));
4782 1.1 mrg gcc_assert (!(worker_label && !seen_worker_label));
4783 1.1 mrg }
4784 1.1 mrg
/* Single neutering according to MASK.  FROM is the incoming block and
   TO is the outgoing block.  These may be the same block.  Insert at
   start of FROM:

     if (tid.<axis>) goto end.

   and insert before ending branch of TO (if there is such an insn):

     end:
     <possibly-broadcast-cond>
     <branch>

   We currently only use different FROM and TO when skipping an entire
   loop.  We could do more if we detected superblocks.  */

static void
nvptx_single (unsigned mask, basic_block from, basic_block to)
{
  rtx_insn *head = BB_HEAD (from);
  rtx_insn *tail = BB_END (to);
  unsigned skip_mask = mask;

  while (true)
    {
      /* Find first insn of from block.  */
      while (head != BB_END (from) && !needs_neutering_p (head))
	head = NEXT_INSN (head);

      if (from == to)
	break;

      /* FROM ends in an unconditional jump; if its single successor
	 has no other predecessors, extend the region into it.  */
      if (!(JUMP_P (head) && single_succ_p (from)))
	break;

      basic_block jump_target = single_succ (from);
      if (!single_pred_p (jump_target))
	break;

      from = jump_target;
      head = BB_HEAD (from);
    }

  /* Find last insn of to block */
  rtx_insn *limit = from == to ? head : BB_HEAD (to);
  while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
    tail = PREV_INSN (tail);

  /* Detect if tail is a branch.  */
  rtx tail_branch = NULL_RTX;
  rtx cond_branch = NULL_RTX;
  if (tail && INSN_P (tail))
    {
      tail_branch = PATTERN (tail);
      if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
	tail_branch = NULL_RTX;
      else
	{
	  cond_branch = SET_SRC (tail_branch);
	  if (GET_CODE (cond_branch) != IF_THEN_ELSE)
	    cond_branch = NULL_RTX;
	}
    }

  if (tail == head)
    {
      /* If this is empty, do nothing.  */
      if (!head || !needs_neutering_p (head))
	return;

      if (cond_branch)
	{
	  /* If we're only doing vector single, there's no need to
	     emit skip code because we'll not insert anything.  */
	  if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
	    skip_mask = 0;
	}
      else if (tail_branch)
	/* Block with only unconditional branch.  Nothing to do.  */
	return;
    }

  /* Insert the vector test inside the worker test.  */
  unsigned mode;
  rtx_insn *before = tail;
  rtx_insn *neuter_start = NULL;
  rtx_insn *worker_label = NULL, *vector_label = NULL;
  rtx_insn *worker_jump = NULL, *vector_jump = NULL;
  rtx_insn *warp_sync = NULL;
  for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
    if (GOMP_DIM_MASK (mode) & skip_mask)
      {
	rtx_code_label *label = gen_label_rtx ();
	rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
	rtx_insn **mode_jump
	  = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
	rtx_insn **mode_label
	  = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;

	if (!pred)
	  {
	    pred = gen_reg_rtx (BImode);
	    cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
	  }

	/* Branch over the neutered code when the thread is not the
	   distinguished one on this axis.  */
	rtx br;
	if (mode == GOMP_DIM_VECTOR)
	  br = gen_br_true (pred, label);
	else
	  br = gen_br_true_uni (pred, label);
	if (neuter_start)
	  neuter_start = emit_insn_after (br, neuter_start);
	else
	  neuter_start = emit_insn_before (br, head);
	*mode_jump = neuter_start;

	LABEL_NUSES (label)++;
	rtx_insn *label_insn;
	if (tail_branch)
	  {
	    /* The branch condition is handled separately below; the
	       skip label goes before the branch.  */
	    label_insn = emit_label_before (label, before);
	    if (mode == GOMP_DIM_VECTOR)
	      {
		/* Re-sync the diverged warp before continuing.  */
		if (TARGET_PTX_6_0)
		  warp_sync = emit_insn_after (gen_nvptx_warpsync (),
					       label_insn);
		else
		  warp_sync = emit_insn_after (gen_nvptx_uniform_warp_check (),
					       label_insn);
	      }
	    before = label_insn;
	  }
	else
	  {
	    label_insn = emit_label_after (label, tail);
	    if (mode == GOMP_DIM_VECTOR)
	      {
		if (TARGET_PTX_6_0)
		  warp_sync = emit_insn_after (gen_nvptx_warpsync (),
					       label_insn);
		else
		  warp_sync = emit_insn_after (gen_nvptx_uniform_warp_check (),
					       label_insn);
	      }
	    /* A no-return call only executes in the active thread;
	       make the neutered threads exit too.  */
	    if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
		&& CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
	      emit_insn_after (gen_exit (), label_insn);
	  }

	*mode_label = label_insn;
      }

  /* Now deal with propagating the branch condition.  */
  if (cond_branch)
    {
      rtx pvar = XEXP (XEXP (cond_branch, 0), 0);

      if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask
	  && nvptx_mach_vector_length () == PTX_WARP_SIZE)
	{
	  /* Vector mode only, do a shuffle.  */
#if WORKAROUND_PTXJIT_BUG
	  /* The branch condition %rcond is propagated like this:

		{
		    .reg .u32 %x;
		    mov.u32 %x,%tid.x;
		    setp.ne.u32 %rnotvzero,%x,0;
		}

		@%rnotvzero bra Lskip;
		setp.<op>.<type> %rcond,op1,op2;
		Lskip:
		selp.u32 %rcondu32,1,0,%rcond;
		shfl.idx.b32 %rcondu32,%rcondu32,0,31;
		setp.ne.u32 %rcond,%rcondu32,0;

	     There seems to be a bug in the ptx JIT compiler (observed at driver
	     version 381.22, at -O1 and higher for sm_61), that drops the shfl
	     unless %rcond is initialized to something before 'bra Lskip'.  The
	     bug is not observed with ptxas from cuda 8.0.61.

	     It is true that the code is non-trivial: at Lskip, %rcond is
	     uninitialized in threads 1-31, and after the selp the same holds
	     for %rcondu32.  But shfl propagates the defined value in thread 0
	     to threads 1-31, so after the shfl %rcondu32 is defined in threads
	     0-31, and after the setp.ne %rcond is defined in threads 0-31.

	     There is nothing in the PTX spec to suggest that this is wrong, or
	     to explain why the extra initialization is needed.  So, we classify
	     it as a JIT bug, and the extra initialization as workaround:

		{
		    .reg .u32 %x;
		    mov.u32 %x,%tid.x;
		    setp.ne.u32 %rnotvzero,%x,0;
		}

		+.reg .pred %rcond2;
		+setp.eq.u32 %rcond2, 1, 0;

		@%rnotvzero bra Lskip;
		setp.<op>.<type> %rcond,op1,op2;
		+mov.pred %rcond2, %rcond;
		Lskip:
		+mov.pred %rcond, %rcond2;
		selp.u32 %rcondu32,1,0,%rcond;
		shfl.idx.b32 %rcondu32,%rcondu32,0,31;
		setp.ne.u32 %rcond,%rcondu32,0;
	  */
	  rtx_insn *label = PREV_INSN (tail);
	  if (label == warp_sync)
	    label = PREV_INSN (label);
	  gcc_assert (label && LABEL_P (label));
	  rtx tmp = gen_reg_rtx (BImode);
	  emit_insn_before (gen_movbi (tmp, const0_rtx),
			    bb_first_real_insn (from));
	  emit_insn_before (gen_rtx_SET (tmp, pvar), label);
	  emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
#endif
	  emit_insn_before (nvptx_gen_warp_bcast (pvar), tail);
	}
      else
	{
	  /* Includes worker mode, do spill & fill.  By construction
	     we should never have worker mode only.  */
	  broadcast_data_t data;
	  unsigned size = GET_MODE_SIZE (SImode);
	  bool vector = (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) != 0;
	  bool worker = (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask) != 0;
	  rtx barrier = GEN_INT (0);
	  int threads = 0;

	  data.base = oacc_bcast_sym;
	  data.ptr = 0;

	  bool use_partitioning_p = (vector && !worker
				     && nvptx_mach_max_workers () > 1
				     && cfun->machine->bcast_partition);
	  if (use_partitioning_p)
	    {
	      data.base = cfun->machine->bcast_partition;
	      barrier = cfun->machine->sync_bar;
	      threads = nvptx_mach_vector_length ();
	    }
	  gcc_assert (data.base != NULL);
	  gcc_assert (barrier);

	  unsigned int psize = ROUND_UP (size, oacc_bcast_align);
	  unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
			       ? nvptx_mach_max_workers () + 1
			       : 1);

	  oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
	  oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);

	  data.offset = 0;
	  emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
						    vector),
			    before);

	  /* Barrier so other workers can see the write.  */
	  emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
	  data.offset = 0;
	  emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
						    vector),
			    tail);
	  /* This barrier is needed to avoid worker zero clobbering
	     the broadcast buffer before all the other workers have
	     had a chance to read this instance of it.  */
	  emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
	}

      /* Rewrite the branch to use the now-uniform condition.  */
      extract_insn (tail);
      rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
				 UNSPEC_BR_UNIFIED);
      validate_change (tail, recog_data.operand_loc[0], unsp, false);
    }

  bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump,
					    vector_label, worker_label);
  if (!seen_label)
    verify_neutering_labels (to, vector_label, worker_label);
}
5068 1.1 mrg
5069 1.1 mrg /* PAR is a parallel that is being skipped in its entirety according to
5070 1.1 mrg MASK. Treat this as skipping a superblock starting at forked
5071 1.1 mrg and ending at joining. */
5072 1.1 mrg
5073 1.1 mrg static void
5074 1.1 mrg nvptx_skip_par (unsigned mask, parallel *par)
5075 1.1 mrg {
5076 1.1 mrg basic_block tail = par->join_block;
5077 1.1 mrg gcc_assert (tail->preds->length () == 1);
5078 1.1 mrg
5079 1.1 mrg basic_block pre_tail = (*tail->preds)[0]->src;
5080 1.1 mrg gcc_assert (pre_tail->succs->length () == 1);
5081 1.1 mrg
5082 1.1 mrg nvptx_single (mask, par->forked_block, pre_tail);
5083 1.1 mrg }
5084 1.1 mrg
5085 1.1 mrg /* If PAR has a single inner parallel and PAR itself only contains
5086 1.1 mrg empty entry and exit blocks, swallow the inner PAR. */
5087 1.1 mrg
5088 1.1 mrg static void
5089 1.1 mrg nvptx_optimize_inner (parallel *par)
5090 1.1 mrg {
5091 1.1 mrg parallel *inner = par->inner;
5092 1.1 mrg
5093 1.1 mrg /* We mustn't be the outer dummy par. */
5094 1.1 mrg if (!par->mask)
5095 1.1 mrg return;
5096 1.1 mrg
5097 1.1 mrg /* We must have a single inner par. */
5098 1.1 mrg if (!inner || inner->next)
5099 1.1 mrg return;
5100 1.1 mrg
5101 1.1 mrg /* We must only contain 2 blocks ourselves -- the head and tail of
5102 1.1 mrg the inner par. */
5103 1.1 mrg if (par->blocks.length () != 2)
5104 1.1 mrg return;
5105 1.1 mrg
5106 1.1 mrg /* We must be disjoint partitioning. As we only have vector and
5107 1.1 mrg worker partitioning, this is sufficient to guarantee the pars
5108 1.1 mrg have adjacent partitioning. */
5109 1.1 mrg if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
5110 1.1 mrg /* This indicates malformed code generation. */
5111 1.1 mrg return;
5112 1.1 mrg
5113 1.1 mrg /* The outer forked insn should be immediately followed by the inner
5114 1.1 mrg fork insn. */
5115 1.1 mrg rtx_insn *forked = par->forked_insn;
5116 1.1 mrg rtx_insn *fork = BB_END (par->forked_block);
5117 1.1 mrg
5118 1.1 mrg if (NEXT_INSN (forked) != fork)
5119 1.1 mrg return;
5120 1.1 mrg gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
5121 1.1 mrg
5122 1.1 mrg /* The outer joining insn must immediately follow the inner join
5123 1.1 mrg insn. */
5124 1.1 mrg rtx_insn *joining = par->joining_insn;
5125 1.1 mrg rtx_insn *join = inner->join_insn;
5126 1.1 mrg if (NEXT_INSN (join) != joining)
5127 1.1 mrg return;
5128 1.1 mrg
5129 1.1 mrg /* Preconditions met. Swallow the inner par. */
5130 1.1 mrg if (dump_file)
5131 1.1 mrg fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
5132 1.1 mrg inner->mask, inner->forked_block->index,
5133 1.1 mrg inner->join_block->index,
5134 1.1 mrg par->mask, par->forked_block->index, par->join_block->index);
5135 1.1 mrg
5136 1.1 mrg par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
5137 1.1 mrg
5138 1.1 mrg par->blocks.reserve (inner->blocks.length ());
5139 1.1 mrg while (inner->blocks.length ())
5140 1.1 mrg par->blocks.quick_push (inner->blocks.pop ());
5141 1.1 mrg
5142 1.1 mrg par->inner = inner->inner;
5143 1.1 mrg inner->inner = NULL;
5144 1.1 mrg
5145 1.1 mrg delete inner;
5146 1.1 mrg }
5147 1.1 mrg
5148 1.1 mrg /* Process the parallel PAR and all its contained
5149 1.1 mrg parallels. We do everything but the neutering. Return mask of
5150 1.1 mrg partitioned modes used within this parallel. */
5151 1.1 mrg
5152 1.1 mrg static unsigned
5153 1.1 mrg nvptx_process_pars (parallel *par)
5154 1.1 mrg {
5155 1.1 mrg if (nvptx_optimize)
5156 1.1 mrg nvptx_optimize_inner (par);
5157 1.1 mrg
5158 1.1 mrg unsigned inner_mask = par->mask;
5159 1.1 mrg
5160 1.1 mrg /* Do the inner parallels first. */
5161 1.1 mrg if (par->inner)
5162 1.1 mrg {
5163 1.1 mrg par->inner_mask = nvptx_process_pars (par->inner);
5164 1.1 mrg inner_mask |= par->inner_mask;
5165 1.1 mrg }
5166 1.1 mrg
5167 1.1 mrg bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
5168 1.1 mrg bool worker = (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER));
5169 1.1 mrg bool large_vector = ((par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
5170 1.1 mrg && nvptx_mach_vector_length () > PTX_WARP_SIZE);
5171 1.1 mrg
5172 1.1 mrg if (worker || large_vector)
5173 1.1 mrg {
5174 1.1 mrg nvptx_shared_propagate (false, is_call, par->forked_block,
5175 1.1 mrg par->forked_insn, !worker);
5176 1.1 mrg bool no_prop_p
5177 1.1 mrg = nvptx_shared_propagate (true, is_call, par->forked_block,
5178 1.1 mrg par->fork_insn, !worker);
5179 1.1 mrg bool empty_loop_p
5180 1.1 mrg = !is_call && (NEXT_INSN (par->forked_insn)
5181 1.1 mrg && NEXT_INSN (par->forked_insn) == par->joining_insn);
5182 1.1 mrg rtx barrier = GEN_INT (0);
5183 1.1 mrg int threads = 0;
5184 1.1 mrg
5185 1.1 mrg if (!worker && cfun->machine->sync_bar)
5186 1.1 mrg {
5187 1.1 mrg barrier = cfun->machine->sync_bar;
5188 1.1 mrg threads = nvptx_mach_vector_length ();
5189 1.1 mrg }
5190 1.1 mrg
5191 1.1 mrg if (no_prop_p && empty_loop_p)
5192 1.1 mrg ;
5193 1.1 mrg else if (no_prop_p && is_call)
5194 1.1 mrg ;
5195 1.1 mrg else
5196 1.1 mrg {
5197 1.1 mrg /* Insert begin and end synchronizations. */
5198 1.1 mrg emit_insn_before (nvptx_cta_sync (barrier, threads),
5199 1.1 mrg par->forked_insn);
5200 1.1 mrg emit_insn_before (nvptx_cta_sync (barrier, threads), par->join_insn);
5201 1.1 mrg }
5202 1.1 mrg }
5203 1.1 mrg else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
5204 1.1 mrg nvptx_warp_propagate (is_call, par->forked_block, par->forked_insn);
5205 1.1 mrg
5206 1.1 mrg /* Now do siblings. */
5207 1.1 mrg if (par->next)
5208 1.1 mrg inner_mask |= nvptx_process_pars (par->next);
5209 1.1 mrg return inner_mask;
5210 1.1 mrg }
5211 1.1 mrg
5212 1.1 mrg /* Neuter the parallel described by PAR. We recurse in depth-first
5213 1.1 mrg order. MODES are the partitioning of the execution and OUTER is
5214 1.1 mrg the partitioning of the parallels we are contained in. */
5215 1.1 mrg
5216 1.1 mrg static void
5217 1.1 mrg nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
5218 1.1 mrg {
5219 1.1 mrg unsigned me = (par->mask
5220 1.1 mrg & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
5221 1.1 mrg | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
5222 1.1 mrg unsigned skip_mask = 0, neuter_mask = 0;
5223 1.1 mrg
5224 1.1 mrg if (par->inner)
5225 1.1 mrg nvptx_neuter_pars (par->inner, modes, outer | me);
5226 1.1 mrg
5227 1.1 mrg for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
5228 1.1 mrg {
5229 1.1 mrg if ((outer | me) & GOMP_DIM_MASK (mode))
5230 1.1 mrg {} /* Mode is partitioned: no neutering. */
5231 1.1 mrg else if (!(modes & GOMP_DIM_MASK (mode)))
5232 1.1 mrg {} /* Mode is not used: nothing to do. */
5233 1.1 mrg else if (par->inner_mask & GOMP_DIM_MASK (mode)
5234 1.1 mrg || !par->forked_insn)
5235 1.1 mrg /* Partitioned in inner parallels, or we're not a partitioned
5236 1.1 mrg at all: neuter individual blocks. */
5237 1.1 mrg neuter_mask |= GOMP_DIM_MASK (mode);
5238 1.1 mrg else if (!par->parent || !par->parent->forked_insn
5239 1.1 mrg || par->parent->inner_mask & GOMP_DIM_MASK (mode))
5240 1.1 mrg /* Parent isn't a parallel or contains this paralleling: skip
5241 1.1 mrg parallel at this level. */
5242 1.1 mrg skip_mask |= GOMP_DIM_MASK (mode);
5243 1.1 mrg else
5244 1.1 mrg {} /* Parent will skip this parallel itself. */
5245 1.1 mrg }
5246 1.1 mrg
5247 1.1 mrg if (neuter_mask)
5248 1.1 mrg {
5249 1.1 mrg int ix, len;
5250 1.1 mrg
5251 1.1 mrg if (nvptx_optimize)
5252 1.1 mrg {
5253 1.1 mrg /* Neuter whole SESE regions. */
5254 1.1 mrg bb_pair_vec_t regions;
5255 1.1 mrg
5256 1.1 mrg nvptx_find_sese (par->blocks, regions);
5257 1.1 mrg len = regions.length ();
5258 1.1 mrg for (ix = 0; ix != len; ix++)
5259 1.1 mrg {
5260 1.1 mrg basic_block from = regions[ix].first;
5261 1.1 mrg basic_block to = regions[ix].second;
5262 1.1 mrg
5263 1.1 mrg if (from)
5264 1.1 mrg nvptx_single (neuter_mask, from, to);
5265 1.1 mrg else
5266 1.1 mrg gcc_assert (!to);
5267 1.1 mrg }
5268 1.1 mrg }
5269 1.1 mrg else
5270 1.1 mrg {
5271 1.1 mrg /* Neuter each BB individually. */
5272 1.1 mrg len = par->blocks.length ();
5273 1.1 mrg for (ix = 0; ix != len; ix++)
5274 1.1 mrg {
5275 1.1 mrg basic_block block = par->blocks[ix];
5276 1.1 mrg
5277 1.1 mrg nvptx_single (neuter_mask, block, block);
5278 1.1 mrg }
5279 1.1 mrg }
5280 1.1 mrg }
5281 1.1 mrg
5282 1.1 mrg if (skip_mask)
5283 1.1 mrg nvptx_skip_par (skip_mask, par);
5284 1.1 mrg
5285 1.1 mrg if (par->next)
5286 1.1 mrg nvptx_neuter_pars (par->next, modes, outer);
5287 1.1 mrg }
5288 1.1 mrg
5289 1.1 mrg static void
5290 1.1 mrg populate_offload_attrs (offload_attrs *oa)
5291 1.1 mrg {
5292 1.1 mrg tree attr = oacc_get_fn_attrib (current_function_decl);
5293 1.1 mrg tree dims = TREE_VALUE (attr);
5294 1.1 mrg unsigned ix;
5295 1.1 mrg
5296 1.1 mrg oa->mask = 0;
5297 1.1 mrg
5298 1.1 mrg for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
5299 1.1 mrg {
5300 1.1 mrg tree t = TREE_VALUE (dims);
5301 1.1 mrg int size = (t == NULL_TREE) ? -1 : TREE_INT_CST_LOW (t);
5302 1.1 mrg tree allowed = TREE_PURPOSE (dims);
5303 1.1 mrg
5304 1.1 mrg if (size != 1 && !(allowed && integer_zerop (allowed)))
5305 1.1 mrg oa->mask |= GOMP_DIM_MASK (ix);
5306 1.1 mrg
5307 1.1 mrg switch (ix)
5308 1.1 mrg {
5309 1.1 mrg case GOMP_DIM_GANG:
5310 1.1 mrg oa->num_gangs = size;
5311 1.1 mrg break;
5312 1.1 mrg
5313 1.1 mrg case GOMP_DIM_WORKER:
5314 1.1 mrg oa->num_workers = size;
5315 1.1 mrg break;
5316 1.1 mrg
5317 1.1 mrg case GOMP_DIM_VECTOR:
5318 1.1 mrg oa->vector_length = size;
5319 1.1 mrg break;
5320 1.1 mrg }
5321 1.1 mrg }
5322 1.1 mrg }
5323 1.1 mrg
5324 1.1 mrg #if WORKAROUND_PTXJIT_BUG_2
5325 1.1 mrg /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant
5326 1.1 mrg is needed in the nvptx target because the branches generated for
5327 1.1 mrg parititioning are NONJUMP_INSN_P, not JUMP_P. */
5328 1.1 mrg
5329 1.1 mrg static rtx
5330 1.1 mrg nvptx_pc_set (const rtx_insn *insn, bool strict = true)
5331 1.1 mrg {
5332 1.1 mrg rtx pat;
5333 1.1 mrg if ((strict && !JUMP_P (insn))
5334 1.1 mrg || (!strict && !INSN_P (insn)))
5335 1.1 mrg return NULL_RTX;
5336 1.1 mrg pat = PATTERN (insn);
5337 1.1 mrg
5338 1.1 mrg /* The set is allowed to appear either as the insn pattern or
5339 1.1 mrg the first set in a PARALLEL. */
5340 1.1 mrg if (GET_CODE (pat) == PARALLEL)
5341 1.1 mrg pat = XVECEXP (pat, 0, 0);
5342 1.1 mrg if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
5343 1.1 mrg return pat;
5344 1.1 mrg
5345 1.1 mrg return NULL_RTX;
5346 1.1 mrg }
5347 1.1 mrg
5348 1.1 mrg /* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT. */
5349 1.1 mrg
5350 1.1 mrg static rtx
5351 1.1 mrg nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
5352 1.1 mrg {
5353 1.1 mrg rtx x = nvptx_pc_set (insn, strict);
5354 1.1 mrg
5355 1.1 mrg if (!x)
5356 1.1 mrg return NULL_RTX;
5357 1.1 mrg x = SET_SRC (x);
5358 1.1 mrg if (GET_CODE (x) == LABEL_REF)
5359 1.1 mrg return x;
5360 1.1 mrg if (GET_CODE (x) != IF_THEN_ELSE)
5361 1.1 mrg return NULL_RTX;
5362 1.1 mrg if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
5363 1.1 mrg return XEXP (x, 1);
5364 1.1 mrg if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
5365 1.1 mrg return XEXP (x, 2);
5366 1.1 mrg return NULL_RTX;
5367 1.1 mrg }
5368 1.1 mrg
5369 1.1 mrg /* Insert a dummy ptx insn when encountering a branch to a label with no ptx
5370 1.1 mrg insn inbetween the branch and the label. This works around a JIT bug
5371 1.1 mrg observed at driver version 384.111, at -O0 for sm_50. */
5372 1.1 mrg
5373 1.1 mrg static void
5374 1.1 mrg prevent_branch_around_nothing (void)
5375 1.1 mrg {
5376 1.1 mrg rtx_insn *seen_label = NULL;
5377 1.1 mrg for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
5378 1.1 mrg {
5379 1.1 mrg if (INSN_P (insn) && condjump_p (insn))
5380 1.1 mrg {
5381 1.1 mrg seen_label = label_ref_label (nvptx_condjump_label (insn, false));
5382 1.1 mrg continue;
5383 1.1 mrg }
5384 1.1 mrg
5385 1.1 mrg if (seen_label == NULL)
5386 1.1 mrg continue;
5387 1.1 mrg
5388 1.1 mrg if (NOTE_P (insn) || DEBUG_INSN_P (insn))
5389 1.1 mrg continue;
5390 1.1 mrg
5391 1.1 mrg if (INSN_P (insn))
5392 1.1 mrg switch (recog_memoized (insn))
5393 1.1 mrg {
5394 1.1 mrg case CODE_FOR_nvptx_fork:
5395 1.1 mrg case CODE_FOR_nvptx_forked:
5396 1.1 mrg case CODE_FOR_nvptx_joining:
5397 1.1 mrg case CODE_FOR_nvptx_join:
5398 1.1 mrg case CODE_FOR_nop:
5399 1.1 mrg continue;
5400 1.1 mrg case -1:
5401 1.1 mrg /* Handle asm ("") and similar. */
5402 1.1 mrg if (GET_CODE (PATTERN (insn)) == ASM_INPUT
5403 1.1 mrg || GET_CODE (PATTERN (insn)) == ASM_OPERANDS
5404 1.1 mrg || (GET_CODE (PATTERN (insn)) == PARALLEL
5405 1.1 mrg && asm_noperands (PATTERN (insn)) >= 0))
5406 1.1 mrg continue;
5407 1.1 mrg /* FALLTHROUGH. */
5408 1.1 mrg default:
5409 1.1 mrg seen_label = NULL;
5410 1.1 mrg continue;
5411 1.1 mrg }
5412 1.1 mrg
5413 1.1 mrg if (LABEL_P (insn) && insn == seen_label)
5414 1.1 mrg emit_insn_before (gen_fake_nop (), insn);
5415 1.1 mrg
5416 1.1 mrg seen_label = NULL;
5417 1.1 mrg }
5418 1.1 mrg }
5419 1.1 mrg #endif
5420 1.1 mrg
5421 1.1 mrg #ifdef WORKAROUND_PTXJIT_BUG_3
5422 1.1 mrg /* Insert two membar.cta insns inbetween two subsequent bar.sync insns. This
5423 1.1 mrg works around a hang observed at driver version 390.48 for sm_50. */
5424 1.1 mrg
5425 1.1 mrg static void
5426 1.1 mrg workaround_barsyncs (void)
5427 1.1 mrg {
5428 1.1 mrg bool seen_barsync = false;
5429 1.1 mrg for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
5430 1.1 mrg {
5431 1.1 mrg if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync)
5432 1.1 mrg {
5433 1.1 mrg if (seen_barsync)
5434 1.1 mrg {
5435 1.1 mrg emit_insn_before (gen_nvptx_membar_cta (), insn);
5436 1.1 mrg emit_insn_before (gen_nvptx_membar_cta (), insn);
5437 1.1 mrg }
5438 1.1 mrg
5439 1.1 mrg seen_barsync = true;
5440 1.1 mrg continue;
5441 1.1 mrg }
5442 1.1 mrg
5443 1.1 mrg if (!seen_barsync)
5444 1.1 mrg continue;
5445 1.1 mrg
5446 1.1 mrg if (NOTE_P (insn) || DEBUG_INSN_P (insn))
5447 1.1 mrg continue;
5448 1.1 mrg else if (INSN_P (insn))
5449 1.1 mrg switch (recog_memoized (insn))
5450 1.1 mrg {
5451 1.1 mrg case CODE_FOR_nvptx_fork:
5452 1.1 mrg case CODE_FOR_nvptx_forked:
5453 1.1 mrg case CODE_FOR_nvptx_joining:
5454 1.1 mrg case CODE_FOR_nvptx_join:
5455 1.1 mrg continue;
5456 1.1 mrg default:
5457 1.1 mrg break;
5458 1.1 mrg }
5459 1.1 mrg
5460 1.1 mrg seen_barsync = false;
5461 1.1 mrg }
5462 1.1 mrg }
5463 1.1 mrg #endif
5464 1.1 mrg
5465 1.1 mrg static rtx
5466 1.1 mrg gen_comment (const char *s)
5467 1.1 mrg {
5468 1.1 mrg const char *sep = " ";
5469 1.1 mrg size_t len = strlen (ASM_COMMENT_START) + strlen (sep) + strlen (s) + 1;
5470 1.1 mrg char *comment = (char *) alloca (len);
5471 1.1 mrg snprintf (comment, len, "%s%s%s", ASM_COMMENT_START, sep, s);
5472 1.1 mrg return gen_rtx_ASM_INPUT_loc (VOIDmode, ggc_strdup (comment),
5473 1.1 mrg DECL_SOURCE_LOCATION (cfun->decl));
5474 1.1 mrg }
5475 1.1 mrg
5476 1.1 mrg /* Initialize all declared regs at function entry.
5477 1.1 mrg Advantage : Fool-proof.
5478 1.1 mrg Disadvantage: Potentially creates a lot of long live ranges and adds a lot
5479 1.1 mrg of insns. */
5480 1.1 mrg
5481 1.1 mrg static void
5482 1.1 mrg workaround_uninit_method_1 (void)
5483 1.1 mrg {
5484 1.1 mrg rtx_insn *first = get_insns ();
5485 1.1 mrg rtx_insn *insert_here = NULL;
5486 1.1 mrg
5487 1.1 mrg for (int ix = LAST_VIRTUAL_REGISTER + 1; ix < max_reg_num (); ix++)
5488 1.1 mrg {
5489 1.1 mrg rtx reg = regno_reg_rtx[ix];
5490 1.1 mrg
5491 1.1 mrg /* Skip undeclared registers. */
5492 1.1 mrg if (reg == const0_rtx)
5493 1.1 mrg continue;
5494 1.1 mrg
5495 1.1 mrg gcc_assert (CONST0_RTX (GET_MODE (reg)));
5496 1.1 mrg
5497 1.1 mrg start_sequence ();
5498 1.1 mrg if (nvptx_comment && first != NULL)
5499 1.1 mrg emit_insn (gen_comment ("Start: Added by -minit-regs=1"));
5500 1.1 mrg emit_move_insn (reg, CONST0_RTX (GET_MODE (reg)));
5501 1.1 mrg rtx_insn *inits = get_insns ();
5502 1.1 mrg end_sequence ();
5503 1.1 mrg
5504 1.1 mrg if (dump_file && (dump_flags & TDF_DETAILS))
5505 1.1 mrg for (rtx_insn *init = inits; init != NULL; init = NEXT_INSN (init))
5506 1.1 mrg fprintf (dump_file, "Default init of reg %u inserted: insn %u\n",
5507 1.1 mrg ix, INSN_UID (init));
5508 1.1 mrg
5509 1.1 mrg if (first != NULL)
5510 1.1 mrg {
5511 1.1 mrg insert_here = emit_insn_before (inits, first);
5512 1.1 mrg first = NULL;
5513 1.1 mrg }
5514 1.1 mrg else
5515 1.1 mrg insert_here = emit_insn_after (inits, insert_here);
5516 1.1 mrg }
5517 1.1 mrg
5518 1.1 mrg if (nvptx_comment && insert_here != NULL)
5519 1.1 mrg emit_insn_after (gen_comment ("End: Added by -minit-regs=1"), insert_here);
5520 1.1 mrg }
5521 1.1 mrg
5522 1.1 mrg /* Find uses of regs that are not defined on all incoming paths, and insert a
5523 1.1 mrg corresponding def at function entry.
5524 1.1 mrg Advantage : Simple.
5525 1.1 mrg Disadvantage: Potentially creates long live ranges.
5526 1.1 mrg May not catch all cases. F.i. a clobber cuts a live range in
5527 1.1 mrg the compiler and may prevent entry_lr_in from being set for a
5528 1.1 mrg reg, but the clobber does not translate to a ptx insn, so in
5529 1.1 mrg ptx there still may be an uninitialized ptx reg. See f.i.
5530 1.1 mrg gcc.c-torture/compile/20020926-1.c. */
5531 1.1 mrg
5532 1.1 mrg static void
5533 1.1 mrg workaround_uninit_method_2 (void)
5534 1.1 mrg {
5535 1.1 mrg auto_bitmap entry_pseudo_uninit;
5536 1.1 mrg {
5537 1.1 mrg auto_bitmap not_pseudo;
5538 1.1 mrg bitmap_set_range (not_pseudo, 0, LAST_VIRTUAL_REGISTER);
5539 1.1 mrg
5540 1.1 mrg bitmap entry_lr_in = DF_LR_IN (ENTRY_BLOCK_PTR_FOR_FN (cfun));
5541 1.1 mrg bitmap_and_compl (entry_pseudo_uninit, entry_lr_in, not_pseudo);
5542 1.1 mrg }
5543 1.1 mrg
5544 1.1 mrg rtx_insn *first = get_insns ();
5545 1.1 mrg rtx_insn *insert_here = NULL;
5546 1.1 mrg
5547 1.1 mrg bitmap_iterator iterator;
5548 1.1 mrg unsigned ix;
5549 1.1 mrg EXECUTE_IF_SET_IN_BITMAP (entry_pseudo_uninit, 0, ix, iterator)
5550 1.1 mrg {
5551 1.1 mrg rtx reg = regno_reg_rtx[ix];
5552 1.1 mrg gcc_assert (CONST0_RTX (GET_MODE (reg)));
5553 1.1 mrg
5554 1.1 mrg start_sequence ();
5555 1.1 mrg if (nvptx_comment && first != NULL)
5556 1.1 mrg emit_insn (gen_comment ("Start: Added by -minit-regs=2:"));
5557 1.1 mrg emit_move_insn (reg, CONST0_RTX (GET_MODE (reg)));
5558 1.1 mrg rtx_insn *inits = get_insns ();
5559 1.1 mrg end_sequence ();
5560 1.1 mrg
5561 1.1 mrg if (dump_file && (dump_flags & TDF_DETAILS))
5562 1.1 mrg for (rtx_insn *init = inits; init != NULL; init = NEXT_INSN (init))
5563 1.1 mrg fprintf (dump_file, "Missing init of reg %u inserted: insn %u\n",
5564 1.1 mrg ix, INSN_UID (init));
5565 1.1 mrg
5566 1.1 mrg if (first != NULL)
5567 1.1 mrg {
5568 1.1 mrg insert_here = emit_insn_before (inits, first);
5569 1.1 mrg first = NULL;
5570 1.1 mrg }
5571 1.1 mrg else
5572 1.1 mrg insert_here = emit_insn_after (inits, insert_here);
5573 1.1 mrg }
5574 1.1 mrg
5575 1.1 mrg if (nvptx_comment && insert_here != NULL)
5576 1.1 mrg emit_insn_after (gen_comment ("End: Added by -minit-regs=2"), insert_here);
5577 1.1 mrg }
5578 1.1 mrg
5579 1.1 mrg /* Find uses of regs that are not defined on all incoming paths, and insert a
5580 1.1 mrg corresponding def on those.
5581 1.1 mrg Advantage : Doesn't create long live ranges.
5582 1.1 mrg Disadvantage: More complex, and potentially also more defs. */
5583 1.1 mrg
5584 1.1 mrg static void
5585 1.1 mrg workaround_uninit_method_3 (void)
5586 1.1 mrg {
5587 1.1 mrg auto_bitmap not_pseudo;
5588 1.1 mrg bitmap_set_range (not_pseudo, 0, LAST_VIRTUAL_REGISTER);
5589 1.1 mrg
5590 1.1 mrg basic_block bb;
5591 1.1 mrg FOR_EACH_BB_FN (bb, cfun)
5592 1.1 mrg {
5593 1.1 mrg if (single_pred_p (bb))
5594 1.1 mrg continue;
5595 1.1 mrg
5596 1.1 mrg auto_bitmap bb_pseudo_uninit;
5597 1.1 mrg bitmap_and_compl (bb_pseudo_uninit, DF_LIVE_IN (bb), DF_MIR_IN (bb));
5598 1.1 mrg bitmap_and_compl_into (bb_pseudo_uninit, not_pseudo);
5599 1.1 mrg
5600 1.1 mrg bitmap_iterator iterator;
5601 1.1 mrg unsigned ix;
5602 1.1 mrg EXECUTE_IF_SET_IN_BITMAP (bb_pseudo_uninit, 0, ix, iterator)
5603 1.1 mrg {
5604 1.1 mrg bool have_false = false;
5605 1.1 mrg bool have_true = false;
5606 1.1 mrg
5607 1.1 mrg edge e;
5608 1.1 mrg edge_iterator ei;
5609 1.1 mrg FOR_EACH_EDGE (e, ei, bb->preds)
5610 1.1 mrg {
5611 1.1 mrg if (bitmap_bit_p (DF_LIVE_OUT (e->src), ix))
5612 1.1 mrg have_true = true;
5613 1.1 mrg else
5614 1.1 mrg have_false = true;
5615 1.1 mrg }
5616 1.1 mrg if (have_false ^ have_true)
5617 1.1 mrg continue;
5618 1.1 mrg
5619 1.1 mrg FOR_EACH_EDGE (e, ei, bb->preds)
5620 1.1 mrg {
5621 1.1 mrg if (bitmap_bit_p (DF_LIVE_OUT (e->src), ix))
5622 1.1 mrg continue;
5623 1.1 mrg
5624 1.1 mrg rtx reg = regno_reg_rtx[ix];
5625 1.1 mrg gcc_assert (CONST0_RTX (GET_MODE (reg)));
5626 1.1 mrg
5627 1.1 mrg start_sequence ();
5628 1.1 mrg emit_move_insn (reg, CONST0_RTX (GET_MODE (reg)));
5629 1.1 mrg rtx_insn *inits = get_insns ();
5630 1.1 mrg end_sequence ();
5631 1.1 mrg
5632 1.1 mrg if (dump_file && (dump_flags & TDF_DETAILS))
5633 1.1 mrg for (rtx_insn *init = inits; init != NULL;
5634 1.1 mrg init = NEXT_INSN (init))
5635 1.1 mrg fprintf (dump_file,
5636 1.1 mrg "Missing init of reg %u inserted on edge: %d -> %d:"
5637 1.1 mrg " insn %u\n", ix, e->src->index, e->dest->index,
5638 1.1 mrg INSN_UID (init));
5639 1.1 mrg
5640 1.1 mrg insert_insn_on_edge (inits, e);
5641 1.1 mrg }
5642 1.1 mrg }
5643 1.1 mrg }
5644 1.1 mrg
5645 1.1 mrg if (nvptx_comment)
5646 1.1 mrg FOR_EACH_BB_FN (bb, cfun)
5647 1.1 mrg {
5648 1.1 mrg if (single_pred_p (bb))
5649 1.1 mrg continue;
5650 1.1 mrg
5651 1.1 mrg edge e;
5652 1.1 mrg edge_iterator ei;
5653 1.1 mrg FOR_EACH_EDGE (e, ei, bb->preds)
5654 1.1 mrg {
5655 1.1 mrg if (e->insns.r == NULL_RTX)
5656 1.1 mrg continue;
5657 1.1 mrg start_sequence ();
5658 1.1 mrg emit_insn (gen_comment ("Start: Added by -minit-regs=3:"));
5659 1.1 mrg emit_insn (e->insns.r);
5660 1.1 mrg emit_insn (gen_comment ("End: Added by -minit-regs=3:"));
5661 1.1 mrg e->insns.r = get_insns ();
5662 1.1 mrg end_sequence ();
5663 1.1 mrg }
5664 1.1 mrg }
5665 1.1 mrg
5666 1.1 mrg commit_edge_insertions ();
5667 1.1 mrg }
5668 1.1 mrg
5669 1.1 mrg static void
5670 1.1 mrg workaround_uninit (void)
5671 1.1 mrg {
5672 1.1 mrg switch (nvptx_init_regs)
5673 1.1 mrg {
5674 1.1 mrg case 0:
5675 1.1 mrg /* Skip. */
5676 1.1 mrg break;
5677 1.1 mrg case 1:
5678 1.1 mrg workaround_uninit_method_1 ();
5679 1.1 mrg break;
5680 1.1 mrg case 2:
5681 1.1 mrg workaround_uninit_method_2 ();
5682 1.1 mrg break;
5683 1.1 mrg case 3:
5684 1.1 mrg workaround_uninit_method_3 ();
5685 1.1 mrg break;
5686 1.1 mrg default:
5687 1.1 mrg gcc_unreachable ();
5688 1.1 mrg }
5689 1.1 mrg }
5690 1.1 mrg
5691 1.1 mrg /* PTX-specific reorganization
5692 1.1 mrg - Split blocks at fork and join instructions
5693 1.1 mrg - Compute live registers
5694 1.1 mrg - Mark now-unused registers, so function begin doesn't declare
5695 1.1 mrg unused registers.
5696 1.1 mrg - Insert state propagation when entering partitioned mode
5697 1.1 mrg - Insert neutering instructions when in single mode
5698 1.1 mrg - Replace subregs with suitable sequences.
5699 1.1 mrg */
5700 1.1 mrg
5701 1.1 mrg static void
5702 1.1 mrg nvptx_reorg (void)
5703 1.1 mrg {
5704 1.1 mrg /* We are freeing block_for_insn in the toplev to keep compatibility
5705 1.1 mrg with old MDEP_REORGS that are not CFG based. Recompute it now. */
5706 1.1 mrg compute_bb_for_insn ();
5707 1.1 mrg
5708 1.1 mrg thread_prologue_and_epilogue_insns ();
5709 1.1 mrg
5710 1.1 mrg /* Split blocks and record interesting unspecs. */
5711 1.1 mrg bb_insn_map_t bb_insn_map;
5712 1.1 mrg
5713 1.1 mrg nvptx_split_blocks (&bb_insn_map);
5714 1.1 mrg
5715 1.1 mrg /* Compute live regs */
5716 1.1 mrg df_clear_flags (DF_LR_RUN_DCE);
5717 1.1 mrg df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
5718 1.1 mrg df_live_add_problem ();
5719 1.1 mrg df_live_set_all_dirty ();
5720 1.1 mrg if (nvptx_init_regs == 3)
5721 1.1 mrg df_mir_add_problem ();
5722 1.1 mrg df_analyze ();
5723 1.1 mrg regstat_init_n_sets_and_refs ();
5724 1.1 mrg
5725 1.1 mrg if (dump_file)
5726 1.1 mrg df_dump (dump_file);
5727 1.1 mrg
5728 1.1 mrg /* Mark unused regs as unused. */
5729 1.1 mrg int max_regs = max_reg_num ();
5730 1.1 mrg for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
5731 1.1 mrg if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
5732 1.1 mrg regno_reg_rtx[i] = const0_rtx;
5733 1.1 mrg
5734 1.1 mrg workaround_uninit ();
5735 1.1 mrg
5736 1.1 mrg /* Determine launch dimensions of the function. If it is not an
5737 1.1 mrg offloaded function (i.e. this is a regular compiler), the
5738 1.1 mrg function has no neutering. */
5739 1.1 mrg tree attr = oacc_get_fn_attrib (current_function_decl);
5740 1.1 mrg if (attr)
5741 1.1 mrg {
5742 1.1 mrg /* If we determined this mask before RTL expansion, we could
5743 1.1 mrg elide emission of some levels of forks and joins. */
5744 1.1 mrg offload_attrs oa;
5745 1.1 mrg
5746 1.1 mrg populate_offload_attrs (&oa);
5747 1.1 mrg
5748 1.1 mrg /* If there is worker neutering, there must be vector
5749 1.1 mrg neutering. Otherwise the hardware will fail. */
5750 1.1 mrg gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
5751 1.1 mrg || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
5752 1.1 mrg
5753 1.1 mrg /* Discover & process partitioned regions. */
5754 1.1 mrg parallel *pars = nvptx_discover_pars (&bb_insn_map);
5755 1.1 mrg nvptx_process_pars (pars);
5756 1.1 mrg nvptx_neuter_pars (pars, oa.mask, 0);
5757 1.1 mrg delete pars;
5758 1.1 mrg }
5759 1.1 mrg
5760 1.1 mrg /* Replace subregs. */
5761 1.1 mrg nvptx_reorg_subreg ();
5762 1.1 mrg
5763 1.1 mrg if (TARGET_UNIFORM_SIMT)
5764 1.1 mrg nvptx_reorg_uniform_simt ();
5765 1.1 mrg
5766 1.1 mrg #if WORKAROUND_PTXJIT_BUG_2
5767 1.1 mrg prevent_branch_around_nothing ();
5768 1.1 mrg #endif
5769 1.1 mrg
5770 1.1 mrg #ifdef WORKAROUND_PTXJIT_BUG_3
5771 1.1 mrg workaround_barsyncs ();
5772 1.1 mrg #endif
5773 1.1 mrg
5774 1.1 mrg regstat_free_n_sets_and_refs ();
5775 1.1 mrg
5776 1.1 mrg df_finish_pass (true);
5777 1.1 mrg }
5778 1.1 mrg
5779 1.1 mrg /* Handle a "kernel" attribute; arguments as in
5781 1.1 mrg struct attribute_spec.handler. */
5782 1.1 mrg
5783 1.1 mrg static tree
5784 1.1 mrg nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5785 1.1 mrg int ARG_UNUSED (flags), bool *no_add_attrs)
5786 1.1 mrg {
5787 1.1 mrg tree decl = *node;
5788 1.1 mrg
5789 1.1 mrg if (TREE_CODE (decl) != FUNCTION_DECL)
5790 1.1 mrg {
5791 1.1 mrg error ("%qE attribute only applies to functions", name);
5792 1.1 mrg *no_add_attrs = true;
5793 1.1 mrg }
5794 1.1 mrg else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
5795 1.1 mrg {
5796 1.1 mrg error ("%qE attribute requires a void return type", name);
5797 1.1 mrg *no_add_attrs = true;
5798 1.1 mrg }
5799 1.1 mrg
5800 1.1 mrg return NULL_TREE;
5801 1.1 mrg }
5802 1.1 mrg
5803 1.1 mrg /* Handle a "shared" attribute; arguments as in
5804 1.1 mrg struct attribute_spec.handler. */
5805 1.1 mrg
5806 1.1 mrg static tree
5807 1.1 mrg nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5808 1.1 mrg int ARG_UNUSED (flags), bool *no_add_attrs)
5809 1.1 mrg {
5810 1.1 mrg tree decl = *node;
5811 1.1 mrg
5812 1.1 mrg if (TREE_CODE (decl) != VAR_DECL)
5813 1.1 mrg {
5814 1.1 mrg error ("%qE attribute only applies to variables", name);
5815 1.1 mrg *no_add_attrs = true;
5816 1.1 mrg }
5817 1.1 mrg else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
5818 1.1 mrg {
5819 1.1 mrg error ("%qE attribute not allowed with auto storage class", name);
5820 1.1 mrg *no_add_attrs = true;
5821 1.1 mrg }
5822 1.1 mrg
5823 1.1 mrg return NULL_TREE;
5824 1.1 mrg }
5825 1.1 mrg
5826 1.1 mrg /* Table of valid machine attributes. */
5827 1.1 mrg static const struct attribute_spec nvptx_attribute_table[] =
5828 1.1 mrg {
5829 1.1 mrg /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
5830 1.1 mrg affects_type_identity, handler, exclude } */
5831 1.1 mrg { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute,
5832 1.1 mrg NULL },
5833 1.1 mrg { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute,
5834 1.1 mrg NULL },
5835 1.1 mrg { NULL, 0, 0, false, false, false, false, NULL, NULL }
5836 1.1 mrg };
5837 1.1 mrg
5838 1.1 mrg /* Limit vector alignments to BIGGEST_ALIGNMENT. */
5840 1.1 mrg
5841 1.1 mrg static HOST_WIDE_INT
5842 1.1 mrg nvptx_vector_alignment (const_tree type)
5843 1.1 mrg {
5844 1.1 mrg unsigned HOST_WIDE_INT align;
5845 1.1 mrg tree size = TYPE_SIZE (type);
5846 1.1 mrg
5847 1.1 mrg /* Ensure align is not bigger than BIGGEST_ALIGNMENT. */
5848 1.1 mrg if (tree_fits_uhwi_p (size))
5849 1.1 mrg {
5850 1.1 mrg align = tree_to_uhwi (size);
5851 1.1 mrg align = MIN (align, BIGGEST_ALIGNMENT);
5852 1.1 mrg }
5853 1.1 mrg else
5854 1.1 mrg align = BIGGEST_ALIGNMENT;
5855 1.1 mrg
5856 1.1 mrg /* Ensure align is not smaller than mode alignment. */
5857 1.1 mrg align = MAX (align, GET_MODE_ALIGNMENT (TYPE_MODE (type)));
5858 1.1 mrg
5859 1.1 mrg return align;
5860 1.1 mrg }
5861 1.1 mrg
5862 1.1 mrg /* Indicate that INSN cannot be duplicated. */
5863 1.1 mrg
5864 1.1 mrg static bool
5865 1.1 mrg nvptx_cannot_copy_insn_p (rtx_insn *insn)
5866 1.1 mrg {
5867 1.1 mrg switch (recog_memoized (insn))
5868 1.1 mrg {
5869 1.1 mrg case CODE_FOR_nvptx_shufflesi:
5870 1.1 mrg case CODE_FOR_nvptx_shufflesf:
5871 1.1 mrg case CODE_FOR_nvptx_barsync:
5872 1.1 mrg case CODE_FOR_nvptx_fork:
5873 1.1 mrg case CODE_FOR_nvptx_forked:
5874 1.1 mrg case CODE_FOR_nvptx_joining:
5875 1.1 mrg case CODE_FOR_nvptx_join:
5876 1.1 mrg return true;
5877 1.1 mrg default:
5878 1.1 mrg return false;
5879 1.1 mrg }
5880 1.1 mrg }
5881 1.1 mrg
5882 1.1 mrg /* Section anchors do not work. Initialization for flag_section_anchor
5883 1.1 mrg probes the existence of the anchoring target hooks and prevents
5884 1.1 mrg anchoring if they don't exist. However, we may be being used with
5885 1.1 mrg a host-side compiler that does support anchoring, and hence see
5886 1.1 mrg the anchor flag set (as it's not recalculated). So provide an
5887 1.1 mrg implementation denying anchoring. */
5888 1.1 mrg
5889 1.1 mrg static bool
5890 1.1 mrg nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
5891 1.1 mrg {
5892 1.1 mrg return false;
5893 1.1 mrg }
5894 1.1 mrg
5895 1.1 mrg /* Record a symbol for mkoffload to enter into the mapping table. */
5897 1.1 mrg
5898 1.1 mrg static void
5899 1.1 mrg nvptx_record_offload_symbol (tree decl)
5900 1.1 mrg {
5901 1.1 mrg switch (TREE_CODE (decl))
5902 1.1 mrg {
5903 1.1 mrg case VAR_DECL:
5904 1.1 mrg fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
5905 1.1 mrg IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
5906 1.1 mrg break;
5907 1.1 mrg
5908 1.1 mrg case FUNCTION_DECL:
5909 1.1 mrg {
5910 1.1 mrg tree attr = oacc_get_fn_attrib (decl);
5911 1.1 mrg /* OpenMP offloading does not set this attribute. */
5912 1.1 mrg tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
5913 1.1 mrg
5914 1.1 mrg fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
5915 1.1 mrg IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
5916 1.1 mrg
5917 1.1 mrg for (; dims; dims = TREE_CHAIN (dims))
5918 1.1 mrg {
5919 1.1 mrg int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
5920 1.1 mrg
5921 1.1 mrg gcc_assert (!TREE_PURPOSE (dims));
5922 1.1 mrg fprintf (asm_out_file, ", %#x", size);
5923 1.1 mrg }
5924 1.1 mrg
5925 1.1 mrg fprintf (asm_out_file, "\n");
5926 1.1 mrg }
5927 1.1 mrg break;
5928 1.1 mrg
5929 1.1 mrg default:
5930 1.1 mrg gcc_unreachable ();
5931 1.1 mrg }
5932 1.1 mrg }
5933 1.1 mrg
5934 1.1 mrg /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
5935 1.1 mrg at the start of a file. */
5936 1.1 mrg
5937 1.1 mrg static void
5938 1.1 mrg nvptx_file_start (void)
5939 1.1 mrg {
5940 1.1 mrg fputs ("// BEGIN PREAMBLE\n", asm_out_file);
5941 1.1 mrg
5942 1.1 mrg fputs ("\t.version\t", asm_out_file);
5943 1.1 mrg fputs (ptx_version_to_string ((enum ptx_version)ptx_version_option),
5944 1.1 mrg asm_out_file);
5945 1.1 mrg fputs ("\n", asm_out_file);
5946 1.1 mrg
5947 1.1 mrg fputs ("\t.target\tsm_", asm_out_file);
5948 1.1 mrg fputs (sm_version_to_string ((enum ptx_isa)ptx_isa_option),
5949 1.1 mrg asm_out_file);
5950 1.1 mrg fputs ("\n", asm_out_file);
5951 1.1 mrg
5952 1.1 mrg fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
5953 1.1 mrg
5954 1.1 mrg fputs ("// END PREAMBLE\n", asm_out_file);
5955 1.1 mrg }
5956 1.1 mrg
5957 1.1 mrg /* Emit a declaration for a worker and vector-level buffer in .shared
5958 1.1 mrg memory. */
5959 1.1 mrg
5960 1.1 mrg static void
5961 1.1 mrg write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
5962 1.1 mrg {
5963 1.1 mrg const char *name = XSTR (sym, 0);
5964 1.1 mrg
5965 1.1 mrg write_var_marker (file, true, false, name);
5966 1.1 mrg fprintf (file, ".shared .align %d .u8 %s[%d];\n",
5967 1.1 mrg align, name, size);
5968 1.1 mrg }
5969 1.1 mrg
5970 1.1 mrg /* Write out the function declarations we've collected and declare storage
5971 1.1 mrg for the broadcast buffer. */
5972 1.1 mrg
5973 1.1 mrg static void
5974 1.1 mrg nvptx_file_end (void)
5975 1.1 mrg {
5976 1.1 mrg hash_table<tree_hasher>::iterator iter;
5977 1.1 mrg tree decl;
5978 1.1 mrg FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
5979 1.1 mrg nvptx_record_fndecl (decl);
5980 1.1 mrg fputs (func_decls.str().c_str(), asm_out_file);
5981 1.1 mrg
5982 1.1 mrg if (oacc_bcast_size)
5983 1.1 mrg write_shared_buffer (asm_out_file, oacc_bcast_sym,
5984 1.1 mrg oacc_bcast_align, oacc_bcast_size);
5985 1.1 mrg
5986 1.1 mrg if (worker_red_size)
5987 1.1 mrg write_shared_buffer (asm_out_file, worker_red_sym,
5988 1.1 mrg worker_red_align, worker_red_size);
5989 1.1 mrg
5990 1.1 mrg if (vector_red_size)
5991 1.1 mrg write_shared_buffer (asm_out_file, vector_red_sym,
5992 1.1 mrg vector_red_align, vector_red_size);
5993 1.1 mrg
5994 1.1 mrg if (gang_private_shared_size)
5995 1.1 mrg write_shared_buffer (asm_out_file, gang_private_shared_sym,
5996 1.1 mrg gang_private_shared_align, gang_private_shared_size);
5997 1.1 mrg
5998 1.1 mrg if (need_softstack_decl)
5999 1.1 mrg {
6000 1.1 mrg write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
6001 1.1 mrg /* 32 is the maximum number of warps in a block. Even though it's an
6002 1.1 mrg external declaration, emit the array size explicitly; otherwise, it
6003 1.1 mrg may fail at PTX JIT time if the definition is later in link order. */
6004 1.1 mrg fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
6005 1.1 mrg POINTER_SIZE);
6006 1.1 mrg }
6007 1.1 mrg if (need_unisimt_decl)
6008 1.1 mrg {
6009 1.1 mrg write_var_marker (asm_out_file, false, true, "__nvptx_uni");
6010 1.1 mrg fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
6011 1.1 mrg }
6012 1.1 mrg }
6013 1.1 mrg
6014 1.1 mrg /* Expander for the shuffle builtins. */
6015 1.1 mrg
6016 1.1 mrg static rtx
6017 1.1 mrg nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
6018 1.1 mrg {
6019 1.1 mrg if (ignore)
6020 1.1 mrg return target;
6021 1.1 mrg
6022 1.1 mrg rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
6023 1.1 mrg NULL_RTX, mode, EXPAND_NORMAL);
6024 1.1 mrg if (!REG_P (src))
6025 1.1 mrg src = copy_to_mode_reg (mode, src);
6026 1.1 mrg
6027 1.1 mrg rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
6028 1.1 mrg NULL_RTX, SImode, EXPAND_NORMAL);
6029 1.1 mrg rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
6030 1.1 mrg NULL_RTX, SImode, EXPAND_NORMAL);
6031 1.1 mrg
6032 1.1 mrg if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
6033 1.1 mrg idx = copy_to_mode_reg (SImode, idx);
6034 1.1 mrg
6035 1.1 mrg rtx pat = nvptx_gen_shuffle (target, src, idx,
6036 1.1 mrg (nvptx_shuffle_kind) INTVAL (op));
6037 1.1 mrg if (pat)
6038 1.1 mrg emit_insn (pat);
6039 1.1 mrg
6040 1.1 mrg return target;
6041 1.1 mrg }
6042 1.1 mrg
6043 1.1 mrg const char *
6044 1.1 mrg nvptx_output_red_partition (rtx dst, rtx offset)
6045 1.1 mrg {
6046 1.1 mrg const char *zero_offset = "\t\tmov.u64\t%%r%d, %%r%d; // vred buffer\n";
6047 1.1 mrg const char *with_offset = "\t\tadd.u64\t%%r%d, %%r%d, %d; // vred buffer\n";
6048 1.1 mrg
6049 1.1 mrg if (offset == const0_rtx)
6050 1.1 mrg fprintf (asm_out_file, zero_offset, REGNO (dst),
6051 1.1 mrg REGNO (cfun->machine->red_partition));
6052 1.1 mrg else
6053 1.1 mrg fprintf (asm_out_file, with_offset, REGNO (dst),
6054 1.1 mrg REGNO (cfun->machine->red_partition), UINTVAL (offset));
6055 1.1 mrg
6056 1.1 mrg return "";
6057 1.1 mrg }
6058 1.1 mrg
6059 1.1 mrg /* Shared-memory reduction address expander. */
6060 1.1 mrg
6061 1.1 mrg static rtx
6062 1.1 mrg nvptx_expand_shared_addr (tree exp, rtx target,
6063 1.1 mrg machine_mode ARG_UNUSED (mode), int ignore,
6064 1.1 mrg int vector)
6065 1.1 mrg {
6066 1.1 mrg if (ignore)
6067 1.1 mrg return target;
6068 1.1 mrg
6069 1.1 mrg unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
6070 1.1 mrg unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
6071 1.1 mrg unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
6072 1.1 mrg rtx addr = worker_red_sym;
6073 1.1 mrg
6074 1.1 mrg if (vector)
6075 1.1 mrg {
6076 1.1 mrg offload_attrs oa;
6077 1.1 mrg
6078 1.1 mrg populate_offload_attrs (&oa);
6079 1.1 mrg
6080 1.1 mrg unsigned int psize = ROUND_UP (size + offset, align);
6081 1.1 mrg unsigned int pnum = nvptx_mach_max_workers ();
6082 1.1 mrg vector_red_partition = MAX (vector_red_partition, psize);
6083 1.1 mrg vector_red_size = MAX (vector_red_size, psize * pnum);
6084 1.1 mrg vector_red_align = MAX (vector_red_align, align);
6085 1.1 mrg
6086 1.1 mrg if (cfun->machine->red_partition == NULL)
6087 1.1 mrg cfun->machine->red_partition = gen_reg_rtx (Pmode);
6088 1.1 mrg
6089 1.1 mrg addr = gen_reg_rtx (Pmode);
6090 1.1 mrg emit_insn (gen_nvptx_red_partition (addr, GEN_INT (offset)));
6091 1.1 mrg }
6092 1.1 mrg else
6093 1.1 mrg {
6094 1.1 mrg worker_red_align = MAX (worker_red_align, align);
6095 1.1 mrg worker_red_size = MAX (worker_red_size, size + offset);
6096 1.1 mrg
6097 1.1 mrg if (offset)
6098 1.1 mrg {
6099 1.1 mrg addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
6100 1.1 mrg addr = gen_rtx_CONST (Pmode, addr);
6101 1.1 mrg }
6102 1.1 mrg }
6103 1.1 mrg
6104 1.1 mrg emit_move_insn (target, addr);
6105 1.1 mrg return target;
6106 1.1 mrg }
6107 1.1 mrg
6108 1.1 mrg /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
6109 1.1 mrg not require taking the address of any object, other than the memory
6110 1.1 mrg cell being operated on. */
6111 1.1 mrg
6112 1.1 mrg static rtx
6113 1.1 mrg nvptx_expand_cmp_swap (tree exp, rtx target,
6114 1.1 mrg machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
6115 1.1 mrg {
6116 1.1 mrg machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
6117 1.1 mrg
6118 1.1 mrg if (!target)
6119 1.1 mrg target = gen_reg_rtx (mode);
6120 1.1 mrg
6121 1.1 mrg rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
6122 1.1 mrg NULL_RTX, Pmode, EXPAND_NORMAL);
6123 1.1 mrg rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
6124 1.1 mrg NULL_RTX, mode, EXPAND_NORMAL);
6125 1.1 mrg rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
6126 1.1 mrg NULL_RTX, mode, EXPAND_NORMAL);
6127 1.1 mrg rtx pat;
6128 1.1 mrg
6129 1.1 mrg mem = gen_rtx_MEM (mode, mem);
6130 1.1 mrg if (!REG_P (cmp))
6131 1.1 mrg cmp = copy_to_mode_reg (mode, cmp);
6132 1.1 mrg if (!REG_P (src))
6133 1.1 mrg src = copy_to_mode_reg (mode, src);
6134 1.1 mrg
6135 1.1 mrg if (mode == SImode)
6136 1.1 mrg pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
6137 1.1 mrg else
6138 1.1 mrg pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
6139 1.1 mrg
6140 1.1 mrg emit_insn (pat);
6141 1.1 mrg
6142 1.1 mrg return target;
6143 1.1 mrg }
6144 1.1 mrg
6145 1.1 mrg
6146 1.1 mrg /* Codes for all the NVPTX builtins. */
6147 1.1 mrg enum nvptx_builtins
6148 1.1 mrg {
6149 1.1 mrg NVPTX_BUILTIN_SHUFFLE,
6150 1.1 mrg NVPTX_BUILTIN_SHUFFLELL,
6151 1.1 mrg NVPTX_BUILTIN_WORKER_ADDR,
6152 1.1 mrg NVPTX_BUILTIN_VECTOR_ADDR,
6153 1.1 mrg NVPTX_BUILTIN_CMP_SWAP,
6154 1.1 mrg NVPTX_BUILTIN_CMP_SWAPLL,
6155 1.1 mrg NVPTX_BUILTIN_MEMBAR_GL,
6156 1.1 mrg NVPTX_BUILTIN_MEMBAR_CTA,
6157 1.1 mrg NVPTX_BUILTIN_MAX
6158 1.1 mrg };
6159 1.1 mrg
6160 1.1 mrg static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
6161 1.1 mrg
6162 1.1 mrg /* Return the NVPTX builtin for CODE. */
6163 1.1 mrg
6164 1.1 mrg static tree
6165 1.1 mrg nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
6166 1.1 mrg {
6167 1.1 mrg if (code >= NVPTX_BUILTIN_MAX)
6168 1.1 mrg return error_mark_node;
6169 1.1 mrg
6170 1.1 mrg return nvptx_builtin_decls[code];
6171 1.1 mrg }
6172 1.1 mrg
6173 1.1 mrg /* Set up all builtin functions for this target. */
6174 1.1 mrg
6175 1.1 mrg static void
6176 1.1 mrg nvptx_init_builtins (void)
6177 1.1 mrg {
6178 1.1 mrg #define DEF(ID, NAME, T) \
6179 1.1 mrg (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
6180 1.1 mrg = add_builtin_function ("__builtin_nvptx_" NAME, \
6181 1.1 mrg build_function_type_list T, \
6182 1.1 mrg NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
6183 1.1 mrg #define ST sizetype
6184 1.1 mrg #define UINT unsigned_type_node
6185 1.1 mrg #define LLUINT long_long_unsigned_type_node
6186 1.1 mrg #define PTRVOID ptr_type_node
6187 1.1 mrg #define VOID void_type_node
6188 1.1 mrg
6189 1.1 mrg DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
6190 1.1 mrg DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
6191 1.1 mrg DEF (WORKER_ADDR, "worker_addr",
6192 1.1 mrg (PTRVOID, ST, UINT, UINT, NULL_TREE));
6193 1.1 mrg DEF (VECTOR_ADDR, "vector_addr",
6194 1.1 mrg (PTRVOID, ST, UINT, UINT, NULL_TREE));
6195 1.1 mrg DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
6196 1.1 mrg DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
6197 1.1 mrg DEF (MEMBAR_GL, "membar_gl", (VOID, VOID, NULL_TREE));
6198 1.1 mrg DEF (MEMBAR_CTA, "membar_cta", (VOID, VOID, NULL_TREE));
6199 1.1 mrg
6200 1.1 mrg #undef DEF
6201 1.1 mrg #undef ST
6202 1.1 mrg #undef UINT
6203 1.1 mrg #undef LLUINT
6204 1.1 mrg #undef PTRVOID
6205 1.1 mrg }
6206 1.1 mrg
6207 1.1 mrg /* Expand an expression EXP that calls a built-in function,
6208 1.1 mrg with result going to TARGET if that's convenient
6209 1.1 mrg (and in mode MODE if that's convenient).
6210 1.1 mrg SUBTARGET may be used as the target for computing one of EXP's operands.
6211 1.1 mrg IGNORE is nonzero if the value is to be ignored. */
6212 1.1 mrg
6213 1.1 mrg static rtx
6214 1.1 mrg nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
6215 1.1 mrg machine_mode mode, int ignore)
6216 1.1 mrg {
6217 1.1 mrg tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
6218 1.1 mrg switch (DECL_MD_FUNCTION_CODE (fndecl))
6219 1.1 mrg {
6220 1.1 mrg case NVPTX_BUILTIN_SHUFFLE:
6221 1.1 mrg case NVPTX_BUILTIN_SHUFFLELL:
6222 1.1 mrg return nvptx_expand_shuffle (exp, target, mode, ignore);
6223 1.1 mrg
6224 1.1 mrg case NVPTX_BUILTIN_WORKER_ADDR:
6225 1.1 mrg return nvptx_expand_shared_addr (exp, target, mode, ignore, false);
6226 1.1 mrg
6227 1.1 mrg case NVPTX_BUILTIN_VECTOR_ADDR:
6228 1.1 mrg return nvptx_expand_shared_addr (exp, target, mode, ignore, true);
6229 1.1 mrg
6230 1.1 mrg case NVPTX_BUILTIN_CMP_SWAP:
6231 1.1 mrg case NVPTX_BUILTIN_CMP_SWAPLL:
6232 1.1 mrg return nvptx_expand_cmp_swap (exp, target, mode, ignore);
6233 1.1 mrg
6234 1.1 mrg case NVPTX_BUILTIN_MEMBAR_GL:
6235 1.1 mrg emit_insn (gen_nvptx_membar_gl ());
6236 1.1 mrg return NULL_RTX;
6237 1.1 mrg
6238 1.1 mrg case NVPTX_BUILTIN_MEMBAR_CTA:
6239 1.1 mrg emit_insn (gen_nvptx_membar_cta ());
6240 1.1 mrg return NULL_RTX;
6241 1.1 mrg
6242 1.1 mrg default: gcc_unreachable ();
6243 1.1 mrg }
6244 1.1 mrg }
6245 1.1 mrg
6246 1.1 mrg /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
6247 1.1 mrg
6248 1.1 mrg static int
6249 1.1 mrg nvptx_simt_vf ()
6250 1.1 mrg {
6251 1.1 mrg return PTX_WARP_SIZE;
6252 1.1 mrg }
6253 1.1 mrg
6254 1.1 mrg /* Return 1 if TRAIT NAME is present in the OpenMP context's
6255 1.1 mrg device trait set, return 0 if not present in any OpenMP context in the
6256 1.1 mrg whole translation unit, or -1 if not present in the current OpenMP context
6257 1.1 mrg but might be present in another OpenMP context in the same TU. */
6258 1.1 mrg
6259 1.1 mrg int
6260 1.1 mrg nvptx_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
6261 1.1 mrg const char *name)
6262 1.1 mrg {
6263 1.1 mrg switch (trait)
6264 1.1 mrg {
6265 1.1 mrg case omp_device_kind:
6266 1.1 mrg return strcmp (name, "gpu") == 0;
6267 1.1 mrg case omp_device_arch:
6268 1.1 mrg return strcmp (name, "nvptx") == 0;
6269 1.1 mrg case omp_device_isa:
6270 1.1 mrg #define NVPTX_SM(XX, SEP) \
6271 1.1 mrg { \
6272 1.1 mrg if (strcmp (name, "sm_" #XX) == 0) \
6273 1.1 mrg return ptx_isa_option == PTX_ISA_SM ## XX; \
6274 1.1 mrg }
6275 1.1 mrg #include "nvptx-sm.def"
6276 1.1 mrg #undef NVPTX_SM
6277 1.1 mrg return 0;
6278 1.1 mrg default:
6279 1.1 mrg gcc_unreachable ();
6280 1.1 mrg }
6281 1.1 mrg }
6282 1.1 mrg
6283 1.1 mrg static bool
6284 1.1 mrg nvptx_welformed_vector_length_p (int l)
6285 1.1 mrg {
6286 1.1 mrg gcc_assert (l > 0);
6287 1.1 mrg return l % PTX_WARP_SIZE == 0;
6288 1.1 mrg }
6289 1.1 mrg
6290 1.1 mrg static void
6291 1.1 mrg nvptx_apply_dim_limits (int dims[])
6292 1.1 mrg {
6293 1.1 mrg /* Check that the vector_length is not too large. */
6294 1.1 mrg if (dims[GOMP_DIM_VECTOR] > PTX_MAX_VECTOR_LENGTH)
6295 1.1 mrg dims[GOMP_DIM_VECTOR] = PTX_MAX_VECTOR_LENGTH;
6296 1.1 mrg
6297 1.1 mrg /* Check that the number of workers is not too large. */
6298 1.1 mrg if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
6299 1.1 mrg dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
6300 1.1 mrg
6301 1.1 mrg /* Ensure that num_worker * vector_length <= cta size. */
6302 1.1 mrg if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
6303 1.1 mrg && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE)
6304 1.1 mrg dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
6305 1.1 mrg
6306 1.1 mrg /* If we need a per-worker barrier ... . */
6307 1.1 mrg if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
6308 1.1 mrg && dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
6309 1.1 mrg /* Don't use more barriers than available. */
6310 1.1 mrg dims[GOMP_DIM_WORKER] = MIN (dims[GOMP_DIM_WORKER],
6311 1.1 mrg PTX_NUM_PER_WORKER_BARRIERS);
6312 1.1 mrg }
6313 1.1 mrg
6314 1.1 mrg /* Return true if FNDECL contains calls to vector-partitionable routines. */
6315 1.1 mrg
6316 1.1 mrg static bool
6317 1.1 mrg has_vector_partitionable_routine_calls_p (tree fndecl)
6318 1.1 mrg {
6319 1.1 mrg if (!fndecl)
6320 1.1 mrg return false;
6321 1.1 mrg
6322 1.1 mrg basic_block bb;
6323 1.1 mrg FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (fndecl))
6324 1.1 mrg for (gimple_stmt_iterator i = gsi_start_bb (bb); !gsi_end_p (i);
6325 1.1 mrg gsi_next_nondebug (&i))
6326 1.1 mrg {
6327 1.1 mrg gimple *stmt = gsi_stmt (i);
6328 1.1 mrg if (gimple_code (stmt) != GIMPLE_CALL)
6329 1.1 mrg continue;
6330 1.1 mrg
6331 1.1 mrg tree callee = gimple_call_fndecl (stmt);
6332 1.1 mrg if (!callee)
6333 1.1 mrg continue;
6334 1.1 mrg
6335 1.1 mrg tree attrs = oacc_get_fn_attrib (callee);
6336 1.1 mrg if (attrs == NULL_TREE)
6337 1.1 mrg return false;
6338 1.1 mrg
6339 1.1 mrg int partition_level = oacc_fn_attrib_level (attrs);
6340 1.1 mrg bool seq_routine_p = partition_level == GOMP_DIM_MAX;
6341 1.1 mrg if (!seq_routine_p)
6342 1.1 mrg return true;
6343 1.1 mrg }
6344 1.1 mrg
6345 1.1 mrg return false;
6346 1.1 mrg }
6347 1.1 mrg
6348 1.1 mrg /* As nvptx_goacc_validate_dims, but does not return bool to indicate whether
6349 1.1 mrg DIMS has changed. */
6350 1.1 mrg
6351 1.1 mrg static void
6352 1.1 mrg nvptx_goacc_validate_dims_1 (tree decl, int dims[], int fn_level, unsigned used)
6353 1.1 mrg {
6354 1.1 mrg bool oacc_default_dims_p = false;
6355 1.1 mrg bool oacc_min_dims_p = false;
6356 1.1 mrg bool offload_region_p = false;
6357 1.1 mrg bool routine_p = false;
6358 1.1 mrg bool routine_seq_p = false;
6359 1.1 mrg int default_vector_length = -1;
6360 1.1 mrg
6361 1.1 mrg if (decl == NULL_TREE)
6362 1.1 mrg {
6363 1.1 mrg if (fn_level == -1)
6364 1.1 mrg oacc_default_dims_p = true;
6365 1.1 mrg else if (fn_level == -2)
6366 1.1 mrg oacc_min_dims_p = true;
6367 1.1 mrg else
6368 1.1 mrg gcc_unreachable ();
6369 1.1 mrg }
6370 1.1 mrg else if (fn_level == -1)
6371 1.1 mrg offload_region_p = true;
6372 1.1 mrg else if (0 <= fn_level && fn_level <= GOMP_DIM_MAX)
6373 1.1 mrg {
6374 1.1 mrg routine_p = true;
6375 1.1 mrg routine_seq_p = fn_level == GOMP_DIM_MAX;
6376 1.1 mrg }
6377 1.1 mrg else
6378 1.1 mrg gcc_unreachable ();
6379 1.1 mrg
6380 1.1 mrg if (oacc_min_dims_p)
6381 1.1 mrg {
6382 1.1 mrg gcc_assert (dims[GOMP_DIM_VECTOR] == 1);
6383 1.1 mrg gcc_assert (dims[GOMP_DIM_WORKER] == 1);
6384 1.1 mrg gcc_assert (dims[GOMP_DIM_GANG] == 1);
6385 1.1 mrg
6386 1.1 mrg dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
6387 1.1 mrg return;
6388 1.1 mrg }
6389 1.1 mrg
6390 1.1 mrg if (routine_p)
6391 1.1 mrg {
6392 1.1 mrg if (!routine_seq_p)
6393 1.1 mrg dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
6394 1.1 mrg
6395 1.1 mrg return;
6396 1.1 mrg }
6397 1.1 mrg
6398 1.1 mrg if (oacc_default_dims_p)
6399 1.1 mrg {
6400 1.1 mrg /* -1 : not set
6401 1.1 mrg 0 : set at runtime, f.i. -fopenacc-dims=-
6402 1.1 mrg >= 1: set at compile time, f.i. -fopenacc-dims=1. */
6403 1.1 mrg gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
6404 1.1 mrg gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
6405 1.1 mrg gcc_assert (dims[GOMP_DIM_GANG] >= -1);
6406 1.1 mrg
6407 1.1 mrg /* But -fopenacc-dims=- is not yet supported on trunk. */
6408 1.1 mrg gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
6409 1.1 mrg gcc_assert (dims[GOMP_DIM_WORKER] != 0);
6410 1.1 mrg gcc_assert (dims[GOMP_DIM_GANG] != 0);
6411 1.1 mrg }
6412 1.1 mrg
6413 1.1 mrg if (offload_region_p)
6414 1.1 mrg {
6415 1.1 mrg /* -1 : not set
6416 1.1 mrg 0 : set using variable, f.i. num_gangs (n)
6417 1.1 mrg >= 1: set using constant, f.i. num_gangs (1). */
6418 1.1 mrg gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
6419 1.1 mrg gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
6420 1.1 mrg gcc_assert (dims[GOMP_DIM_GANG] >= -1);
6421 1.1 mrg }
6422 1.1 mrg
6423 1.1 mrg if (offload_region_p)
6424 1.1 mrg default_vector_length = oacc_get_default_dim (GOMP_DIM_VECTOR);
6425 1.1 mrg else
6426 1.1 mrg /* oacc_default_dims_p. */
6427 1.1 mrg default_vector_length = PTX_DEFAULT_VECTOR_LENGTH;
6428 1.1 mrg
6429 1.1 mrg int old_dims[GOMP_DIM_MAX];
6430 1.1 mrg unsigned int i;
6431 1.1 mrg for (i = 0; i < GOMP_DIM_MAX; ++i)
6432 1.1 mrg old_dims[i] = dims[i];
6433 1.1 mrg
6434 1.1 mrg const char *vector_reason = NULL;
6435 1.1 mrg if (offload_region_p && has_vector_partitionable_routine_calls_p (decl))
6436 1.1 mrg {
6437 1.1 mrg default_vector_length = PTX_WARP_SIZE;
6438 1.1 mrg
6439 1.1 mrg if (dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
6440 1.1 mrg {
6441 1.1 mrg vector_reason = G_("using %<vector_length (%d)%> due to call to"
6442 1.1 mrg " vector-partitionable routine, ignoring %d");
6443 1.1 mrg dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
6444 1.1 mrg }
6445 1.1 mrg }
6446 1.1 mrg
6447 1.1 mrg if (dims[GOMP_DIM_VECTOR] == 0)
6448 1.1 mrg {
6449 1.1 mrg vector_reason = G_("using %<vector_length (%d)%>, ignoring runtime setting");
6450 1.1 mrg dims[GOMP_DIM_VECTOR] = default_vector_length;
6451 1.1 mrg }
6452 1.1 mrg
6453 1.1 mrg if (dims[GOMP_DIM_VECTOR] > 0
6454 1.1 mrg && !nvptx_welformed_vector_length_p (dims[GOMP_DIM_VECTOR]))
6455 1.1 mrg dims[GOMP_DIM_VECTOR] = default_vector_length;
6456 1.1 mrg
6457 1.1 mrg nvptx_apply_dim_limits (dims);
6458 1.1 mrg
6459 1.1 mrg if (dims[GOMP_DIM_VECTOR] != old_dims[GOMP_DIM_VECTOR])
6460 1.1 mrg warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
6461 1.1 mrg vector_reason != NULL
6462 1.1 mrg ? vector_reason
6463 1.1 mrg : G_("using %<vector_length (%d)%>, ignoring %d"),
6464 1.1 mrg dims[GOMP_DIM_VECTOR], old_dims[GOMP_DIM_VECTOR]);
6465 1.1 mrg
6466 1.1 mrg if (dims[GOMP_DIM_WORKER] != old_dims[GOMP_DIM_WORKER])
6467 1.1 mrg warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
6468 1.1 mrg G_("using %<num_workers (%d)%>, ignoring %d"),
6469 1.1 mrg dims[GOMP_DIM_WORKER], old_dims[GOMP_DIM_WORKER]);
6470 1.1 mrg
6471 1.1 mrg if (oacc_default_dims_p)
6472 1.1 mrg {
6473 1.1 mrg if (dims[GOMP_DIM_VECTOR] < 0)
6474 1.1 mrg dims[GOMP_DIM_VECTOR] = default_vector_length;
6475 1.1 mrg if (dims[GOMP_DIM_WORKER] < 0)
6476 1.1 mrg dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
6477 1.1 mrg if (dims[GOMP_DIM_GANG] < 0)
6478 1.1 mrg dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
6479 1.1 mrg nvptx_apply_dim_limits (dims);
6480 1.1 mrg }
6481 1.1 mrg
6482 1.1 mrg if (offload_region_p)
6483 1.1 mrg {
6484 1.1 mrg for (i = 0; i < GOMP_DIM_MAX; i++)
6485 1.1 mrg {
6486 1.1 mrg if (!(dims[i] < 0))
6487 1.1 mrg continue;
6488 1.1 mrg
6489 1.1 mrg if ((used & GOMP_DIM_MASK (i)) == 0)
6490 1.1 mrg /* Function oacc_validate_dims will apply the minimal dimension. */
6491 1.1 mrg continue;
6492 1.1 mrg
6493 1.1 mrg dims[i] = (i == GOMP_DIM_VECTOR
6494 1.1 mrg ? default_vector_length
6495 1.1 mrg : oacc_get_default_dim (i));
6496 1.1 mrg }
6497 1.1 mrg
6498 1.1 mrg nvptx_apply_dim_limits (dims);
6499 1.1 mrg }
6500 1.1 mrg }
6501 1.1 mrg
6502 1.1 mrg /* Validate compute dimensions of an OpenACC offload or routine, fill
6503 1.1 mrg in non-unity defaults. FN_LEVEL indicates the level at which a
6504 1.1 mrg routine might spawn a loop. It is negative for non-routines. If
6505 1.1 mrg DECL is null, we are validating the default dimensions. */
6506 1.1 mrg
6507 1.1 mrg static bool
6508 1.1 mrg nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level, unsigned used)
6509 1.1 mrg {
6510 1.1 mrg int old_dims[GOMP_DIM_MAX];
6511 1.1 mrg unsigned int i;
6512 1.1 mrg
6513 1.1 mrg for (i = 0; i < GOMP_DIM_MAX; ++i)
6514 1.1 mrg old_dims[i] = dims[i];
6515 1.1 mrg
6516 1.1 mrg nvptx_goacc_validate_dims_1 (decl, dims, fn_level, used);
6517 1.1 mrg
6518 1.1 mrg gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
6519 1.1 mrg if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0)
6520 1.1 mrg gcc_assert (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] <= PTX_CTA_SIZE);
6521 1.1 mrg
6522 1.1 mrg for (i = 0; i < GOMP_DIM_MAX; ++i)
6523 1.1 mrg if (old_dims[i] != dims[i])
6524 1.1 mrg return true;
6525 1.1 mrg
6526 1.1 mrg return false;
6527 1.1 mrg }
6528 1.1 mrg
6529 1.1 mrg /* Return maximum dimension size, or zero for unbounded. */
6530 1.1 mrg
6531 1.1 mrg static int
6532 1.1 mrg nvptx_dim_limit (int axis)
6533 1.1 mrg {
6534 1.1 mrg switch (axis)
6535 1.1 mrg {
6536 1.1 mrg case GOMP_DIM_VECTOR:
6537 1.1 mrg return PTX_MAX_VECTOR_LENGTH;
6538 1.1 mrg
6539 1.1 mrg default:
6540 1.1 mrg break;
6541 1.1 mrg }
6542 1.1 mrg return 0;
6543 1.1 mrg }
6544 1.1 mrg
6545 1.1 mrg /* Determine whether fork & joins are needed. */
6546 1.1 mrg
6547 1.1 mrg static bool
6548 1.1 mrg nvptx_goacc_fork_join (gcall *call, const int dims[],
6549 1.1 mrg bool ARG_UNUSED (is_fork))
6550 1.1 mrg {
6551 1.1 mrg tree arg = gimple_call_arg (call, 2);
6552 1.1 mrg unsigned axis = TREE_INT_CST_LOW (arg);
6553 1.1 mrg
6554 1.1 mrg /* We only care about worker and vector partitioning. */
6555 1.1 mrg if (axis < GOMP_DIM_WORKER)
6556 1.1 mrg return false;
6557 1.1 mrg
6558 1.1 mrg /* If the size is 1, there's no partitioning. */
6559 1.1 mrg if (dims[axis] == 1)
6560 1.1 mrg return false;
6561 1.1 mrg
6562 1.1 mrg return true;
6563 1.1 mrg }
6564 1.1 mrg
6565 1.1 mrg /* Generate a PTX builtin function call that returns the address in
6566 1.1 mrg the worker reduction buffer at OFFSET. TYPE is the type of the
6567 1.1 mrg data at that location. */
6568 1.1 mrg
6569 1.1 mrg static tree
6570 1.1 mrg nvptx_get_shared_red_addr (tree type, tree offset, bool vector)
6571 1.1 mrg {
6572 1.1 mrg enum nvptx_builtins addr_dim = NVPTX_BUILTIN_WORKER_ADDR;
6573 1.1 mrg if (vector)
6574 1.1 mrg addr_dim = NVPTX_BUILTIN_VECTOR_ADDR;
6575 1.1 mrg machine_mode mode = TYPE_MODE (type);
6576 1.1 mrg tree fndecl = nvptx_builtin_decl (addr_dim, true);
6577 1.1 mrg tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
6578 1.1 mrg tree align = build_int_cst (unsigned_type_node,
6579 1.1 mrg GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
6580 1.1 mrg tree call = build_call_expr (fndecl, 3, offset, size, align);
6581 1.1 mrg
6582 1.1 mrg return fold_convert (build_pointer_type (type), call);
6583 1.1 mrg }
6584 1.1 mrg
6585 1.1 mrg /* Emit a SHFL.DOWN using index SHFL of VAR into DEST_VAR. This function
6586 1.1 mrg will cast the variable if necessary. */
6587 1.1 mrg
6588 1.1 mrg static void
6589 1.1 mrg nvptx_generate_vector_shuffle (location_t loc,
6590 1.1 mrg tree dest_var, tree var, unsigned shift,
6591 1.1 mrg gimple_seq *seq)
6592 1.1 mrg {
6593 1.1 mrg unsigned fn = NVPTX_BUILTIN_SHUFFLE;
6594 1.1 mrg tree_code code = NOP_EXPR;
6595 1.1 mrg tree arg_type = unsigned_type_node;
6596 1.1 mrg tree var_type = TREE_TYPE (var);
6597 1.1 mrg tree dest_type = var_type;
6598 1.1 mrg
6599 1.1 mrg if (TREE_CODE (var_type) == COMPLEX_TYPE)
6600 1.1 mrg var_type = TREE_TYPE (var_type);
6601 1.1 mrg
6602 1.1 mrg if (TREE_CODE (var_type) == REAL_TYPE)
6603 1.1 mrg code = VIEW_CONVERT_EXPR;
6604 1.1 mrg
6605 1.1 mrg if (TYPE_SIZE (var_type)
6606 1.1 mrg == TYPE_SIZE (long_long_unsigned_type_node))
6607 1.1 mrg {
6608 1.1 mrg fn = NVPTX_BUILTIN_SHUFFLELL;
6609 1.1 mrg arg_type = long_long_unsigned_type_node;
6610 1.1 mrg }
6611 1.1 mrg
6612 1.1 mrg tree call = nvptx_builtin_decl (fn, true);
6613 1.1 mrg tree bits = build_int_cst (unsigned_type_node, shift);
6614 1.1 mrg tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
6615 1.1 mrg tree expr;
6616 1.1 mrg
6617 1.1 mrg if (var_type != dest_type)
6618 1.1 mrg {
6619 1.1 mrg /* Do real and imaginary parts separately. */
6620 1.1 mrg tree real = fold_build1 (REALPART_EXPR, var_type, var);
6621 1.1 mrg real = fold_build1 (code, arg_type, real);
6622 1.1 mrg real = build_call_expr_loc (loc, call, 3, real, bits, kind);
6623 1.1 mrg real = fold_build1 (code, var_type, real);
6624 1.1 mrg
6625 1.1 mrg tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
6626 1.1 mrg imag = fold_build1 (code, arg_type, imag);
6627 1.1 mrg imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
6628 1.1 mrg imag = fold_build1 (code, var_type, imag);
6629 1.1 mrg
6630 1.1 mrg expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
6631 1.1 mrg }
6632 1.1 mrg else
6633 1.1 mrg {
6634 1.1 mrg expr = fold_build1 (code, arg_type, var);
6635 1.1 mrg expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
6636 1.1 mrg expr = fold_build1 (code, dest_type, expr);
6637 1.1 mrg }
6638 1.1 mrg
6639 1.1 mrg gimplify_assign (dest_var, expr, seq);
6640 1.1 mrg }
6641 1.1 mrg
6642 1.1 mrg /* Lazily generate the global lock var decl and return its address. */
6643 1.1 mrg
6644 1.1 mrg static tree
6645 1.1 mrg nvptx_global_lock_addr ()
6646 1.1 mrg {
6647 1.1 mrg tree v = global_lock_var;
6648 1.1 mrg
6649 1.1 mrg if (!v)
6650 1.1 mrg {
6651 1.1 mrg tree name = get_identifier ("__reduction_lock");
6652 1.1 mrg tree type = build_qualified_type (unsigned_type_node,
6653 1.1 mrg TYPE_QUAL_VOLATILE);
6654 1.1 mrg v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
6655 1.1 mrg global_lock_var = v;
6656 1.1 mrg DECL_ARTIFICIAL (v) = 1;
6657 1.1 mrg DECL_EXTERNAL (v) = 1;
6658 1.1 mrg TREE_STATIC (v) = 1;
6659 1.1 mrg TREE_PUBLIC (v) = 1;
6660 1.1 mrg TREE_USED (v) = 1;
6661 1.1 mrg mark_addressable (v);
6662 1.1 mrg mark_decl_referenced (v);
6663 1.1 mrg }
6664 1.1 mrg
6665 1.1 mrg return build_fold_addr_expr (v);
6666 1.1 mrg }
6667 1.1 mrg
6668 1.1 mrg /* Insert code to locklessly update *PTR with *PTR OP VAR just before
6669 1.1 mrg GSI. We use a lockless scheme for nearly all case, which looks
6670 1.1 mrg like:
6671 1.1 mrg actual = initval(OP);
6672 1.1 mrg do {
6673 1.1 mrg guess = actual;
6674 1.1 mrg write = guess OP myval;
6675 1.1 mrg actual = cmp&swap (ptr, guess, write)
6676 1.1 mrg } while (actual bit-different-to guess);
6677 1.1 mrg return write;
6678 1.1 mrg
6679 1.1 mrg This relies on a cmp&swap instruction, which is available for 32-
6680 1.1 mrg and 64-bit types. Larger types must use a locking scheme. */
6681 1.1 mrg
6682 1.1 mrg static tree
6683 1.1 mrg nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
6684 1.1 mrg tree ptr, tree var, tree_code op)
6685 1.1 mrg {
6686 1.1 mrg unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
6687 1.1 mrg tree_code code = NOP_EXPR;
6688 1.1 mrg tree arg_type = unsigned_type_node;
6689 1.1 mrg tree var_type = TREE_TYPE (var);
6690 1.1 mrg
6691 1.1 mrg if (TREE_CODE (var_type) == COMPLEX_TYPE
6692 1.1 mrg || TREE_CODE (var_type) == REAL_TYPE)
6693 1.1 mrg code = VIEW_CONVERT_EXPR;
6694 1.1 mrg
6695 1.1 mrg if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
6696 1.1 mrg {
6697 1.1 mrg arg_type = long_long_unsigned_type_node;
6698 1.1 mrg fn = NVPTX_BUILTIN_CMP_SWAPLL;
6699 1.1 mrg }
6700 1.1 mrg
6701 1.1 mrg tree swap_fn = nvptx_builtin_decl (fn, true);
6702 1.1 mrg
6703 1.1 mrg gimple_seq init_seq = NULL;
6704 1.1 mrg tree init_var = make_ssa_name (arg_type);
6705 1.1 mrg tree init_expr = omp_reduction_init_op (loc, op, var_type);
6706 1.1 mrg init_expr = fold_build1 (code, arg_type, init_expr);
6707 1.1 mrg gimplify_assign (init_var, init_expr, &init_seq);
6708 1.1 mrg gimple *init_end = gimple_seq_last (init_seq);
6709 1.1 mrg
6710 1.1 mrg gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
6711 1.1 mrg
6712 1.1 mrg /* Split the block just after the init stmts. */
6713 1.1 mrg basic_block pre_bb = gsi_bb (*gsi);
6714 1.1 mrg edge pre_edge = split_block (pre_bb, init_end);
6715 1.1 mrg basic_block loop_bb = pre_edge->dest;
6716 1.1 mrg pre_bb = pre_edge->src;
6717 1.1 mrg /* Reset the iterator. */
6718 1.1 mrg *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6719 1.1 mrg
6720 1.1 mrg tree expect_var = make_ssa_name (arg_type);
6721 1.1 mrg tree actual_var = make_ssa_name (arg_type);
6722 1.1 mrg tree write_var = make_ssa_name (arg_type);
6723 1.1 mrg
6724 1.1 mrg /* Build and insert the reduction calculation. */
6725 1.1 mrg gimple_seq red_seq = NULL;
6726 1.1 mrg tree write_expr = fold_build1 (code, var_type, expect_var);
6727 1.1 mrg write_expr = fold_build2 (op, var_type, write_expr, var);
6728 1.1 mrg write_expr = fold_build1 (code, arg_type, write_expr);
6729 1.1 mrg gimplify_assign (write_var, write_expr, &red_seq);
6730 1.1 mrg
6731 1.1 mrg gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
6732 1.1 mrg
6733 1.1 mrg /* Build & insert the cmp&swap sequence. */
6734 1.1 mrg gimple_seq latch_seq = NULL;
6735 1.1 mrg tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
6736 1.1 mrg ptr, expect_var, write_var);
6737 1.1 mrg gimplify_assign (actual_var, swap_expr, &latch_seq);
6738 1.1 mrg
6739 1.1 mrg gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
6740 1.1 mrg NULL_TREE, NULL_TREE);
6741 1.1 mrg gimple_seq_add_stmt (&latch_seq, cond);
6742 1.1 mrg
6743 1.1 mrg gimple *latch_end = gimple_seq_last (latch_seq);
6744 1.1 mrg gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
6745 1.1 mrg
6746 1.1 mrg /* Split the block just after the latch stmts. */
6747 1.1 mrg edge post_edge = split_block (loop_bb, latch_end);
6748 1.1 mrg basic_block post_bb = post_edge->dest;
6749 1.1 mrg loop_bb = post_edge->src;
6750 1.1 mrg *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6751 1.1 mrg
6752 1.1 mrg post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
6753 1.1 mrg post_edge->probability = profile_probability::even ();
6754 1.1 mrg edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
6755 1.1 mrg loop_edge->probability = profile_probability::even ();
6756 1.1 mrg set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
6757 1.1 mrg set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
6758 1.1 mrg
6759 1.1 mrg gphi *phi = create_phi_node (expect_var, loop_bb);
6760 1.1 mrg add_phi_arg (phi, init_var, pre_edge, loc);
6761 1.1 mrg add_phi_arg (phi, actual_var, loop_edge, loc);
6762 1.1 mrg
6763 1.1 mrg loop *loop = alloc_loop ();
6764 1.1 mrg loop->header = loop_bb;
6765 1.1 mrg loop->latch = loop_bb;
6766 1.1 mrg add_loop (loop, loop_bb->loop_father);
6767 1.1 mrg
6768 1.1 mrg return fold_build1 (code, var_type, write_var);
6769 1.1 mrg }
6770 1.1 mrg
6771 1.1 mrg /* Insert code to lockfully update *PTR with *PTR OP VAR just before
6772 1.1 mrg GSI. This is necessary for types larger than 64 bits, where there
6773 1.1 mrg is no cmp&swap instruction to implement a lockless scheme. We use
6774 1.1 mrg a lock variable in global memory.
6775 1.1 mrg
6776 1.1 mrg while (cmp&swap (&lock_var, 0, 1))
6777 1.1 mrg continue;
6778 1.1 mrg T accum = *ptr;
6779 1.1 mrg accum = accum OP var;
6780 1.1 mrg *ptr = accum;
6781 1.1 mrg cmp&swap (&lock_var, 1, 0);
6782 1.1 mrg return accum;
6783 1.1 mrg
6784 1.1 mrg A lock in global memory is necessary to force execution engine
6785 1.1 mrg descheduling and avoid resource starvation that can occur if the
6786 1.1 mrg lock is in .shared memory. */
6787 1.1 mrg
6788 1.1 mrg static tree
6789 1.1 mrg nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
6790 1.1 mrg tree ptr, tree var, tree_code op, int level)
6791 1.1 mrg {
6792 1.1 mrg tree var_type = TREE_TYPE (var);
6793 1.1 mrg tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
6794 1.1 mrg tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
6795 1.1 mrg tree uns_locked = build_int_cst (unsigned_type_node, 1);
6796 1.1 mrg
6797 1.1 mrg /* Split the block just before the gsi. Insert a gimple nop to make
6798 1.1 mrg this easier. */
6799 1.1 mrg gimple *nop = gimple_build_nop ();
6800 1.1 mrg gsi_insert_before (gsi, nop, GSI_SAME_STMT);
6801 1.1 mrg basic_block entry_bb = gsi_bb (*gsi);
6802 1.1 mrg edge entry_edge = split_block (entry_bb, nop);
6803 1.1 mrg basic_block lock_bb = entry_edge->dest;
6804 1.1 mrg /* Reset the iterator. */
6805 1.1 mrg *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6806 1.1 mrg
6807 1.1 mrg /* Build and insert the locking sequence. */
6808 1.1 mrg gimple_seq lock_seq = NULL;
6809 1.1 mrg tree lock_var = make_ssa_name (unsigned_type_node);
6810 1.1 mrg tree lock_expr = nvptx_global_lock_addr ();
6811 1.1 mrg lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
6812 1.1 mrg uns_unlocked, uns_locked);
6813 1.1 mrg gimplify_assign (lock_var, lock_expr, &lock_seq);
6814 1.1 mrg gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
6815 1.1 mrg NULL_TREE, NULL_TREE);
6816 1.1 mrg gimple_seq_add_stmt (&lock_seq, cond);
6817 1.1 mrg gimple *lock_end = gimple_seq_last (lock_seq);
6818 1.1 mrg gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
6819 1.1 mrg
6820 1.1 mrg /* Split the block just after the lock sequence. */
6821 1.1 mrg edge locked_edge = split_block (lock_bb, lock_end);
6822 1.1 mrg basic_block update_bb = locked_edge->dest;
6823 1.1 mrg lock_bb = locked_edge->src;
6824 1.1 mrg *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6825 1.1 mrg
6826 1.1 mrg /* Create the lock loop ... */
6827 1.1 mrg locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
6828 1.1 mrg locked_edge->probability = profile_probability::even ();
6829 1.1 mrg edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
6830 1.1 mrg loop_edge->probability = profile_probability::even ();
6831 1.1 mrg set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
6832 1.1 mrg set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
6833 1.1 mrg
6834 1.1 mrg /* ... and the loop structure. */
6835 1.1 mrg loop *lock_loop = alloc_loop ();
6836 1.1 mrg lock_loop->header = lock_bb;
6837 1.1 mrg lock_loop->latch = lock_bb;
6838 1.1 mrg lock_loop->nb_iterations_estimate = 1;
6839 1.1 mrg lock_loop->any_estimate = true;
6840 1.1 mrg add_loop (lock_loop, entry_bb->loop_father);
6841 1.1 mrg
6842 1.1 mrg /* Build the pre-barrier. */
6843 1.1 mrg gimple_seq red_seq = NULL;
6844 1.1 mrg enum nvptx_builtins barrier_builtin
6845 1.1 mrg = (level == GOMP_DIM_GANG
6846 1.1 mrg ? NVPTX_BUILTIN_MEMBAR_GL
6847 1.1 mrg : NVPTX_BUILTIN_MEMBAR_CTA);
6848 1.1 mrg tree barrier_fn = nvptx_builtin_decl (barrier_builtin, true);
6849 1.1 mrg tree barrier_expr = build_call_expr_loc (loc, barrier_fn, 0);
6850 1.1 mrg gimplify_stmt (&barrier_expr, &red_seq);
6851 1.1 mrg
6852 1.1 mrg /* Build the reduction calculation. */
6853 1.1 mrg tree acc_in = make_ssa_name (var_type);
6854 1.1 mrg tree ref_in = build_simple_mem_ref (ptr);
6855 1.1 mrg TREE_THIS_VOLATILE (ref_in) = 1;
6856 1.1 mrg gimplify_assign (acc_in, ref_in, &red_seq);
6857 1.1 mrg
6858 1.1 mrg tree acc_out = make_ssa_name (var_type);
6859 1.1 mrg tree update_expr = fold_build2 (op, var_type, ref_in, var);
6860 1.1 mrg gimplify_assign (acc_out, update_expr, &red_seq);
6861 1.1 mrg
6862 1.1 mrg tree ref_out = build_simple_mem_ref (ptr);
6863 1.1 mrg TREE_THIS_VOLATILE (ref_out) = 1;
6864 1.1 mrg gimplify_assign (ref_out, acc_out, &red_seq);
6865 1.1 mrg
6866 1.1 mrg /* Build the post-barrier. */
6867 1.1 mrg barrier_expr = build_call_expr_loc (loc, barrier_fn, 0);
6868 1.1 mrg gimplify_stmt (&barrier_expr, &red_seq);
6869 1.1 mrg
6870 1.1 mrg /* Insert the reduction calculation. */
6871 1.1 mrg gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
6872 1.1 mrg
6873 1.1 mrg /* Build & insert the unlock sequence. */
6874 1.1 mrg gimple_seq unlock_seq = NULL;
6875 1.1 mrg tree unlock_expr = nvptx_global_lock_addr ();
6876 1.1 mrg unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
6877 1.1 mrg uns_locked, uns_unlocked);
6878 1.1 mrg gimplify_and_add (unlock_expr, &unlock_seq);
6879 1.1 mrg gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
6880 1.1 mrg
6881 1.1 mrg return acc_out;
6882 1.1 mrg }
6883 1.1 mrg
6884 1.1 mrg /* Emit a sequence to update a reduction accumlator at *PTR with the
6885 1.1 mrg value held in VAR using operator OP. Return the updated value.
6886 1.1 mrg
6887 1.1 mrg TODO: optimize for atomic ops and indepedent complex ops. */
6888 1.1 mrg
6889 1.1 mrg static tree
6890 1.1 mrg nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
6891 1.1 mrg tree ptr, tree var, tree_code op, int level)
6892 1.1 mrg {
6893 1.1 mrg tree type = TREE_TYPE (var);
6894 1.1 mrg tree size = TYPE_SIZE (type);
6895 1.1 mrg
6896 1.1 mrg if (size == TYPE_SIZE (unsigned_type_node)
6897 1.1 mrg || size == TYPE_SIZE (long_long_unsigned_type_node))
6898 1.1 mrg return nvptx_lockless_update (loc, gsi, ptr, var, op);
6899 1.1 mrg else
6900 1.1 mrg return nvptx_lockfull_update (loc, gsi, ptr, var, op, level);
6901 1.1 mrg }
6902 1.1 mrg
6903 1.1 mrg /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
6904 1.1 mrg
6905 1.1 mrg static void
6906 1.1 mrg nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa)
6907 1.1 mrg {
6908 1.1 mrg gimple_stmt_iterator gsi = gsi_for_stmt (call);
6909 1.1 mrg tree lhs = gimple_call_lhs (call);
6910 1.1 mrg tree var = gimple_call_arg (call, 2);
6911 1.1 mrg int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6912 1.1 mrg gimple_seq seq = NULL;
6913 1.1 mrg
6914 1.1 mrg push_gimplify_context (true);
6915 1.1 mrg
6916 1.1 mrg if (level != GOMP_DIM_GANG)
6917 1.1 mrg {
6918 1.1 mrg /* Copy the receiver object. */
6919 1.1 mrg tree ref_to_res = gimple_call_arg (call, 1);
6920 1.1 mrg
6921 1.1 mrg if (!integer_zerop (ref_to_res))
6922 1.1 mrg var = build_simple_mem_ref (ref_to_res);
6923 1.1 mrg }
6924 1.1 mrg
6925 1.1 mrg if (level == GOMP_DIM_WORKER
6926 1.1 mrg || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
6927 1.1 mrg {
6928 1.1 mrg /* Store incoming value to worker reduction buffer. */
6929 1.1 mrg tree offset = gimple_call_arg (call, 5);
6930 1.1 mrg tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6931 1.1 mrg level == GOMP_DIM_VECTOR);
6932 1.1 mrg tree ptr = make_ssa_name (TREE_TYPE (call));
6933 1.1 mrg
6934 1.1 mrg gimplify_assign (ptr, call, &seq);
6935 1.1 mrg tree ref = build_simple_mem_ref (ptr);
6936 1.1 mrg TREE_THIS_VOLATILE (ref) = 1;
6937 1.1 mrg gimplify_assign (ref, var, &seq);
6938 1.1 mrg }
6939 1.1 mrg
6940 1.1 mrg if (lhs)
6941 1.1 mrg gimplify_assign (lhs, var, &seq);
6942 1.1 mrg
6943 1.1 mrg pop_gimplify_context (NULL);
6944 1.1 mrg gsi_replace_with_seq (&gsi, seq, true);
6945 1.1 mrg }
6946 1.1 mrg
6947 1.1 mrg /* NVPTX implementation of GOACC_REDUCTION_INIT. */
6948 1.1 mrg
6949 1.1 mrg static void
6950 1.1 mrg nvptx_goacc_reduction_init (gcall *call, offload_attrs *oa)
6951 1.1 mrg {
6952 1.1 mrg gimple_stmt_iterator gsi = gsi_for_stmt (call);
6953 1.1 mrg tree lhs = gimple_call_lhs (call);
6954 1.1 mrg tree var = gimple_call_arg (call, 2);
6955 1.1 mrg int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6956 1.1 mrg enum tree_code rcode
6957 1.1 mrg = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
6958 1.1 mrg tree init = omp_reduction_init_op (gimple_location (call), rcode,
6959 1.1 mrg TREE_TYPE (var));
6960 1.1 mrg gimple_seq seq = NULL;
6961 1.1 mrg
6962 1.1 mrg push_gimplify_context (true);
6963 1.1 mrg
6964 1.1 mrg if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
6965 1.1 mrg {
6966 1.1 mrg /* Initialize vector-non-zeroes to INIT_VAL (OP). */
6967 1.1 mrg tree tid = make_ssa_name (integer_type_node);
6968 1.1 mrg tree dim_vector = gimple_call_arg (call, 3);
6969 1.1 mrg gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
6970 1.1 mrg dim_vector);
6971 1.1 mrg gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
6972 1.1 mrg NULL_TREE, NULL_TREE);
6973 1.1 mrg
6974 1.1 mrg gimple_call_set_lhs (tid_call, tid);
6975 1.1 mrg gimple_seq_add_stmt (&seq, tid_call);
6976 1.1 mrg gimple_seq_add_stmt (&seq, cond_stmt);
6977 1.1 mrg
6978 1.1 mrg /* Split the block just after the call. */
6979 1.1 mrg edge init_edge = split_block (gsi_bb (gsi), call);
6980 1.1 mrg basic_block init_bb = init_edge->dest;
6981 1.1 mrg basic_block call_bb = init_edge->src;
6982 1.1 mrg
6983 1.1 mrg /* Fixup flags from call_bb to init_bb. */
6984 1.1 mrg init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
6985 1.1 mrg init_edge->probability = profile_probability::even ();
6986 1.1 mrg
6987 1.1 mrg /* Set the initialization stmts. */
6988 1.1 mrg gimple_seq init_seq = NULL;
6989 1.1 mrg tree init_var = make_ssa_name (TREE_TYPE (var));
6990 1.1 mrg gimplify_assign (init_var, init, &init_seq);
6991 1.1 mrg gsi = gsi_start_bb (init_bb);
6992 1.1 mrg gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
6993 1.1 mrg
6994 1.1 mrg /* Split block just after the init stmt. */
6995 1.1 mrg gsi_prev (&gsi);
6996 1.1 mrg edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
6997 1.1 mrg basic_block dst_bb = inited_edge->dest;
6998 1.1 mrg
6999 1.1 mrg /* Create false edge from call_bb to dst_bb. */
7000 1.1 mrg edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
7001 1.1 mrg nop_edge->probability = profile_probability::even ();
7002 1.1 mrg
7003 1.1 mrg /* Create phi node in dst block. */
7004 1.1 mrg gphi *phi = create_phi_node (lhs, dst_bb);
7005 1.1 mrg add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
7006 1.1 mrg add_phi_arg (phi, var, nop_edge, gimple_location (call));
7007 1.1 mrg
7008 1.1 mrg /* Reset dominator of dst bb. */
7009 1.1 mrg set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
7010 1.1 mrg
7011 1.1 mrg /* Reset the gsi. */
7012 1.1 mrg gsi = gsi_for_stmt (call);
7013 1.1 mrg }
7014 1.1 mrg else
7015 1.1 mrg {
7016 1.1 mrg if (level == GOMP_DIM_GANG)
7017 1.1 mrg {
7018 1.1 mrg /* If there's no receiver object, propagate the incoming VAR. */
7019 1.1 mrg tree ref_to_res = gimple_call_arg (call, 1);
7020 1.1 mrg if (integer_zerop (ref_to_res))
7021 1.1 mrg init = var;
7022 1.1 mrg }
7023 1.1 mrg
7024 1.1 mrg if (lhs != NULL_TREE)
7025 1.1 mrg gimplify_assign (lhs, init, &seq);
7026 1.1 mrg }
7027 1.1 mrg
7028 1.1 mrg pop_gimplify_context (NULL);
7029 1.1 mrg gsi_replace_with_seq (&gsi, seq, true);
7030 1.1 mrg }
7031 1.1 mrg
7032 1.1 mrg /* NVPTX implementation of GOACC_REDUCTION_FINI. */
7033 1.1 mrg
7034 1.1 mrg static void
7035 1.1 mrg nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa)
7036 1.1 mrg {
7037 1.1 mrg gimple_stmt_iterator gsi = gsi_for_stmt (call);
7038 1.1 mrg tree lhs = gimple_call_lhs (call);
7039 1.1 mrg tree ref_to_res = gimple_call_arg (call, 1);
7040 1.1 mrg tree var = gimple_call_arg (call, 2);
7041 1.1 mrg int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
7042 1.1 mrg enum tree_code op
7043 1.1 mrg = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
7044 1.1 mrg gimple_seq seq = NULL;
7045 1.1 mrg tree r = NULL_TREE;;
7046 1.1 mrg
7047 1.1 mrg push_gimplify_context (true);
7048 1.1 mrg
7049 1.1 mrg if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
7050 1.1 mrg {
7051 1.1 mrg /* Emit binary shuffle tree. TODO. Emit this as an actual loop,
7052 1.1 mrg but that requires a method of emitting a unified jump at the
7053 1.1 mrg gimple level. */
7054 1.1 mrg for (int shfl = PTX_WARP_SIZE / 2; shfl > 0; shfl = shfl >> 1)
7055 1.1 mrg {
7056 1.1 mrg tree other_var = make_ssa_name (TREE_TYPE (var));
7057 1.1 mrg nvptx_generate_vector_shuffle (gimple_location (call),
7058 1.1 mrg other_var, var, shfl, &seq);
7059 1.1 mrg
7060 1.1 mrg r = make_ssa_name (TREE_TYPE (var));
7061 1.1 mrg gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
7062 1.1 mrg var, other_var), &seq);
7063 1.1 mrg var = r;
7064 1.1 mrg }
7065 1.1 mrg }
7066 1.1 mrg else
7067 1.1 mrg {
7068 1.1 mrg tree accum = NULL_TREE;
7069 1.1 mrg
7070 1.1 mrg if (level == GOMP_DIM_WORKER || level == GOMP_DIM_VECTOR)
7071 1.1 mrg {
7072 1.1 mrg /* Get reduction buffer address. */
7073 1.1 mrg tree offset = gimple_call_arg (call, 5);
7074 1.1 mrg tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
7075 1.1 mrg level == GOMP_DIM_VECTOR);
7076 1.1 mrg tree ptr = make_ssa_name (TREE_TYPE (call));
7077 1.1 mrg
7078 1.1 mrg gimplify_assign (ptr, call, &seq);
7079 1.1 mrg accum = ptr;
7080 1.1 mrg }
7081 1.1 mrg else if (integer_zerop (ref_to_res))
7082 1.1 mrg r = var;
7083 1.1 mrg else
7084 1.1 mrg accum = ref_to_res;
7085 1.1 mrg
7086 1.1 mrg if (accum)
7087 1.1 mrg {
7088 1.1 mrg /* UPDATE the accumulator. */
7089 1.1 mrg gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
7090 1.1 mrg seq = NULL;
7091 1.1 mrg r = nvptx_reduction_update (gimple_location (call), &gsi,
7092 1.1 mrg accum, var, op, level);
7093 1.1 mrg }
7094 1.1 mrg }
7095 1.1 mrg
7096 1.1 mrg if (lhs)
7097 1.1 mrg gimplify_assign (lhs, r, &seq);
7098 1.1 mrg pop_gimplify_context (NULL);
7099 1.1 mrg
7100 1.1 mrg gsi_replace_with_seq (&gsi, seq, true);
7101 1.1 mrg }
7102 1.1 mrg
7103 1.1 mrg /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
7104 1.1 mrg
7105 1.1 mrg static void
7106 1.1 mrg nvptx_goacc_reduction_teardown (gcall *call, offload_attrs *oa)
7107 1.1 mrg {
7108 1.1 mrg gimple_stmt_iterator gsi = gsi_for_stmt (call);
7109 1.1 mrg tree lhs = gimple_call_lhs (call);
7110 1.1 mrg tree var = gimple_call_arg (call, 2);
7111 1.1 mrg int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
7112 1.1 mrg gimple_seq seq = NULL;
7113 1.1 mrg
7114 1.1 mrg push_gimplify_context (true);
7115 1.1 mrg if (level == GOMP_DIM_WORKER
7116 1.1 mrg || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
7117 1.1 mrg {
7118 1.1 mrg /* Read the worker reduction buffer. */
7119 1.1 mrg tree offset = gimple_call_arg (call, 5);
7120 1.1 mrg tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
7121 1.1 mrg level == GOMP_DIM_VECTOR);
7122 1.1 mrg tree ptr = make_ssa_name (TREE_TYPE (call));
7123 1.1 mrg
7124 1.1 mrg gimplify_assign (ptr, call, &seq);
7125 1.1 mrg var = build_simple_mem_ref (ptr);
7126 1.1 mrg TREE_THIS_VOLATILE (var) = 1;
7127 1.1 mrg }
7128 1.1 mrg
7129 1.1 mrg if (level != GOMP_DIM_GANG)
7130 1.1 mrg {
7131 1.1 mrg /* Write to the receiver object. */
7132 1.1 mrg tree ref_to_res = gimple_call_arg (call, 1);
7133 1.1 mrg
7134 1.1 mrg if (!integer_zerop (ref_to_res))
7135 1.1 mrg gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
7136 1.1 mrg }
7137 1.1 mrg
7138 1.1 mrg if (lhs)
7139 1.1 mrg gimplify_assign (lhs, var, &seq);
7140 1.1 mrg
7141 1.1 mrg pop_gimplify_context (NULL);
7142 1.1 mrg
7143 1.1 mrg gsi_replace_with_seq (&gsi, seq, true);
7144 1.1 mrg }
7145 1.1 mrg
7146 1.1 mrg /* NVPTX reduction expander. */
7147 1.1 mrg
7148 1.1 mrg static void
7149 1.1 mrg nvptx_goacc_reduction (gcall *call)
7150 1.1 mrg {
7151 1.1 mrg unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
7152 1.1 mrg offload_attrs oa;
7153 1.1 mrg
7154 1.1 mrg populate_offload_attrs (&oa);
7155 1.1 mrg
7156 1.1 mrg switch (code)
7157 1.1 mrg {
7158 1.1 mrg case IFN_GOACC_REDUCTION_SETUP:
7159 1.1 mrg nvptx_goacc_reduction_setup (call, &oa);
7160 1.1 mrg break;
7161 1.1 mrg
7162 1.1 mrg case IFN_GOACC_REDUCTION_INIT:
7163 1.1 mrg nvptx_goacc_reduction_init (call, &oa);
7164 1.1 mrg break;
7165 1.1 mrg
7166 1.1 mrg case IFN_GOACC_REDUCTION_FINI:
7167 1.1 mrg nvptx_goacc_reduction_fini (call, &oa);
7168 1.1 mrg break;
7169 1.1 mrg
7170 1.1 mrg case IFN_GOACC_REDUCTION_TEARDOWN:
7171 1.1 mrg nvptx_goacc_reduction_teardown (call, &oa);
7172 1.1 mrg break;
7173 1.1 mrg
7174 1.1 mrg default:
7175 1.1 mrg gcc_unreachable ();
7176 1.1 mrg }
7177 1.1 mrg }
7178 1.1 mrg
7179 1.1 mrg static bool
7180 1.1 mrg nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
7181 1.1 mrg rtx x ATTRIBUTE_UNUSED)
7182 1.1 mrg {
7183 1.1 mrg return true;
7184 1.1 mrg }
7185 1.1 mrg
7186 1.1 mrg static bool
7187 1.1 mrg nvptx_scalar_mode_supported_p (scalar_mode mode)
7188 1.1 mrg {
7189 1.1 mrg if (nvptx_experimental && mode == HFmode && TARGET_SM53)
7190 1.1 mrg return true;
7191 1.1 mrg
7192 1.1 mrg return default_scalar_mode_supported_p (mode);
7193 1.1 mrg }
7194 1.1 mrg
7195 1.1 mrg static bool
7196 1.1 mrg nvptx_libgcc_floating_mode_supported_p (scalar_float_mode mode)
7197 1.1 mrg {
7198 1.1 mrg if (nvptx_experimental && mode == HFmode && TARGET_SM53)
7199 1.1 mrg return true;
7200 1.1 mrg
7201 1.1 mrg return default_libgcc_floating_mode_supported_p (mode);
7202 1.1 mrg }
7203 1.1 mrg
7204 1.1 mrg static bool
7205 1.1 mrg nvptx_vector_mode_supported (machine_mode mode)
7206 1.1 mrg {
7207 1.1 mrg return (mode == V2SImode
7208 1.1 mrg || mode == V2DImode);
7209 1.1 mrg }
7210 1.1 mrg
7211 1.1 mrg /* Return the preferred mode for vectorizing scalar MODE. */
7212 1.1 mrg
7213 1.1 mrg static machine_mode
7214 1.1 mrg nvptx_preferred_simd_mode (scalar_mode mode)
7215 1.1 mrg {
7216 1.1 mrg switch (mode)
7217 1.1 mrg {
7218 1.1 mrg case E_DImode:
7219 1.1 mrg return V2DImode;
7220 1.1 mrg case E_SImode:
7221 1.1 mrg return V2SImode;
7222 1.1 mrg
7223 1.1 mrg default:
7224 1.1 mrg return default_preferred_simd_mode (mode);
7225 1.1 mrg }
7226 1.1 mrg }
7227 1.1 mrg
7228 1.1 mrg unsigned int
7229 1.1 mrg nvptx_data_alignment (const_tree type, unsigned int basic_align)
7230 1.1 mrg {
7231 1.1 mrg if (TREE_CODE (type) == INTEGER_TYPE)
7232 1.1 mrg {
7233 1.1 mrg unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
7234 1.1 mrg if (size == GET_MODE_SIZE (TImode))
7235 1.1 mrg return GET_MODE_BITSIZE (maybe_split_mode (TImode));
7236 1.1 mrg }
7237 1.1 mrg
7238 1.1 mrg return basic_align;
7239 1.1 mrg }
7240 1.1 mrg
7241 1.1 mrg /* Implement TARGET_MODES_TIEABLE_P. */
7242 1.1 mrg
7243 1.1 mrg static bool
7244 1.1 mrg nvptx_modes_tieable_p (machine_mode, machine_mode)
7245 1.1 mrg {
7246 1.1 mrg return false;
7247 1.1 mrg }
7248 1.1 mrg
7249 1.1 mrg /* Implement TARGET_HARD_REGNO_NREGS. */
7250 1.1 mrg
7251 1.1 mrg static unsigned int
7252 1.1 mrg nvptx_hard_regno_nregs (unsigned int, machine_mode)
7253 1.1 mrg {
7254 1.1 mrg return 1;
7255 1.1 mrg }
7256 1.1 mrg
7257 1.1 mrg /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
7258 1.1 mrg
7259 1.1 mrg static bool
7260 1.1 mrg nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
7261 1.1 mrg {
7262 1.1 mrg return false;
7263 1.1 mrg }
7264 1.1 mrg
7265 1.1 mrg /* Implement TARGET_TRULY_NOOP_TRUNCATION. */
7266 1.1 mrg
7267 1.1 mrg static bool
7268 1.1 mrg nvptx_truly_noop_truncation (poly_uint64, poly_uint64)
7269 1.1 mrg {
7270 1.1 mrg return false;
7271 1.1 mrg }
7272 1.1 mrg
7273 1.1 mrg /* Implement TARGET_GOACC_ADJUST_PRIVATE_DECL. */
7274 1.1 mrg
7275 1.1 mrg static tree
7276 1.1 mrg nvptx_goacc_adjust_private_decl (location_t loc, tree decl, int level)
7277 1.1 mrg {
7278 1.1 mrg gcc_checking_assert (!lookup_attribute ("oacc gang-private",
7279 1.1 mrg DECL_ATTRIBUTES (decl)));
7280 1.1 mrg
7281 1.1 mrg /* Set "oacc gang-private" attribute for gang-private variable
7282 1.1 mrg declarations. */
7283 1.1 mrg if (level == GOMP_DIM_GANG)
7284 1.1 mrg {
7285 1.1 mrg tree id = get_identifier ("oacc gang-private");
7286 1.1 mrg /* For later diagnostic purposes, pass LOC as VALUE (wrapped as a
7287 1.1 mrg TREE). */
7288 1.1 mrg tree loc_tree = build_empty_stmt (loc);
7289 1.1 mrg DECL_ATTRIBUTES (decl)
7290 1.1 mrg = tree_cons (id, loc_tree, DECL_ATTRIBUTES (decl));
7291 1.1 mrg }
7292 1.1 mrg
7293 1.1 mrg return decl;
7294 1.1 mrg }
7295 1.1 mrg
7296 1.1 mrg /* Implement TARGET_GOACC_EXPAND_VAR_DECL. */
7297 1.1 mrg
7298 1.1 mrg static rtx
7299 1.1 mrg nvptx_goacc_expand_var_decl (tree var)
7300 1.1 mrg {
7301 1.1 mrg /* Place "oacc gang-private" variables in shared memory. */
7302 1.1 mrg if (tree attr = lookup_attribute ("oacc gang-private",
7303 1.1 mrg DECL_ATTRIBUTES (var)))
7304 1.1 mrg {
7305 1.1 mrg gcc_checking_assert (VAR_P (var));
7306 1.1 mrg
7307 1.1 mrg unsigned int offset, *poffset;
7308 1.1 mrg poffset = gang_private_shared_hmap.get (var);
7309 1.1 mrg if (poffset)
7310 1.1 mrg offset = *poffset;
7311 1.1 mrg else
7312 1.1 mrg {
7313 1.1 mrg unsigned HOST_WIDE_INT align = DECL_ALIGN (var);
7314 1.1 mrg gang_private_shared_size
7315 1.1 mrg = (gang_private_shared_size + align - 1) & ~(align - 1);
7316 1.1 mrg if (gang_private_shared_align < align)
7317 1.1 mrg gang_private_shared_align = align;
7318 1.1 mrg
7319 1.1 mrg offset = gang_private_shared_size;
7320 1.1 mrg bool existed = gang_private_shared_hmap.put (var, offset);
7321 1.1 mrg gcc_checking_assert (!existed);
7322 1.1 mrg gang_private_shared_size += tree_to_uhwi (DECL_SIZE_UNIT (var));
7323 1.1 mrg
7324 1.1 mrg location_t loc = EXPR_LOCATION (TREE_VALUE (attr));
7325 1.1 mrg #if 0 /* For some reason, this doesn't work. */
7326 1.1 mrg if (dump_enabled_p ())
7327 1.1 mrg {
7328 1.1 mrg dump_flags_t l_dump_flags
7329 1.1 mrg = get_openacc_privatization_dump_flags ();
7330 1.1 mrg
7331 1.1 mrg const dump_user_location_t d_u_loc
7332 1.1 mrg = dump_user_location_t::from_location_t (loc);
7333 1.1 mrg /* PR100695 "Format decoder, quoting in 'dump_printf' etc." */
7334 1.1 mrg #if __GNUC__ >= 10
7335 1.1 mrg # pragma GCC diagnostic push
7336 1.1 mrg # pragma GCC diagnostic ignored "-Wformat"
7337 1.1 mrg #endif
7338 1.1 mrg dump_printf_loc (l_dump_flags, d_u_loc,
7339 1.1 mrg "variable %<%T%> adjusted for OpenACC"
7340 1.1 mrg " privatization level: %qs\n",
7341 1.1 mrg var, "gang");
7342 1.1 mrg #if __GNUC__ >= 10
7343 1.1 mrg # pragma GCC diagnostic pop
7344 1.1 mrg #endif
7345 1.1 mrg }
7346 1.1 mrg #else /* ..., thus emulate that, good enough for testsuite usage. */
7347 1.1 mrg if (param_openacc_privatization != OPENACC_PRIVATIZATION_QUIET)
7348 1.1 mrg inform (loc,
7349 1.1 mrg "variable %qD adjusted for OpenACC privatization level:"
7350 1.1 mrg " %qs",
7351 1.1 mrg var, "gang");
7352 1.1 mrg if (dump_file && (dump_flags & TDF_DETAILS))
7353 1.1 mrg {
7354 1.1 mrg /* 'dumpfile.cc:dump_loc' */
7355 1.1 mrg fprintf (dump_file, "%s:%d:%d: ", LOCATION_FILE (loc),
7356 1.1 mrg LOCATION_LINE (loc), LOCATION_COLUMN (loc));
7357 1.1 mrg fprintf (dump_file, "%s: ", "note");
7358 1.1 mrg
7359 1.1 mrg fprintf (dump_file,
7360 1.1 mrg "variable '");
7361 1.1 mrg print_generic_expr (dump_file, var, TDF_SLIM);
7362 1.1 mrg fprintf (dump_file,
7363 1.1 mrg "' adjusted for OpenACC privatization level: '%s'\n",
7364 1.1 mrg "gang");
7365 1.1 mrg }
7366 1.1 mrg #endif
7367 1.1 mrg }
7368 1.1 mrg rtx addr = plus_constant (Pmode, gang_private_shared_sym, offset);
7369 1.1 mrg return gen_rtx_MEM (TYPE_MODE (TREE_TYPE (var)), addr);
7370 1.1 mrg }
7371 1.1 mrg
7372 1.1 mrg return NULL_RTX;
7373 1.1 mrg }
7374 1.1 mrg
7375 1.1 mrg static GTY(()) tree nvptx_previous_fndecl;
7376 1.1 mrg
7377 1.1 mrg static void
7378 1.1 mrg nvptx_set_current_function (tree fndecl)
7379 1.1 mrg {
7380 1.1 mrg if (!fndecl || fndecl == nvptx_previous_fndecl)
7381 1.1 mrg return;
7382 1.1 mrg
7383 1.1 mrg gang_private_shared_hmap.empty ();
7384 1.1 mrg nvptx_previous_fndecl = fndecl;
7385 1.1 mrg vector_red_partition = 0;
7386 1.1 mrg oacc_bcast_partition = 0;
7387 1.1 mrg }
7388 1.1 mrg
7389 1.1 mrg /* Implement TARGET_LIBC_HAS_FUNCTION. */
7390 1.1 mrg
7391 1.1 mrg bool
7392 1.1 mrg nvptx_libc_has_function (enum function_class fn_class, tree type)
7393 1.1 mrg {
7394 1.1 mrg if (fn_class == function_sincos)
7395 1.1 mrg {
7396 1.1 mrg if (type != NULL_TREE)
7397 1.1 mrg /* Currently, newlib does not support sincosl. */
7398 1.1 mrg return type == float_type_node || type == double_type_node;
7399 1.1 mrg else
7400 1.1 mrg return true;
7401 1.1 mrg }
7402 1.1 mrg
7403 1.1 mrg return default_libc_has_function (fn_class, type);
7404 1.1 mrg }
7405 1.1 mrg
7406 1.1 mrg bool
7407 1.1 mrg nvptx_mem_local_p (rtx mem)
7408 1.1 mrg {
7409 1.1 mrg gcc_assert (GET_CODE (mem) == MEM);
7410 1.1 mrg
7411 1.1 mrg struct address_info info;
7412 1.1 mrg decompose_mem_address (&info, mem);
7413 1.1 mrg
7414 1.1 mrg if (info.base != NULL && REG_P (*info.base)
7415 1.1 mrg && REGNO_PTR_FRAME_P (REGNO (*info.base)))
7416 1.1 mrg {
7417 1.1 mrg if (TARGET_SOFT_STACK)
7418 1.1 mrg {
7419 1.1 mrg /* Frame-related doesn't mean local. */
7420 1.1 mrg }
7421 1.1 mrg else
7422 1.1 mrg return true;
7423 1.1 mrg }
7424 1.1 mrg
7425 1.1 mrg return false;
7426 1.1 mrg }
7427 1.1 mrg
7428 1.1 mrg /* Define locally, for use in NVPTX_ASM_OUTPUT_DEF. */
7429 1.1 mrg #define SET_ASM_OP ".alias "
7430 1.1 mrg
7431 1.1 mrg /* Define locally, for use in nvptx_asm_output_def_from_decls. Add NVPTX_
7432 1.1 mrg prefix to avoid clash with ASM_OUTPUT_DEF from nvptx.h.
7433 1.1 mrg Copy of ASM_OUTPUT_DEF from defaults.h, with added terminating
7434 1.1 mrg semicolon. */
7435 1.1 mrg #define NVPTX_ASM_OUTPUT_DEF(FILE,LABEL1,LABEL2) \
7436 1.1 mrg do \
7437 1.1 mrg { \
7438 1.1 mrg fprintf ((FILE), "%s", SET_ASM_OP); \
7439 1.1 mrg assemble_name (FILE, LABEL1); \
7440 1.1 mrg fprintf (FILE, ","); \
7441 1.1 mrg assemble_name (FILE, LABEL2); \
7442 1.1 mrg fprintf (FILE, ";\n"); \
7443 1.1 mrg } \
7444 1.1 mrg while (0)
7445 1.1 mrg
7446 1.1 mrg void
7447 1.1 mrg nvptx_asm_output_def_from_decls (FILE *stream, tree name, tree value)
7448 1.1 mrg {
7449 1.1 mrg if (nvptx_alias == 0 || !TARGET_PTX_6_3)
7450 1.1 mrg {
7451 1.1 mrg /* Copied from assemble_alias. */
7452 1.1 mrg error_at (DECL_SOURCE_LOCATION (name),
7453 1.1 mrg "alias definitions not supported in this configuration");
7454 1.1 mrg TREE_ASM_WRITTEN (name) = 1;
7455 1.1 mrg return;
7456 1.1 mrg }
7457 1.1 mrg
7458 1.1 mrg if (lookup_attribute ("weak", DECL_ATTRIBUTES (name)))
7459 1.1 mrg {
7460 1.1 mrg /* Prevent execution FAILs for gcc.dg/globalalias.c and
7461 1.1 mrg gcc.dg/pr77587.c. */
7462 1.1 mrg error_at (DECL_SOURCE_LOCATION (name),
7463 1.1 mrg "weak alias definitions not supported in this configuration");
7464 1.1 mrg TREE_ASM_WRITTEN (name) = 1;
7465 1.1 mrg return;
7466 1.1 mrg }
7467 1.1 mrg
7468 1.1 mrg /* Ptx also doesn't support value having weak linkage, but we can't detect
7469 1.1 mrg that here, so we'll end up with:
7470 1.1 mrg "error: Function test with .weak scope cannot be aliased".
7471 1.1 mrg See gcc.dg/localalias.c. */
7472 1.1 mrg
7473 1.1 mrg if (TREE_CODE (name) != FUNCTION_DECL)
7474 1.1 mrg {
7475 1.1 mrg error_at (DECL_SOURCE_LOCATION (name),
7476 1.1 mrg "non-function alias definitions not supported"
7477 1.1 mrg " in this configuration");
7478 1.1 mrg TREE_ASM_WRITTEN (name) = 1;
7479 1.1 mrg return;
7480 1.1 mrg }
7481 1.1 mrg
7482 1.1 mrg if (!cgraph_node::get (name)->referred_to_p ())
7483 1.1 mrg /* Prevent "Internal error: reference to deleted section". */
7484 1.1 mrg return;
7485 1.1 mrg
7486 1.1 mrg std::stringstream s;
7487 1.1 mrg write_fn_proto (s, false, get_fnname_from_decl (name), name);
7488 1.1 mrg fputs (s.str ().c_str (), stream);
7489 1.1 mrg
7490 1.1 mrg tree id = DECL_ASSEMBLER_NAME (name);
7491 1.1 mrg NVPTX_ASM_OUTPUT_DEF (stream, IDENTIFIER_POINTER (id),
7492 1.1 mrg IDENTIFIER_POINTER (value));
7493 1.1 mrg }
7494 1.1 mrg
7495 1.1 mrg #undef NVPTX_ASM_OUTPUT_DEF
7496 1.1 mrg #undef SET_ASM_OP
7497 1.1 mrg
/* Target hook overrides.  Each #undef/#define pair below replaces the
   default hook with the nvptx implementation (or a generic hook_* /
   constant stub).  TARGET_INITIALIZER at the bottom of this table
   expands using whatever TARGET_* macros are in effect here, so the
   pairs must all precede the targetm definition.  */
7498 1.1 mrg #undef TARGET_OPTION_OVERRIDE
7499 1.1 mrg #define TARGET_OPTION_OVERRIDE nvptx_option_override
7500 1.1 mrg
7501 1.1 mrg #undef TARGET_ATTRIBUTE_TABLE
7502 1.1 mrg #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
7503 1.1 mrg
/* PTX is a virtual ISA; register allocation (and thus LRA) is disabled,
   see also TARGET_NO_REGISTER_ALLOCATION further down.  */
7504 1.1 mrg #undef TARGET_LRA_P
7505 1.1 mrg #define TARGET_LRA_P hook_bool_void_false
7506 1.1 mrg
7507 1.1 mrg #undef TARGET_LEGITIMATE_ADDRESS_P
7508 1.1 mrg #define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
7509 1.1 mrg
/* Calling-convention hooks: argument passing, return values and call
   expansion.  */
7510 1.1 mrg #undef TARGET_PROMOTE_FUNCTION_MODE
7511 1.1 mrg #define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
7512 1.1 mrg
7513 1.1 mrg #undef TARGET_FUNCTION_ARG
7514 1.1 mrg #define TARGET_FUNCTION_ARG nvptx_function_arg
7515 1.1 mrg #undef TARGET_FUNCTION_INCOMING_ARG
7516 1.1 mrg #define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
7517 1.1 mrg #undef TARGET_FUNCTION_ARG_ADVANCE
7518 1.1 mrg #define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
7519 1.1 mrg #undef TARGET_FUNCTION_ARG_BOUNDARY
7520 1.1 mrg #define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
7521 1.1 mrg #undef TARGET_PASS_BY_REFERENCE
7522 1.1 mrg #define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
7523 1.1 mrg #undef TARGET_FUNCTION_VALUE_REGNO_P
7524 1.1 mrg #define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
7525 1.1 mrg #undef TARGET_FUNCTION_VALUE
7526 1.1 mrg #define TARGET_FUNCTION_VALUE nvptx_function_value
7527 1.1 mrg #undef TARGET_LIBCALL_VALUE
7528 1.1 mrg #define TARGET_LIBCALL_VALUE nvptx_libcall_value
7529 1.1 mrg #undef TARGET_FUNCTION_OK_FOR_SIBCALL
7530 1.1 mrg #define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
7531 1.1 mrg #undef TARGET_GET_DRAP_RTX
7532 1.1 mrg #define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
7533 1.1 mrg #undef TARGET_SPLIT_COMPLEX_ARG
7534 1.1 mrg #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
7535 1.1 mrg #undef TARGET_RETURN_IN_MEMORY
7536 1.1 mrg #define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
7537 1.1 mrg #undef TARGET_OMIT_STRUCT_RETURN_REG
7538 1.1 mrg #define TARGET_OMIT_STRUCT_RETURN_REG true
7539 1.1 mrg #undef TARGET_STRICT_ARGUMENT_NAMING
7540 1.1 mrg #define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
7541 1.1 mrg #undef TARGET_CALL_ARGS
7542 1.1 mrg #define TARGET_CALL_ARGS nvptx_call_args
7543 1.1 mrg #undef TARGET_END_CALL_ARGS
7544 1.1 mrg #define TARGET_END_CALL_ARGS nvptx_end_call_args
7545 1.1 mrg
/* Assembly output hooks: PTX is emitted as text, with bespoke handling
   for declarations, operands and integer data.  */
7546 1.1 mrg #undef TARGET_ASM_FILE_START
7547 1.1 mrg #define TARGET_ASM_FILE_START nvptx_file_start
7548 1.1 mrg #undef TARGET_ASM_FILE_END
7549 1.1 mrg #define TARGET_ASM_FILE_END nvptx_file_end
7550 1.1 mrg #undef TARGET_ASM_GLOBALIZE_LABEL
7551 1.1 mrg #define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
7552 1.1 mrg #undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
7553 1.1 mrg #define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
7554 1.1 mrg #undef TARGET_PRINT_OPERAND
7555 1.1 mrg #define TARGET_PRINT_OPERAND nvptx_print_operand
7556 1.1 mrg #undef TARGET_PRINT_OPERAND_ADDRESS
7557 1.1 mrg #define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
7558 1.1 mrg #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
7559 1.1 mrg #define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
7560 1.1 mrg #undef TARGET_ASM_INTEGER
7561 1.1 mrg #define TARGET_ASM_INTEGER nvptx_assemble_integer
7562 1.1 mrg #undef TARGET_ASM_DECL_END
7563 1.1 mrg #define TARGET_ASM_DECL_END nvptx_assemble_decl_end
7564 1.1 mrg #undef TARGET_ASM_DECLARE_CONSTANT_NAME
7565 1.1 mrg #define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
7566 1.1 mrg #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
7567 1.1 mrg #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
7568 1.1 mrg #undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
7569 1.1 mrg #define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
7570 1.1 mrg
7571 1.1 mrg #undef TARGET_MACHINE_DEPENDENT_REORG
7572 1.1 mrg #define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
/* See also TARGET_LRA_P above: the PTX assembler/driver performs the
   real register allocation.  */
7573 1.1 mrg #undef TARGET_NO_REGISTER_ALLOCATION
7574 1.1 mrg #define TARGET_NO_REGISTER_ALLOCATION true
7575 1.1 mrg
7576 1.1 mrg #undef TARGET_ENCODE_SECTION_INFO
7577 1.1 mrg #define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
7578 1.1 mrg #undef TARGET_RECORD_OFFLOAD_SYMBOL
7579 1.1 mrg #define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
7580 1.1 mrg
7581 1.1 mrg #undef TARGET_VECTOR_ALIGNMENT
7582 1.1 mrg #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
7583 1.1 mrg
7584 1.1 mrg #undef TARGET_CANNOT_COPY_INSN_P
7585 1.1 mrg #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
7586 1.1 mrg
7587 1.1 mrg #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
7588 1.1 mrg #define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
7589 1.1 mrg
/* Target builtin machinery.  */
7590 1.1 mrg #undef TARGET_INIT_BUILTINS
7591 1.1 mrg #define TARGET_INIT_BUILTINS nvptx_init_builtins
7592 1.1 mrg #undef TARGET_EXPAND_BUILTIN
7593 1.1 mrg #define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
7594 1.1 mrg #undef TARGET_BUILTIN_DECL
7595 1.1 mrg #define TARGET_BUILTIN_DECL nvptx_builtin_decl
7596 1.1 mrg
/* OpenMP/OpenACC offloading hooks (SIMT, device kind/arch/isa matching,
   and GOACC dimension/fork-join/reduction handling).  */
7597 1.1 mrg #undef TARGET_SIMT_VF
7598 1.1 mrg #define TARGET_SIMT_VF nvptx_simt_vf
7599 1.1 mrg
7600 1.1 mrg #undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
7601 1.1 mrg #define TARGET_OMP_DEVICE_KIND_ARCH_ISA nvptx_omp_device_kind_arch_isa
7602 1.1 mrg
7603 1.1 mrg #undef TARGET_GOACC_VALIDATE_DIMS
7604 1.1 mrg #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
7605 1.1 mrg
7606 1.1 mrg #undef TARGET_GOACC_DIM_LIMIT
7607 1.1 mrg #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
7608 1.1 mrg
7609 1.1 mrg #undef TARGET_GOACC_FORK_JOIN
7610 1.1 mrg #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
7611 1.1 mrg
7612 1.1 mrg #undef TARGET_GOACC_REDUCTION
7613 1.1 mrg #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
7614 1.1 mrg
7615 1.1 mrg #undef TARGET_CANNOT_FORCE_CONST_MEM
7616 1.1 mrg #define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem
7617 1.1 mrg
/* Mode-support queries.  */
7618 1.1 mrg #undef TARGET_SCALAR_MODE_SUPPORTED_P
7619 1.1 mrg #define TARGET_SCALAR_MODE_SUPPORTED_P nvptx_scalar_mode_supported_p
7620 1.1 mrg
7621 1.1 mrg #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
7622 1.1 mrg #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
7623 1.1 mrg nvptx_libgcc_floating_mode_supported_p
7624 1.1 mrg
7625 1.1 mrg #undef TARGET_VECTOR_MODE_SUPPORTED_P
7626 1.1 mrg #define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported
7627 1.1 mrg
7628 1.1 mrg #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
7629 1.1 mrg #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
7630 1.1 mrg nvptx_preferred_simd_mode
7631 1.1 mrg
7632 1.1 mrg #undef TARGET_MODES_TIEABLE_P
7633 1.1 mrg #define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p
7634 1.1 mrg
7635 1.1 mrg #undef TARGET_HARD_REGNO_NREGS
7636 1.1 mrg #define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs
7637 1.1 mrg
7638 1.1 mrg #undef TARGET_CAN_CHANGE_MODE_CLASS
7639 1.1 mrg #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
7640 1.1 mrg
7641 1.1 mrg #undef TARGET_TRULY_NOOP_TRUNCATION
7642 1.1 mrg #define TARGET_TRULY_NOOP_TRUNCATION nvptx_truly_noop_truncation
7643 1.1 mrg
7644 1.1 mrg #undef TARGET_HAVE_SPECULATION_SAFE_VALUE
7645 1.1 mrg #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
7646 1.1 mrg
7647 1.1 mrg #undef TARGET_GOACC_ADJUST_PRIVATE_DECL
7648 1.1 mrg #define TARGET_GOACC_ADJUST_PRIVATE_DECL nvptx_goacc_adjust_private_decl
7649 1.1 mrg
7650 1.1 mrg #undef TARGET_GOACC_EXPAND_VAR_DECL
7651 1.1 mrg #define TARGET_GOACC_EXPAND_VAR_DECL nvptx_goacc_expand_var_decl
7652 1.1 mrg
7653 1.1 mrg #undef TARGET_SET_CURRENT_FUNCTION
7654 #define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function
7655
7656 #undef TARGET_LIBC_HAS_FUNCTION
7657 #define TARGET_LIBC_HAS_FUNCTION nvptx_libc_has_function
7658
/* The target hook vector itself; TARGET_INITIALIZER fills every slot,
   picking up the overrides defined above and defaults elsewhere.  */
7659 struct gcc_target targetm = TARGET_INITIALIZER;
7660
7661 #include "gt-nvptx.h"
7662