1 /* Plugin for AMD GCN execution.
2
3 Copyright (C) 2013-2024 Free Software Foundation, Inc.
4
5 Contributed by Mentor Embedded
6
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
9
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
14
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
19
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
23
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
28
29 /* {{{ Includes and defines */
30
31 #include "config.h"
32 #include "symcat.h"
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <pthread.h>
37 #include <inttypes.h>
38 #include <stdbool.h>
39 #include <limits.h>
40 #include <hsa.h>
41 #include <hsa_ext_amd.h>
42 #include <dlfcn.h>
43 #include <signal.h>
44 #include "libgomp-plugin.h"
45 #include "config/gcn/libgomp-gcn.h" /* For struct output. */
46 #include "gomp-constants.h"
47 #include <elf.h>
48 #include "oacc-plugin.h"
49 #include "oacc-int.h"
50 #include <assert.h>
51
52 /* These probably won't be in elf.h for a while. */
53 #ifndef R_AMDGPU_NONE
54 #define R_AMDGPU_NONE 0
55 #define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */
56 #define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */
57 #define R_AMDGPU_ABS64 3 /* S + A */
58 #define R_AMDGPU_REL32 4 /* S + A - P */
59 #define R_AMDGPU_REL64 5 /* S + A - P */
60 #define R_AMDGPU_ABS32 6 /* S + A */
61 #define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */
62 #define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */
63 #define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */
64 #define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */
65 #define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */
66 #define R_AMDGPU_RELATIVE64 13 /* B + A */
67 #endif
68
69 /* GCN specific definitions for asynchronous queues. */
70
71 #define ASYNC_QUEUE_SIZE 64
72 #define DRAIN_QUEUE_SYNCHRONOUS_P false
73 #define DEBUG_QUEUES 0
74 #define DEBUG_THREAD_SLEEP 0
75 #define DEBUG_THREAD_SIGNAL 0
76
77 /* Defaults. */
78 #define DEFAULT_GCN_HEAP_SIZE (100*1024*1024) /* 100MB. */
79
80 /* Secure getenv() which returns NULL if running as SUID/SGID. */
81 #ifndef HAVE_SECURE_GETENV
82 #ifdef HAVE___SECURE_GETENV
83 #define secure_getenv __secure_getenv
84 #elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
85 && defined(HAVE_GETGID) && defined(HAVE_GETEGID)
86
87 #include <unistd.h>
88
89 /* Implementation of secure_getenv() for targets where it is not provided but
90 we at least have the means to test real and effective IDs. */
91
92 static char *
93 secure_getenv (const char *name)
94 {
95 if ((getuid () == geteuid ()) && (getgid () == getegid ()))
96 return getenv (name);
97 else
98 return NULL;
99 }
100
101 #else
102 #define secure_getenv getenv
103 #endif
104 #endif
105
106 /* }}} */
107 /* {{{ Types */
108
109 /* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data. */
110
111 struct gcn_thread
112 {
113 /* The thread number from the async clause, or GOMP_ASYNC_SYNC. */
114 int async;
115 };
116
117 /* As the HSA runtime is dlopened, the following structure defines the
118 function pointers utilized by this plug-in. */
119
120 struct hsa_runtime_fn_info
121 {
122 /* HSA runtime. */
123 hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
124 const char **status_string);
125 hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
126 void *value);
127 hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
128 hsa_agent_info_t attribute,
129 void *value);
130 hsa_status_t (*hsa_isa_get_info_fn)(hsa_isa_t isa,
131 hsa_isa_info_t attribute,
132 uint32_t index,
133 void *value);
134 hsa_status_t (*hsa_init_fn) (void);
135 hsa_status_t (*hsa_iterate_agents_fn)
136 (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
137 hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
138 hsa_region_info_t attribute,
139 void *value);
140 hsa_status_t (*hsa_queue_create_fn)
141 (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
142 void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
143 void *data, uint32_t private_segment_size,
144 uint32_t group_segment_size, hsa_queue_t **queue);
145 hsa_status_t (*hsa_agent_iterate_regions_fn)
146 (hsa_agent_t agent,
147 hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
148 hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
149 hsa_status_t (*hsa_executable_create_fn)
150 (hsa_profile_t profile, hsa_executable_state_t executable_state,
151 const char *options, hsa_executable_t *executable);
152 hsa_status_t (*hsa_executable_global_variable_define_fn)
153 (hsa_executable_t executable, const char *variable_name, void *address);
154 hsa_status_t (*hsa_executable_load_code_object_fn)
155 (hsa_executable_t executable, hsa_agent_t agent,
156 hsa_code_object_t code_object, const char *options);
157 hsa_status_t (*hsa_executable_freeze_fn)(hsa_executable_t executable,
158 const char *options);
159 hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
160 uint32_t num_consumers,
161 const hsa_agent_t *consumers,
162 hsa_signal_t *signal);
163 hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
164 void **ptr);
165 hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
166 hsa_access_permission_t access);
167 hsa_status_t (*hsa_memory_copy_fn)(void *dst, const void *src, size_t size);
168 hsa_status_t (*hsa_memory_free_fn) (void *ptr);
169 hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
170 hsa_status_t (*hsa_executable_get_symbol_fn)
171 (hsa_executable_t executable, const char *module_name,
172 const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
173 hsa_executable_symbol_t *symbol);
174 hsa_status_t (*hsa_executable_symbol_get_info_fn)
175 (hsa_executable_symbol_t executable_symbol,
176 hsa_executable_symbol_info_t attribute, void *value);
177 hsa_status_t (*hsa_executable_iterate_symbols_fn)
178 (hsa_executable_t executable,
179 hsa_status_t (*callback)(hsa_executable_t executable,
180 hsa_executable_symbol_t symbol, void *data),
181 void *data);
182 uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
183 uint64_t value);
184 uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
185 void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
186 hsa_signal_value_t value);
187 void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
188 hsa_signal_value_t value);
189 hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
190 (hsa_signal_t signal, hsa_signal_condition_t condition,
191 hsa_signal_value_t compare_value, uint64_t timeout_hint,
192 hsa_wait_state_t wait_state_hint);
193 hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
194 hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
195
196 hsa_status_t (*hsa_code_object_deserialize_fn)
197 (void *serialized_code_object, size_t serialized_code_object_size,
198 const char *options, hsa_code_object_t *code_object);
199 hsa_status_t (*hsa_amd_memory_lock_fn)
200 (void *host_ptr, size_t size, hsa_agent_t *agents, int num_agent,
201 void **agent_ptr);
202 hsa_status_t (*hsa_amd_memory_unlock_fn) (void *host_ptr);
203 hsa_status_t (*hsa_amd_memory_async_copy_rect_fn)
204 (const hsa_pitched_ptr_t *dst, const hsa_dim3_t *dst_offset,
205 const hsa_pitched_ptr_t *src, const hsa_dim3_t *src_offset,
206 const hsa_dim3_t *range, hsa_agent_t copy_agent,
207 hsa_amd_copy_direction_t dir, uint32_t num_dep_signals,
208 const hsa_signal_t *dep_signals, hsa_signal_t completion_signal);
209 };
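/* Each of these pointers is resolved with dlsym in init_hsa_runtime_functions
   below, and the runtime is then always called through this table, e.g.
   (illustrative call only):

     status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME, buf);
 */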
210
211 /* Structure describing the run-time and grid properties of an HSA kernel
212 launch. This needs to match the format passed to GOMP_OFFLOAD_run. */
213
214 struct GOMP_kernel_launch_attributes
215 {
216 /* Number of dimensions the workload has. Maximum number is 3. */
217 uint32_t ndim;
218 /* Size of the grid in the three respective dimensions. */
219 uint32_t gdims[3];
220 /* Size of work-groups in the respective dimensions. */
221 uint32_t wdims[3];
222 };
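/* Purely for illustration (these numbers are examples, not prescriptions): a
   launch of 4 teams, each with 16 wavefronts of 64 work items, could be
   described as

     struct GOMP_kernel_launch_attributes kla;
     kla.ndim = 3;
     kla.gdims[0] = 4;  kla.gdims[1] = 64;  kla.gdims[2] = 16;
     kla.wdims[0] = 1;  kla.wdims[1] = 64;  kla.wdims[2] = 16;

   parse_target_attributes below builds an equivalent layout when the compiler
   does not pass explicit grid attributes.  */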
223
224 /* Collection of information needed for a dispatch of a kernel from a
225 kernel. */
226
227 struct kernel_dispatch
228 {
229 struct agent_info *agent;
230 /* Pointer to a command queue associated with a kernel dispatch agent. */
231 void *queue;
232 /* Pointer to a memory space used for kernel arguments passing. */
233 void *kernarg_address;
234 /* Kernel object. */
235 uint64_t object;
236 /* Synchronization signal used for dispatch synchronization. */
237 uint64_t signal;
238 /* Private segment size. */
239 uint32_t private_segment_size;
240 /* Group segment size. */
241 uint32_t group_segment_size;
242 };
243
244 /* Structure of the kernargs segment, supporting console output.
245
246 This needs to match the definitions in Newlib, and the expectations
247 in libgomp target code. */
248
249 struct kernargs {
250 struct kernargs_abi abi;
251
252 /* Output data. */
253 struct output output_data;
254 };
255
256 /* A queue entry for a future asynchronous launch. */
257
258 struct kernel_launch
259 {
260 struct kernel_info *kernel;
261 void *vars;
262 struct GOMP_kernel_launch_attributes kla;
263 };
264
265 /* A queue entry for a future callback. */
266
267 struct callback
268 {
269 void (*fn)(void *);
270 void *data;
271 };
272
273 /* A data struct for the copy_data callback. */
274
275 struct copy_data
276 {
277 void *dst;
278 const void *src;
279 size_t len;
280 struct goacc_asyncqueue *aq;
281 };
282
283 /* A queue entry for a placeholder. These correspond to a wait event. */
284
285 struct placeholder
286 {
287 int executed;
288 pthread_cond_t cond;
289 pthread_mutex_t mutex;
290 };
291
292 /* A queue entry for a wait directive. */
293
294 struct asyncwait_info
295 {
296 struct placeholder *placeholderp;
297 };
298
299 /* Encode the type of an entry in an async queue. */
300
301 enum entry_type
302 {
303 KERNEL_LAUNCH,
304 CALLBACK,
305 ASYNC_WAIT,
306 ASYNC_PLACEHOLDER
307 };
308
309 /* An entry in an async queue. */
310
311 struct queue_entry
312 {
313 enum entry_type type;
314 union {
315 struct kernel_launch launch;
316 struct callback callback;
317 struct asyncwait_info asyncwait;
318 struct placeholder placeholder;
319 } u;
320 };
321
322 /* An async queue header.
323
324 OpenMP may create one of these.
325 OpenACC may create many. */
326
327 struct goacc_asyncqueue
328 {
329 struct agent_info *agent;
330 hsa_queue_t *hsa_queue;
331
332 pthread_t thread_drain_queue;
333 pthread_mutex_t mutex;
334 pthread_cond_t queue_cond_in;
335 pthread_cond_t queue_cond_out;
336 struct queue_entry queue[ASYNC_QUEUE_SIZE];
337 int queue_first;
338 int queue_n;
339 int drain_queue_stop;
340
341 int id;
342 struct goacc_asyncqueue *prev;
343 struct goacc_asyncqueue *next;
344 };
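/* The queue above is a fixed-size ring buffer: entries are appended at index
   (queue_first + queue_n) % ASYNC_QUEUE_SIZE and consumed from queue_first by
   the drain thread.  A simplified, illustrative sketch of the producer side
   (the real enqueue helpers appear later in this file):

     pthread_mutex_lock (&aq->mutex);
     while (aq->queue_n == ASYNC_QUEUE_SIZE)
       pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
     int slot = (aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE;
     aq->queue[slot].type = KERNEL_LAUNCH;
     ...fill in aq->queue[slot].u.launch...
     aq->queue_n++;
     pthread_cond_signal (&aq->queue_cond_in);
     pthread_mutex_unlock (&aq->mutex);  */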
345
346 /* Mkoffload uses this structure to describe a kernel.
347
348 OpenMP kernel dimensions are passed at runtime.
349 OpenACC kernel dimensions are passed at compile time, here. */
350
351 struct hsa_kernel_description
352 {
353 const char *name;
354 int oacc_dims[3]; /* Only present for GCN kernels. */
355 int sgpr_count;
356 int vpgr_count;
357 };
358
359 /* Mkoffload uses this structure to describe an offload variable. */
360
361 struct global_var_info
362 {
363 const char *name;
364 void *address;
365 };
366
367 /* Mkoffload uses this structure to describe all the kernels in a
368 loadable module. These are passed to libgomp via static constructors. */
369
370 struct gcn_image_desc
371 {
372 struct gcn_image {
373 size_t size;
374 void *image;
375 } *gcn_image;
376 const unsigned kernel_count;
377 struct hsa_kernel_description *kernel_infos;
378 const unsigned ind_func_count;
379 const unsigned global_variable_count;
380 };
381
382 /* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
383 support.
384 See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table */
385
386 typedef enum {
387 EF_AMDGPU_MACH_UNSUPPORTED = -1,
388 EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
389 EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
390 EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
391 EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030,
392 EF_AMDGPU_MACH_AMDGCN_GFX90a = 0x03f,
393 EF_AMDGPU_MACH_AMDGCN_GFX90c = 0x032,
394 EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036,
395 EF_AMDGPU_MACH_AMDGCN_GFX1036 = 0x045,
396 EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041,
397 EF_AMDGPU_MACH_AMDGCN_GFX1103 = 0x044
398 } EF_AMDGPU_MACH;
399
400 const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
401 typedef EF_AMDGPU_MACH gcn_isa;
402
403 /* Description of an HSA GPU agent (device) and the program associated with
404 it. */
405
406 struct agent_info
407 {
408 /* The HSA ID of the agent. Assigned when hsa_context is initialized. */
409 hsa_agent_t id;
410 /* The user-visible device number. */
411 int device_id;
412 /* Whether the agent has been initialized. The fields below are usable only
413 if it has been. */
414 bool initialized;
415
416 /* The instruction set architecture of the device. */
417 gcn_isa device_isa;
418 /* Name of the agent. */
419 char name[64];
420 /* Name of the vendor of the agent. */
421 char vendor_name[64];
422 /* Command queues of the agent. */
423 hsa_queue_t *sync_queue;
424 struct goacc_asyncqueue *async_queues, *omp_async_queue;
425 pthread_mutex_t async_queues_mutex;
426
427 /* The HSA memory region from which to allocate kernel arguments. */
428 hsa_region_t kernarg_region;
429
430 /* The HSA memory region from which to allocate device data. */
431 hsa_region_t data_region;
432
433 /* Allocated ephemeral memories (team arena and stack space). */
434 struct ephemeral_memories_list *ephemeral_memories_list;
435 pthread_mutex_t ephemeral_memories_write_lock;
436
437 /* Read-write lock that protects kernels which are running or about to be run
438 from interference with loading and unloading of images. Needs to be
439 locked for reading while a kernel is being run, and for writing if the
440 list of modules is manipulated (and thus the HSA program invalidated). */
441 pthread_rwlock_t module_rwlock;
442
443 /* The module associated with this agent. */
444 struct module_info *module;
445
446 /* Mutex enforcing that only one thread will finalize the HSA program. A
447 thread should have locked agent->module_rwlock for reading before
448 acquiring it. */
449 pthread_mutex_t prog_mutex;
450 /* Flag whether the HSA program that consists of all the modules has been
451 finalized. */
452 bool prog_finalized;
453 /* HSA executable - the finalized program that is used to locate kernels. */
454 hsa_executable_t executable;
455 };
456
457 /* Information required to identify, finalize and run any given kernel. */
458
459 enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};
460
461 struct kernel_info
462 {
463 /* Name of the kernel, required to locate it within the GCN object-code
464 module. */
465 const char *name;
466 /* The specific agent the kernel has been or will be finalized for and run
467 on. */
468 struct agent_info *agent;
469 /* The specific module in which the kernel is defined. */
470 struct module_info *module;
471 /* Information provided by mkoffload associated with the kernel. */
472 struct hsa_kernel_description *description;
473 /* Mutex enforcing that at most one thread ever initializes a kernel for
474 use. A thread should have locked agent->module_rwlock for reading before
475 acquiring it. */
476 pthread_mutex_t init_mutex;
477 /* Flag indicating whether the kernel has been initialized and all fields
478 below it contain valid data. */
479 bool initialized;
480 /* Flag indicating that the kernel has a problem that blocks an execution. */
481 bool initialization_failed;
482 /* The object to be put into the dispatch queue. */
483 uint64_t object;
484 /* Required size of kernel arguments. */
485 uint32_t kernarg_segment_size;
486 /* Required size of group segment. */
487 uint32_t group_segment_size;
488 /* Required size of private segment. */
489 uint32_t private_segment_size;
490 /* Set up for OpenMP or OpenACC? */
491 enum offload_kind kind;
492 };
493
494 /* Information about a particular GCN module, its image and kernels. */
495
496 struct module_info
497 {
498 /* The description with which the program has registered the image. */
499 struct gcn_image_desc *image_desc;
500 /* GCN heap allocation. */
501 struct heap *heap;
502 /* Physical boundaries of the loaded module. */
503 Elf64_Addr phys_address_start;
504 Elf64_Addr phys_address_end;
505
506 bool constructors_run_p;
507 struct kernel_info *init_array_func, *fini_array_func;
508
509 /* Number of kernels in this module. */
510 int kernel_count;
511 /* An array of kernel_info structures describing each kernel in this
512 module. */
513 struct kernel_info kernels[];
514 };
515
516 /* A linked list of memory arenas allocated on the device.
517 These are used by OpenMP, as a means to optimize per-team malloc,
518 and for host-accessible stack space. */
519
520 struct ephemeral_memories_list
521 {
522 struct ephemeral_memories_list *next;
523
524 /* The size is determined by the number of teams and threads. */
525 size_t size;
526 /* The device address of the allocated memory. */
527 void *address;
528 /* A flag to prevent two asynchronous kernels trying to use the same memory.
529 The mutex is locked until the kernel exits. */
530 pthread_mutex_t in_use;
531 };
532
533 /* Information about the whole HSA environment and all of its agents. */
534
535 struct hsa_context_info
536 {
537 /* Whether the structure has been initialized. */
538 bool initialized;
539 /* Number of usable GPU HSA agents in the system. */
540 int agent_count;
541 /* Array of agent_info structures describing the individual HSA agents. */
542 struct agent_info *agents;
543 /* Driver version string. */
544 char driver_version_s[30];
545 };
546
547 /* }}} */
548 /* {{{ Global variables */
549
550 /* Information about the whole HSA environment and all of its agents. */
551
552 static struct hsa_context_info hsa_context;
553
554 /* HSA runtime functions that are initialized in init_hsa_context. */
555
556 static struct hsa_runtime_fn_info hsa_fns;
557
558 /* Heap space, allocated target-side, provided for use of newlib malloc.
559 Each module should have its own heap allocated.
560 Beware that heap usage increases with OpenMP teams. See also arenas. */
561
562 static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;
563
564 /* Ephemeral memory sizes for each kernel launch. */
565
566 static int team_arena_size = DEFAULT_TEAM_ARENA_SIZE;
567 static int stack_size = DEFAULT_GCN_STACK_SIZE;
568 static int lowlat_size = -1;
569
570 /* Flag to decide whether to print information about what is going on to stderr.
571 Set in init_environment_variables depending on environment variables. */
572
573 static bool debug;
574
575 /* Flag to decide if the runtime should suppress a possible fallback to host
576 execution. */
577
578 static bool suppress_host_fallback;
579
580 /* Name of the HSA runtime shared library that is dlopened
581 by this plug-in. */
582
583 static const char *hsa_runtime_lib;
584
585 /* Flag to decide whether the runtime should also support CPU devices (which
586 can be a simulator). */
587
588 static bool support_cpu_devices;
589
590 /* Runtime dimension overrides. Zero indicates default. */
591
592 static int override_x_dim = 0;
593 static int override_z_dim = 0;
594
595 /* }}} */
596 /* {{{ Debug & Diagnostic */
597
598 /* Print a message to stderr if GCN_DEBUG value is set to true. */
599
600 #define DEBUG_PRINT(...) \
601 do \
602 { \
603 if (debug) \
604 { \
605 fprintf (stderr, __VA_ARGS__); \
606 } \
607 } \
608 while (false);
609
610 /* Flush stderr if GCN_DEBUG value is set to true. */
611
612 #define DEBUG_FLUSH() \
613 do { \
614 if (debug) \
615 fflush (stderr); \
616 } while (false)
617
618 /* Print a logging message with PREFIX to stderr if GCN_DEBUG value
619 is set to true. */
620
621 #define DEBUG_LOG(prefix, ...) \
622 do \
623 { \
624 DEBUG_PRINT (prefix); \
625 DEBUG_PRINT (__VA_ARGS__); \
626 DEBUG_FLUSH (); \
627 } while (false)
628
629 /* Print a debugging message to stderr. */
630
631 #define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)
632
633 /* Print a warning message to stderr. */
634
635 #define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
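/* For example (illustrative only), a call such as
     GCN_DEBUG ("Loading module with %d kernels\n", kernel_count);
   prints the prefixed message and flushes stderr, but only when the debug
   flag has been enabled via the GCN_DEBUG environment variable.  */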
636
637 /* Print HSA warning STR with an HSA STATUS code. */
638
639 static void
640 hsa_warn (const char *str, hsa_status_t status)
641 {
642 if (!debug)
643 return;
644
645 const char *hsa_error_msg = "[unknown]";
646 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
647
648 fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
649 hsa_error_msg);
650 }
651
652 /* Report a fatal error STR together with the HSA error corresponding to STATUS
653 and terminate execution of the current process. */
654
655 static void
656 hsa_fatal (const char *str, hsa_status_t status)
657 {
658 const char *hsa_error_msg = "[unknown]";
659 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
660 GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
661 hsa_error_msg);
662 }
663
664 /* Like hsa_fatal, except it only reports the error message and returns FALSE,
665 so that error handling can propagate to outside the plugin. */
666
667 static bool
668 hsa_error (const char *str, hsa_status_t status)
669 {
670 const char *hsa_error_msg = "[unknown]";
671 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
672 GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
673 hsa_error_msg);
674 return false;
675 }
676
677 /* Dump information about the available hardware. */
678
679 static void
680 dump_hsa_system_info (void)
681 {
682 hsa_status_t status;
683
684 hsa_endianness_t endianness;
685 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
686 &endianness);
687 if (status == HSA_STATUS_SUCCESS)
688 switch (endianness)
689 {
690 case HSA_ENDIANNESS_LITTLE:
691 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
692 break;
693 case HSA_ENDIANNESS_BIG:
694 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
695 break;
696 default:
697 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
698 }
699 else
700 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");
701
702 uint8_t extensions[128];
703 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
704 &extensions);
705 if (status == HSA_STATUS_SUCCESS)
706 {
707 if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
708 GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
709 }
710 else
711 GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
712 }
713
714 /* Dump information about the available hardware. */
715
716 static void
717 dump_machine_model (hsa_machine_model_t machine_model, const char *s)
718 {
719 switch (machine_model)
720 {
721 case HSA_MACHINE_MODEL_SMALL:
722 GCN_DEBUG ("%s: SMALL\n", s);
723 break;
724 case HSA_MACHINE_MODEL_LARGE:
725 GCN_DEBUG ("%s: LARGE\n", s);
726 break;
727 default:
728 GCN_WARNING ("%s: UNKNOWN\n", s);
729 break;
730 }
731 }
732
733 /* Dump information about the available hardware. */
734
735 static void
736 dump_profile (hsa_profile_t profile, const char *s)
737 {
738 switch (profile)
739 {
740 case HSA_PROFILE_FULL:
741 GCN_DEBUG ("%s: FULL\n", s);
742 break;
743 case HSA_PROFILE_BASE:
744 GCN_DEBUG ("%s: BASE\n", s);
745 break;
746 default:
747 GCN_WARNING ("%s: UNKNOWN\n", s);
748 break;
749 }
750 }
751
752 /* Dump information about a device memory region. */
753
754 static hsa_status_t
755 dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
756 {
757 hsa_status_t status;
758
759 hsa_region_segment_t segment;
760 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
761 &segment);
762 if (status == HSA_STATUS_SUCCESS)
763 {
764 if (segment == HSA_REGION_SEGMENT_GLOBAL)
765 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
766 else if (segment == HSA_REGION_SEGMENT_READONLY)
767 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
768 else if (segment == HSA_REGION_SEGMENT_PRIVATE)
769 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
770 else if (segment == HSA_REGION_SEGMENT_GROUP)
771 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
772 else
773 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
774 }
775 else
776 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");
777
778 if (segment == HSA_REGION_SEGMENT_GLOBAL)
779 {
780 uint32_t flags;
781 status
782 = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
783 &flags);
784 if (status == HSA_STATUS_SUCCESS)
785 {
786 if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
787 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
788 if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
789 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
790 if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
791 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
792 }
793 else
794 GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
795 }
796
797 size_t size;
798 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
799 if (status == HSA_STATUS_SUCCESS)
800 GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
801 else
802 GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");
803
804 status
805 = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
806 &size);
807 if (status == HSA_STATUS_SUCCESS)
808 GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
809 else
810 GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
811
812 bool alloc_allowed;
813 status
814 = hsa_fns.hsa_region_get_info_fn (region,
815 HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
816 &alloc_allowed);
817 if (status == HSA_STATUS_SUCCESS)
818 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
819 else
820 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
821
822 if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
823 return HSA_STATUS_SUCCESS;
824
825 status
826 = hsa_fns.hsa_region_get_info_fn (region,
827 HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
828 &size);
829 if (status == HSA_STATUS_SUCCESS)
830 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
831 else
832 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
833
834 size_t align;
835 status
836 = hsa_fns.hsa_region_get_info_fn (region,
837 HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
838 &align);
839 if (status == HSA_STATUS_SUCCESS)
840 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
841 else
842 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
843
844 return HSA_STATUS_SUCCESS;
845 }
846
847 /* Dump information about all the device memory regions. */
848
849 static void
850 dump_hsa_regions (hsa_agent_t agent)
851 {
852 hsa_status_t status;
853 status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
854 dump_hsa_region,
855 NULL);
856 if (status != HSA_STATUS_SUCCESS)
857 hsa_error ("Dumping hsa regions failed", status);
858 }
859
860 /* Dump information about the available devices. */
861
862 static hsa_status_t
863 dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
864 {
865 hsa_status_t status;
866
867 char buf[64];
868 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
869 &buf);
870 if (status == HSA_STATUS_SUCCESS)
871 GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
872 else
873 GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");
874
875 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
876 &buf);
877 if (status == HSA_STATUS_SUCCESS)
878 GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
879 else
880 GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");
881
882 hsa_machine_model_t machine_model;
883 status
884 = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
885 &machine_model);
886 if (status == HSA_STATUS_SUCCESS)
887 dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
888 else
889 GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");
890
891 hsa_profile_t profile;
892 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
893 &profile);
894 if (status == HSA_STATUS_SUCCESS)
895 dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
896 else
897 GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");
898
899 hsa_device_type_t device_type;
900 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
901 &device_type);
902 if (status == HSA_STATUS_SUCCESS)
903 {
904 switch (device_type)
905 {
906 case HSA_DEVICE_TYPE_CPU:
907 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
908 break;
909 case HSA_DEVICE_TYPE_GPU:
910 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
911 break;
912 case HSA_DEVICE_TYPE_DSP:
913 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
914 break;
915 default:
916 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
917 break;
918 }
919 }
920 else
921 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");
922
923 uint32_t cu_count;
924 status = hsa_fns.hsa_agent_get_info_fn
925 (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
926 if (status == HSA_STATUS_SUCCESS)
927 GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
928 else
929 GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");
930
931 uint32_t size;
932 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
933 &size);
934 if (status == HSA_STATUS_SUCCESS)
935 GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
936 else
937 GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");
938
939 uint32_t max_dim;
940 status = hsa_fns.hsa_agent_get_info_fn (agent,
941 HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
942 &max_dim);
943 if (status == HSA_STATUS_SUCCESS)
944 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
945 else
946 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");
947
948 uint32_t max_size;
949 status = hsa_fns.hsa_agent_get_info_fn (agent,
950 HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
951 &max_size);
952 if (status == HSA_STATUS_SUCCESS)
953 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
954 else
955 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");
956
957 uint32_t grid_max_dim;
958 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
959 &grid_max_dim);
960 if (status == HSA_STATUS_SUCCESS)
961 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
962 else
963 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");
964
965 uint32_t grid_max_size;
966 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
967 &grid_max_size);
968 if (status == HSA_STATUS_SUCCESS)
969 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
970 else
971 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");
972
973 dump_hsa_regions (agent);
974
975 return HSA_STATUS_SUCCESS;
976 }
977
978 /* Forward reference. */
979
980 static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);
981
982 /* Helper function for dump_executable_symbols. */
983
984 static hsa_status_t
985 dump_executable_symbol (hsa_executable_t executable,
986 hsa_executable_symbol_t symbol,
987 void *data __attribute__((unused)))
988 {
989 char *name = get_executable_symbol_name (symbol);
990
991 if (name)
992 {
993 GCN_DEBUG ("executable symbol: %s\n", name);
994 free (name);
995 }
996
997 return HSA_STATUS_SUCCESS;
998 }
999
1000 /* Dump all global symbols in an executable. */
1001
1002 static void
1003 dump_executable_symbols (hsa_executable_t executable)
1004 {
1005 hsa_status_t status;
1006 status
1007 = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
1008 dump_executable_symbol,
1009 NULL);
1010 if (status != HSA_STATUS_SUCCESS)
1011 hsa_fatal ("Could not dump HSA executable symbols", status);
1012 }
1013
1014 /* Dump kernel DISPATCH data structure and indent it by INDENT spaces. */
1015
1016 static void
1017 print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
1018 {
1019 struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;
1020
1021 fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
1022 fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
1023 fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
1024 fprintf (stderr, "%*sheap address: %p\n", indent, "",
1025 (void*)kernargs->abi.heap_ptr);
1026 fprintf (stderr, "%*sarena address: %p (%d bytes per workgroup)\n", indent,
1027 "", (void*)kernargs->abi.arena_ptr,
1028 kernargs->abi.arena_size_per_team);
1029 fprintf (stderr, "%*sstack address: %p (%d bytes per wavefront)\n", indent,
1030 "", (void*)kernargs->abi.stack_ptr,
1031 kernargs->abi.stack_size_per_thread);
1032 fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
1033 fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
1034 dispatch->private_segment_size);
1035 fprintf (stderr, "%*sgroup_segment_size: %u (low-latency pool)\n", indent,
1036 "", dispatch->group_segment_size);
1037 fprintf (stderr, "\n");
1038 }
1039
1040 /* }}} */
1041 /* {{{ Utility functions */
1042
1043 /* Cast the thread local storage to gcn_thread. */
1044
1045 static inline struct gcn_thread *
1046 gcn_thread (void)
1047 {
1048 return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
1049 }
1050
1051 /* Initialize debug and suppress_host_fallback according to the environment. */
1052
1053 static void
1054 init_environment_variables (void)
1055 {
1056 if (secure_getenv ("GCN_DEBUG"))
1057 debug = true;
1058 else
1059 debug = false;
1060
1061 if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
1062 suppress_host_fallback = true;
1063 else
1064 suppress_host_fallback = false;
1065
1066 hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
1067 if (hsa_runtime_lib == NULL)
1068 hsa_runtime_lib = "libhsa-runtime64.so.1";
1069
1070 support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
1071
1072 const char *x = secure_getenv ("GCN_NUM_TEAMS");
1073 if (!x)
1074 x = secure_getenv ("GCN_NUM_GANGS");
1075 if (x)
1076 override_x_dim = atoi (x);
1077
1078 const char *z = secure_getenv ("GCN_NUM_THREADS");
1079 if (!z)
1080 z = secure_getenv ("GCN_NUM_WORKERS");
1081 if (z)
1082 override_z_dim = atoi (z);
1083
1084 const char *heap = secure_getenv ("GCN_HEAP_SIZE");
1085 if (heap)
1086 {
1087 size_t tmp = atol (heap);
1088 if (tmp)
1089 gcn_kernel_heap_size = tmp;
1090 }
1091
1092 const char *arena = secure_getenv ("GCN_TEAM_ARENA_SIZE");
1093 if (arena)
1094 {
1095 int tmp = atoi (arena);
1096 if (tmp)
1097 team_arena_size = tmp;
1098 }
1099
1100 const char *stack = secure_getenv ("GCN_STACK_SIZE");
1101 if (stack)
1102 {
1103 int tmp = atoi (stack);
1104 if (tmp)
1105 stack_size = tmp;
1106 }
1107
1108 const char *lowlat = secure_getenv ("GOMP_GCN_LOWLAT_POOL");
1109 if (lowlat)
1110 lowlat_size = atoi (lowlat);
1111 }
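/* As an illustration (values are examples, not recommendations), a run might
   be configured from the shell as

     GCN_DEBUG=1 GCN_NUM_TEAMS=8 GCN_NUM_THREADS=4 \
     GCN_HEAP_SIZE=268435456 GCN_STACK_SIZE=131072 ./a.out

   Sizes are in bytes; unset, zero or unparsable values leave the defaults in
   place.  */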
1112
1113 /* Return malloc'd string with name of SYMBOL. */
1114
1115 static char *
1116 get_executable_symbol_name (hsa_executable_symbol_t symbol)
1117 {
1118 hsa_status_t status;
1119 char *res;
1120 uint32_t len;
1121 const hsa_executable_symbol_info_t info_name_length
1122 = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;
1123
1124 status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
1125 &len);
1126 if (status != HSA_STATUS_SUCCESS)
1127 {
1128 hsa_error ("Could not get length of symbol name", status);
1129 return NULL;
1130 }
1131
1132 res = GOMP_PLUGIN_malloc (len + 1);
1133
1134 const hsa_executable_symbol_info_t info_name
1135 = HSA_EXECUTABLE_SYMBOL_INFO_NAME;
1136
1137 status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);
1138
1139 if (status != HSA_STATUS_SUCCESS)
1140 {
1141 hsa_error ("Could not get symbol name", status);
1142 free (res);
1143 return NULL;
1144 }
1145
1146 res[len] = '\0';
1147
1148 return res;
1149 }
1150
1151 /* Get the number of GPU Compute Units. */
1152
1153 static int
1154 get_cu_count (struct agent_info *agent)
1155 {
1156 uint32_t cu_count;
1157 hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
1158 (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
1159 if (status == HSA_STATUS_SUCCESS)
1160 return cu_count;
1161 else
1162 return 64; /* The usual number for older devices. */
1163 }
1164
1165 /* Calculate the maximum grid size for OMP threads / OACC workers.
1166 This depends on the kernel's resource usage levels. */
1167
1168 static int
1169 limit_worker_threads (int threads)
1170 {
1171 /* FIXME Do something more intelligent here.
1172 GCN can always run 4 threads within a Compute Unit, but
1173 more than that depends on register usage. */
1174 if (threads > 16)
1175 threads = 16;
1176 return threads;
1177 }
1178
1179 /* This sets the maximum number of teams to twice the number of GPU Compute
1180 Units to avoid memory waste and corresponding memory access faults. */
1181
1182 static int
1183 limit_teams (int teams, struct agent_info *agent)
1184 {
1185 int max_teams = 2 * get_cu_count (agent);
1186 if (teams > max_teams)
1187 teams = max_teams;
1188 return teams;
1189 }
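/* For example, on a device reporting 60 compute units a request for 500 teams
   is reduced to 120, while a request for 100 teams is left unchanged.  */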
1190
1191 /* Parse the target attributes INPUT provided by the compiler and return true
1192 if we should run anything at all. If INPUT is NULL, fill DEF with default
1193 values, then store INPUT or DEF into *RESULT.
1194
1195 This is used for OpenMP only. */
1196
1197 static bool
1198 parse_target_attributes (void **input,
1199 struct GOMP_kernel_launch_attributes *def,
1200 struct GOMP_kernel_launch_attributes **result,
1201 struct agent_info *agent)
1202 {
1203 if (!input)
1204 GOMP_PLUGIN_fatal ("No target arguments provided");
1205
1206 bool grid_attrs_found = false;
1207 bool gcn_dims_found = false;
1208 int gcn_teams = 0;
1209 int gcn_threads = 0;
1210 while (*input)
1211 {
1212 intptr_t id = (intptr_t) *input++, val;
1213
1214 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
1215 val = (intptr_t) *input++;
1216 else
1217 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
1218
1219 val = (val > INT_MAX) ? INT_MAX : val;
1220
1221 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
1222 && ((id & GOMP_TARGET_ARG_ID_MASK)
1223 == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
1224 {
1225 grid_attrs_found = true;
1226 break;
1227 }
1228 else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
1229 == GOMP_TARGET_ARG_DEVICE_ALL)
1230 {
1231 gcn_dims_found = true;
1232 switch (id & GOMP_TARGET_ARG_ID_MASK)
1233 {
1234 case GOMP_TARGET_ARG_NUM_TEAMS:
1235 gcn_teams = limit_teams (val, agent);
1236 break;
1237 case GOMP_TARGET_ARG_THREAD_LIMIT:
1238 gcn_threads = limit_worker_threads (val);
1239 break;
1240 default:
1241 ;
1242 }
1243 }
1244 }
1245
1246 if (gcn_dims_found)
1247 {
1248 bool gfx900_workaround_p = false;
1249
1250 if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
1251 && gcn_threads == 0 && override_z_dim == 0)
1252 {
1253 gfx900_workaround_p = true;
1254 GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
1255 "threads to at most 4 per team.\n");
1256 GCN_WARNING (" - If this is not a Vega 10 device, please use "
1257 "GCN_NUM_THREADS=16\n");
1258 }
1259
1260 /* Ideally, when a dimension isn't explicitly specified, we should
1261 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
1262 In practice, we tune for peak performance on BabelStream, which
1263 for OpenACC is currently 32 threads per CU. */
1264 def->ndim = 3;
1265 if (gcn_teams <= 0 && gcn_threads <= 0)
1266 {
1267 /* Set up a reasonable number of teams and threads. */
1268 gcn_threads = gfx900_workaround_p ? 4 : 16; // 8;
1269 def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
1270 def->gdims[2] = gcn_threads;
1271 }
1272 else if (gcn_teams <= 0 && gcn_threads > 0)
1273 {
1274 /* Auto-scale the number of teams with the number of threads. */
1275 def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
1276 def->gdims[2] = gcn_threads;
1277 }
1278 else if (gcn_teams > 0 && gcn_threads <= 0)
1279 {
1280 int max_threads = gfx900_workaround_p ? 4 : 16;
1281
1282 /* Auto-scale the number of threads with the number of teams. */
1283 def->gdims[0] = gcn_teams;
1284 def->gdims[2] = 16; // get_cu_count (agent) * 40 / gcn_teams;
1285 if (def->gdims[2] == 0)
1286 def->gdims[2] = 1;
1287 else if (def->gdims[2] > max_threads)
1288 def->gdims[2] = max_threads;
1289 }
1290 else
1291 {
1292 def->gdims[0] = gcn_teams;
1293 def->gdims[2] = gcn_threads;
1294 }
1295 def->gdims[1] = 64; /* Each thread is 64 work items wide. */
1296 def->wdims[0] = 1; /* Single team per work-group. */
1297 def->wdims[1] = 64;
1298 def->wdims[2] = 16;
1299 *result = def;
1300 return true;
1301 }
1302 else if (!grid_attrs_found)
1303 {
1304 def->ndim = 1;
1305 def->gdims[0] = 1;
1306 def->gdims[1] = 1;
1307 def->gdims[2] = 1;
1308 def->wdims[0] = 1;
1309 def->wdims[1] = 1;
1310 def->wdims[2] = 1;
1311 *result = def;
1312 GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
1313 return true;
1314 }
1315
1316 struct GOMP_kernel_launch_attributes *kla;
1317 kla = (struct GOMP_kernel_launch_attributes *) *input;
1318 *result = kla;
1319 if (kla->ndim == 0 || kla->ndim > 3)
1320 GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);
1321
1322 GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
1323 unsigned i;
1324 for (i = 0; i < kla->ndim; i++)
1325 {
1326 GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i,
1327 kla->gdims[i], kla->wdims[i]);
1328 if (kla->gdims[i] == 0)
1329 return false;
1330 }
1331 return true;
1332 }
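/* For illustration only: libgomp encodes the *INPUT vector as a
   NULL-terminated list of tagged values, so a caller requesting 8 teams with
   a thread limit of 4 might pass something equivalent to

     void *args[] = {
       (void *) (uintptr_t) (GOMP_TARGET_ARG_DEVICE_ALL
                             | GOMP_TARGET_ARG_NUM_TEAMS
                             | (8 << GOMP_TARGET_ARG_VALUE_SHIFT)),
       (void *) (uintptr_t) (GOMP_TARGET_ARG_DEVICE_ALL
                             | GOMP_TARGET_ARG_THREAD_LIMIT
                             | (4 << GOMP_TARGET_ARG_VALUE_SHIFT)),
       NULL };

   which the function above would translate, after limit_teams and
   limit_worker_threads and assuming enough compute units, into
   gdims = {8, 64, 4} and wdims = {1, 64, 16}.  */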
1333
1334 /* Return the group size given the requested GROUP size, GRID size and number
1335 of grid dimensions NDIM. */
1336
1337 static uint32_t
1338 get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
1339 {
1340 if (group == 0)
1341 {
1342 /* TODO: Provide a default via environment or device characteristics. */
1343 if (ndim == 1)
1344 group = 64;
1345 else if (ndim == 2)
1346 group = 8;
1347 else
1348 group = 4;
1349 }
1350
1351 if (group > grid)
1352 group = grid;
1353 return group;
1354 }
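/* For example, a one-dimensional request with GRID = 1000 and GROUP = 0
   defaults to a group size of 64, whereas GRID = 32 and GROUP = 0 defaults to
   64 and is then clamped to the grid, giving 32.  */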
1355
1356 /* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET. */
1357
1358 static void
1359 packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
1360 {
1361 __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
1362 }
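/* The first two 16-bit fields of an HSA AQL packet (header and setup) share
   one 32-bit word; writing both with a single atomic release store is what
   publishes the packet to the queue.  An illustrative sketch (the real
   dispatch code later in this file chooses the actual header bits):

     uint16_t header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
     packet_store_release ((uint32_t *) packet, header, setup);  */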
1363
1364 /* A never-called callback for the HSA command queues. These signal events
1365 that we don't use, so we trigger an error.
1366
1367 This "queue" is not to be confused with the async queues, below. */
1368
1369 static void
1370 hsa_queue_callback (hsa_status_t status,
1371 hsa_queue_t *queue __attribute__ ((unused)),
1372 void *data __attribute__ ((unused)))
1373 {
1374 hsa_fatal ("Asynchronous queue error", status);
1375 }
1376
1377 /* }}} */
1378 /* {{{ HSA initialization */
1379
1380 /* Populate hsa_fns with the function addresses from libhsa-runtime64.so. */
1381
1382 static bool
1383 init_hsa_runtime_functions (void)
1384 {
1385 #define DLSYM_FN(function) \
1386 hsa_fns.function##_fn = dlsym (handle, #function); \
1387 if (hsa_fns.function##_fn == NULL) \
1388 GOMP_PLUGIN_fatal ("'%s' is missing '%s'", hsa_runtime_lib, #function);
1389 #define DLSYM_OPT_FN(function) \
1390 hsa_fns.function##_fn = dlsym (handle, #function);
1391
1392 void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
1393 if (handle == NULL)
1394 return false;
1395
1396 DLSYM_FN (hsa_status_string)
1397 DLSYM_FN (hsa_system_get_info)
1398 DLSYM_FN (hsa_agent_get_info)
1399 DLSYM_FN (hsa_init)
1400 DLSYM_FN (hsa_iterate_agents)
1401 DLSYM_FN (hsa_region_get_info)
1402 DLSYM_FN (hsa_queue_create)
1403 DLSYM_FN (hsa_agent_iterate_regions)
1404 DLSYM_FN (hsa_executable_destroy)
1405 DLSYM_FN (hsa_executable_create)
1406 DLSYM_FN (hsa_executable_global_variable_define)
1407 DLSYM_FN (hsa_executable_load_code_object)
1408 DLSYM_FN (hsa_executable_freeze)
1409 DLSYM_FN (hsa_signal_create)
1410 DLSYM_FN (hsa_memory_allocate)
1411 DLSYM_FN (hsa_memory_assign_agent)
1412 DLSYM_FN (hsa_memory_copy)
1413 DLSYM_FN (hsa_memory_free)
1414 DLSYM_FN (hsa_signal_destroy)
1415 DLSYM_FN (hsa_executable_get_symbol)
1416 DLSYM_FN (hsa_executable_symbol_get_info)
1417 DLSYM_FN (hsa_executable_iterate_symbols)
1418 DLSYM_FN (hsa_queue_add_write_index_release)
1419 DLSYM_FN (hsa_queue_load_read_index_acquire)
1420 DLSYM_FN (hsa_signal_wait_acquire)
1421 DLSYM_FN (hsa_signal_store_relaxed)
1422 DLSYM_FN (hsa_signal_store_release)
1423 DLSYM_FN (hsa_signal_load_acquire)
1424 DLSYM_FN (hsa_queue_destroy)
1425 DLSYM_FN (hsa_code_object_deserialize)
1426 DLSYM_OPT_FN (hsa_amd_memory_lock)
1427 DLSYM_OPT_FN (hsa_amd_memory_unlock)
1428 DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect)
1429 return true;
1430 #undef DLSYM_OPT_FN
1431 #undef DLSYM_FN
1432 }
1433
1434 static gcn_isa isa_code (const char *isa);
1435
1436 /* Return true if the agent is a GPU and can accept concurrent submissions
1437 from different threads. */
1438
1439 static bool
1440 suitable_hsa_agent_p (hsa_agent_t agent)
1441 {
1442 hsa_device_type_t device_type;
1443 hsa_status_t status
1444 = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
1445 &device_type);
1446 if (status != HSA_STATUS_SUCCESS)
1447 return false;
1448
1449 switch (device_type)
1450 {
1451 case HSA_DEVICE_TYPE_GPU:
1452 {
1453 char name[64];
1454 hsa_status_t status
1455 = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME, name);
1456 if (status != HSA_STATUS_SUCCESS
1457 || isa_code (name) == EF_AMDGPU_MACH_UNSUPPORTED)
1458 {
1459 GCN_DEBUG ("Ignoring unsupported agent '%s'\n",
1460 status == HSA_STATUS_SUCCESS ? name : "invalid");
1461 return false;
1462 }
1463 }
1464 break;
1465 case HSA_DEVICE_TYPE_CPU:
1466 if (!support_cpu_devices)
1467 return false;
1468 break;
1469 default:
1470 return false;
1471 }
1472
1473 uint32_t features = 0;
1474 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
1475 &features);
1476 if (status != HSA_STATUS_SUCCESS
1477 || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
1478 return false;
1479 hsa_queue_type_t queue_type;
1480 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
1481 &queue_type);
1482 if (status != HSA_STATUS_SUCCESS
1483 || (queue_type != HSA_QUEUE_TYPE_MULTI))
1484 return false;
1485
1486 return true;
1487 }
1488
1489 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
1490 agent_count in hsa_context. */
1491
1492 static hsa_status_t
1493 count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
1494 {
1495 if (suitable_hsa_agent_p (agent))
1496 hsa_context.agent_count++;
1497 return HSA_STATUS_SUCCESS;
1498 }
1499
1500 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
1501 id to the describing structure in the hsa context. The index of the
1502 structure is pointed to by DATA and is incremented afterwards. */
1503
1504 static hsa_status_t
1505 assign_agent_ids (hsa_agent_t agent, void *data)
1506 {
1507 if (suitable_hsa_agent_p (agent))
1508 {
1509 int *agent_index = (int *) data;
1510 hsa_context.agents[*agent_index].id = agent;
1511 ++*agent_index;
1512 }
1513 return HSA_STATUS_SUCCESS;
1514 }
1515
1516 /* Initialize hsa_context if it has not already been done.
1517 If !PROBE: returns TRUE on success.
1518 If PROBE: returns TRUE on success or if the plugin/device shall be silently
1519 ignored, and otherwise emits an error and returns FALSE. */
1520
1521 static bool
1522 init_hsa_context (bool probe)
1523 {
1524 hsa_status_t status;
1525 int agent_index = 0;
1526
1527 if (hsa_context.initialized)
1528 return true;
1529 init_environment_variables ();
1530 if (!init_hsa_runtime_functions ())
1531 {
1532 const char *msg = "Run-time could not be dynamically opened";
1533 if (suppress_host_fallback)
1534 GOMP_PLUGIN_fatal ("%s\n", msg);
1535 else
1536 GCN_WARNING ("%s\n", msg);
1537 return probe ? true : false;
1538 }
1539 status = hsa_fns.hsa_init_fn ();
1540 if (status != HSA_STATUS_SUCCESS)
1541 return hsa_error ("Run-time could not be initialized", status);
1542 GCN_DEBUG ("HSA run-time initialized for GCN\n");
1543
1544 if (debug)
1545 dump_hsa_system_info ();
1546
1547 status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
1548 if (status != HSA_STATUS_SUCCESS)
1549 return hsa_error ("GCN GPU devices could not be enumerated", status);
1550 GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);
1551
1552 hsa_context.agents
1553 = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
1554 * sizeof (struct agent_info));
1555 status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
1556 if (status != HSA_STATUS_SUCCESS)
1557 return hsa_error ("Scanning compute agents failed", status);
1558 if (agent_index != hsa_context.agent_count)
1559 {
1560 GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
1561 return false;
1562 }
1563
1564 if (debug)
1565 {
1566 status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
1567 if (status != HSA_STATUS_SUCCESS)
1568 GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
1569 }
1570
1571 uint16_t minor, major;
1572 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
1573 &minor);
1574 if (status != HSA_STATUS_SUCCESS)
1575 GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
1576 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
1577 &major);
1578 if (status != HSA_STATUS_SUCCESS)
1579 GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");
1580
1581 size_t len = sizeof hsa_context.driver_version_s;
1582 int printed = snprintf (hsa_context.driver_version_s, len,
1583 "HSA Runtime %hu.%hu", (unsigned short int)major,
1584 (unsigned short int)minor);
1585 if (printed >= len)
1586 GCN_WARNING ("HSA runtime version string was truncated. "
1587 "Version %hu.%hu is too long.", (unsigned short int)major,
1588 (unsigned short int)minor);
1589
1590 hsa_context.initialized = true;
1591 return true;
1592 }
1593
1594 /* Verify that hsa_context has already been initialized and return the
1595 agent_info structure describing device number N. Return NULL on error. */
1596
1597 static struct agent_info *
1598 get_agent_info (int n)
1599 {
1600 if (!hsa_context.initialized)
1601 {
1602 GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
1603 return NULL;
1604 }
1605 if (n >= hsa_context.agent_count)
1606 {
1607 GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
1608 return NULL;
1609 }
1610 if (!hsa_context.agents[n].initialized)
1611 {
1612 GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
1613 return NULL;
1614 }
1615 return &hsa_context.agents[n];
1616 }
1617
1618 /* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.
1619
1620 Selects (breaks at) a suitable region of type KIND. */
1621
1622 static hsa_status_t
1623 get_memory_region (hsa_region_t region, hsa_region_t *retval,
1624 hsa_region_global_flag_t kind)
1625 {
1626 hsa_status_t status;
1627 hsa_region_segment_t segment;
1628
1629 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
1630 &segment);
1631 if (status != HSA_STATUS_SUCCESS)
1632 return status;
1633 if (segment != HSA_REGION_SEGMENT_GLOBAL)
1634 return HSA_STATUS_SUCCESS;
1635
1636 uint32_t flags;
1637 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
1638 &flags);
1639 if (status != HSA_STATUS_SUCCESS)
1640 return status;
1641 if (flags & kind)
1642 {
1643 *retval = region;
1644 return HSA_STATUS_INFO_BREAK;
1645 }
1646 return HSA_STATUS_SUCCESS;
1647 }
1648
1649 /* Callback of hsa_agent_iterate_regions.
1650
1651 Selects a kernargs memory region. */
1652
1653 static hsa_status_t
1654 get_kernarg_memory_region (hsa_region_t region, void *data)
1655 {
1656 return get_memory_region (region, (hsa_region_t *)data,
1657 HSA_REGION_GLOBAL_FLAG_KERNARG);
1658 }
1659
1660 /* Callback of hsa_agent_iterate_regions.
1661
1662 Selects a coarse-grained memory region suitable for the heap and
1663 offload data. */
1664
1665 static hsa_status_t
1666 get_data_memory_region (hsa_region_t region, void *data)
1667 {
1668 return get_memory_region (region, (hsa_region_t *)data,
1669 HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
1670 }
1671
1672 static int
1673 elf_gcn_isa_field (Elf64_Ehdr *image)
1674 {
1675 return image->e_flags & EF_AMDGPU_MACH_MASK;
1676 }
1677
1678 const static char *gcn_gfx803_s = "gfx803";
1679 const static char *gcn_gfx900_s = "gfx900";
1680 const static char *gcn_gfx906_s = "gfx906";
1681 const static char *gcn_gfx908_s = "gfx908";
1682 const static char *gcn_gfx90a_s = "gfx90a";
1683 const static char *gcn_gfx90c_s = "gfx90c";
1684 const static char *gcn_gfx1030_s = "gfx1030";
1685 const static char *gcn_gfx1036_s = "gfx1036";
1686 const static char *gcn_gfx1100_s = "gfx1100";
1687 const static char *gcn_gfx1103_s = "gfx1103";
1688 const static int gcn_isa_name_len = 7;
1689
1690 /* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
1691 support the ISA. */
1692
1693 static const char*
1694 isa_hsa_name (int isa) {
1695 switch(isa)
1696 {
1697 case EF_AMDGPU_MACH_AMDGCN_GFX803:
1698 return gcn_gfx803_s;
1699 case EF_AMDGPU_MACH_AMDGCN_GFX900:
1700 return gcn_gfx900_s;
1701 case EF_AMDGPU_MACH_AMDGCN_GFX906:
1702 return gcn_gfx906_s;
1703 case EF_AMDGPU_MACH_AMDGCN_GFX908:
1704 return gcn_gfx908_s;
1705 case EF_AMDGPU_MACH_AMDGCN_GFX90a:
1706 return gcn_gfx90a_s;
1707 case EF_AMDGPU_MACH_AMDGCN_GFX90c:
1708 return gcn_gfx90c_s;
1709 case EF_AMDGPU_MACH_AMDGCN_GFX1030:
1710 return gcn_gfx1030_s;
1711 case EF_AMDGPU_MACH_AMDGCN_GFX1036:
1712 return gcn_gfx1036_s;
1713 case EF_AMDGPU_MACH_AMDGCN_GFX1100:
1714 return gcn_gfx1100_s;
1715 case EF_AMDGPU_MACH_AMDGCN_GFX1103:
1716 return gcn_gfx1103_s;
1717 }
1718 return NULL;
1719 }
1720
1721 /* Returns the user-facing name that GCC uses to identify the architecture (e.g.
1722 with -march) or NULL if we do not support the ISA.
1723 Keep in sync with /gcc/config/gcn/gcn.{c,opt}. */
1724
1725 static const char*
1726 isa_gcc_name (int isa) {
1727 switch(isa)
1728 {
1729 case EF_AMDGPU_MACH_AMDGCN_GFX803:
1730 return "fiji";
1731 default:
1732 return isa_hsa_name (isa);
1733 }
1734 }
1735
1736 /* Returns the code which is used in the GCN object code to identify the ISA with
1737 the given name (as used by the HSA runtime). */
1738
1739 static gcn_isa
1740 isa_code(const char *isa) {
1741 if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
1742 return EF_AMDGPU_MACH_AMDGCN_GFX803;
1743
1744 if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
1745 return EF_AMDGPU_MACH_AMDGCN_GFX900;
1746
1747 if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
1748 return EF_AMDGPU_MACH_AMDGCN_GFX906;
1749
1750 if (!strncmp (isa, gcn_gfx908_s, gcn_isa_name_len))
1751 return EF_AMDGPU_MACH_AMDGCN_GFX908;
1752
1753 if (!strncmp (isa, gcn_gfx90a_s, gcn_isa_name_len))
1754 return EF_AMDGPU_MACH_AMDGCN_GFX90a;
1755
1756 if (!strncmp (isa, gcn_gfx90c_s, gcn_isa_name_len))
1757 return EF_AMDGPU_MACH_AMDGCN_GFX90c;
1758
1759 if (!strncmp (isa, gcn_gfx1030_s, gcn_isa_name_len))
1760 return EF_AMDGPU_MACH_AMDGCN_GFX1030;
1761
1762 if (!strncmp (isa, gcn_gfx1036_s, gcn_isa_name_len))
1763 return EF_AMDGPU_MACH_AMDGCN_GFX1036;
1764
1765 if (!strncmp (isa, gcn_gfx1100_s, gcn_isa_name_len))
1766 return EF_AMDGPU_MACH_AMDGCN_GFX1100;
1767
1768 if (!strncmp (isa, gcn_gfx1103_s, gcn_isa_name_len))
1769 return EF_AMDGPU_MACH_AMDGCN_GFX1103;
1770
1771 return EF_AMDGPU_MACH_UNSUPPORTED;
1772 }
1773
1774 /* CDNA2 devices have twice as many VGPRs compared to older devices. */
1775
1776 static int
1777 max_isa_vgprs (int isa)
1778 {
1779 switch (isa)
1780 {
1781 case EF_AMDGPU_MACH_AMDGCN_GFX803:
1782 case EF_AMDGPU_MACH_AMDGCN_GFX900:
1783 case EF_AMDGPU_MACH_AMDGCN_GFX906:
1784 case EF_AMDGPU_MACH_AMDGCN_GFX908:
1785 return 256;
1786 case EF_AMDGPU_MACH_AMDGCN_GFX90a:
1787 return 512;
1788 case EF_AMDGPU_MACH_AMDGCN_GFX90c:
1789 return 256;
1790 case EF_AMDGPU_MACH_AMDGCN_GFX1030:
1791 case EF_AMDGPU_MACH_AMDGCN_GFX1036:
1792 return 512; /* 512 SIMD32 = 256 wavefrontsize64. */
1793 case EF_AMDGPU_MACH_AMDGCN_GFX1100:
1794 case EF_AMDGPU_MACH_AMDGCN_GFX1103:
1795 return 1536; /* 1536 SIMD32 = 768 wavefrontsize64. */
1796 }
1797 GOMP_PLUGIN_fatal ("unhandled ISA in max_isa_vgprs");
1798 }
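     /* Illustration of how these limits are used (run_kernel, below, caps the
	number of threads/workers per team); the figures are hypothetical:
	a kernel reporting 86 VGPRs on gfx908 is rounded up to the 4-register
	allocation granularity, (86 + 3) & ~3 == 88, giving a per-team limit
	of (256 / 88) * 4 == 8 threads/workers.  */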
1799
1800 /* }}} */
1801 /* {{{ Run */
1802
1803 /* Create or reuse a team arena and stack space.
1804
1805 Team arenas are used by OpenMP to avoid calling malloc multiple times
1806 while setting up each team. This is purely a performance optimization.
1807
1808 The stack space is used by all kernels. We must allocate it in such a
1809    way that the reverse offload implementation can access the data.
1810
1811 Allocating this memory costs performance, so this function will reuse an
1812 existing allocation if a large enough one is idle.
1813    The lock is released when the kernel exits, but the memory is not freed. */
1814
1815 static void
1816 configure_ephemeral_memories (struct kernel_info *kernel,
1817 struct kernargs_abi *kernargs, int num_teams,
1818 int num_threads)
1819 {
1820 struct agent_info *agent = kernel->agent;
1821 struct ephemeral_memories_list **next_ptr = &agent->ephemeral_memories_list;
1822 struct ephemeral_memories_list *item;
1823
1824 int actual_arena_size = (kernel->kind == KIND_OPENMP
1825 ? team_arena_size : 0);
1826 int actual_arena_total_size = actual_arena_size * num_teams;
1827 size_t size = (actual_arena_total_size
1828 + num_teams * num_threads * stack_size);
1829
1830 for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
1831 {
1832 if (item->size < size)
1833 continue;
1834
1835 if (pthread_mutex_trylock (&item->in_use) == 0)
1836 break;
1837 }
1838
1839 if (!item)
1840 {
1841 GCN_DEBUG ("Creating a new %sstack for %d teams with %d threads"
1842 " (%zd bytes)\n", (actual_arena_size ? "arena and " : ""),
1843 num_teams, num_threads, size);
1844
1845 if (pthread_mutex_lock (&agent->ephemeral_memories_write_lock))
1846 {
1847 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1848 return;
1849 }
1850 item = malloc (sizeof (*item));
1851 item->size = size;
1852 item->next = NULL;
1853 *next_ptr = item;
1854
1855 if (pthread_mutex_init (&item->in_use, NULL))
1856 {
1857 GOMP_PLUGIN_error ("Failed to initialize a GCN memory write mutex");
1858 return;
1859 }
1860 if (pthread_mutex_lock (&item->in_use))
1861 {
1862 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1863 return;
1864 }
1865 if (pthread_mutex_unlock (&agent->ephemeral_memories_write_lock))
1866 {
1867 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1868 return;
1869 }
1870
1871 hsa_status_t status;
1872 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, size,
1873 &item->address);
1874 if (status != HSA_STATUS_SUCCESS)
1875 hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
1876 status = hsa_fns.hsa_memory_assign_agent_fn (item->address, agent->id,
1877 HSA_ACCESS_PERMISSION_RW);
1878 if (status != HSA_STATUS_SUCCESS)
1879 hsa_fatal ("Could not assign arena & stack memory to device", status);
1880 }
1881
1882 kernargs->arena_ptr = (actual_arena_total_size
1883 ? (uint64_t)item->address
1884 : 0);
1885 kernargs->stack_ptr = (uint64_t)item->address + actual_arena_total_size;
1886 kernargs->arena_size_per_team = actual_arena_size;
1887 kernargs->stack_size_per_thread = stack_size;
1888 }
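     /* Sketch of the resulting layout (sizes here are hypothetical): with
	num_teams == 4, num_threads == 16, team_arena_size == 64 KiB and
	stack_size == 128 KiB, a single allocation of 4*64K + 4*16*128K bytes
	(256 KiB + 8 MiB) is made; kernargs->arena_ptr points at its base and
	kernargs->stack_ptr points 4*64K bytes further on, where the
	per-thread stacks begin.  */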
1889
1890 /* Mark an ephemeral memory space available for reuse. */
1891
1892 static void
1893 release_ephemeral_memories (struct agent_info* agent, void *address)
1894 {
1895 struct ephemeral_memories_list *item;
1896
1897 for (item = agent->ephemeral_memories_list; item; item = item->next)
1898 {
1899 if (item->address == address)
1900 {
1901 if (pthread_mutex_unlock (&item->in_use))
1902 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1903 return;
1904 }
1905 }
1906 GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
1907 }
1908
1909 /* Clean up all the allocated ephemeral memories (team arenas and stacks). */
1910
1911 static bool
1912 destroy_ephemeral_memories (struct agent_info *agent)
1913 {
1914 struct ephemeral_memories_list *item, *next;
1915
1916 for (item = agent->ephemeral_memories_list; item; item = next)
1917 {
1918 next = item->next;
1919 hsa_fns.hsa_memory_free_fn (item->address);
1920 if (pthread_mutex_destroy (&item->in_use))
1921 {
1922 GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
1923 return false;
1924 }
1925 free (item);
1926 }
1927 agent->ephemeral_memories_list = NULL;
1928
1929 return true;
1930 }
1931
1932 /* Allocate memory on a specified device. */
1933
1934 static void *
1935 alloc_by_agent (struct agent_info *agent, size_t size)
1936 {
1937 GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);
1938
1939 void *ptr;
1940 hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1941 size, &ptr);
1942 if (status != HSA_STATUS_SUCCESS)
1943 {
1944 hsa_error ("Could not allocate device memory", status);
1945 return NULL;
1946 }
1947
1948 status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
1949 HSA_ACCESS_PERMISSION_RW);
1950 if (status != HSA_STATUS_SUCCESS)
1951 {
1952 hsa_error ("Could not assign data memory to device", status);
1953 return NULL;
1954 }
1955
1956 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1957 bool profiling_dispatch_p
1958 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1959 if (profiling_dispatch_p)
1960 {
1961 acc_prof_info *prof_info = thr->prof_info;
1962 acc_event_info data_event_info;
1963 acc_api_info *api_info = thr->api_info;
1964
1965 prof_info->event_type = acc_ev_alloc;
1966
1967 data_event_info.data_event.event_type = prof_info->event_type;
1968 data_event_info.data_event.valid_bytes
1969 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1970 data_event_info.data_event.parent_construct
1971 = acc_construct_parallel;
1972 data_event_info.data_event.implicit = 1;
1973 data_event_info.data_event.tool_info = NULL;
1974 data_event_info.data_event.var_name = NULL;
1975 data_event_info.data_event.bytes = size;
1976 data_event_info.data_event.host_ptr = NULL;
1977 data_event_info.data_event.device_ptr = (void *) ptr;
1978
1979 api_info->device_api = acc_device_api_other;
1980
1981 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1982 api_info);
1983 }
1984
1985 return ptr;
1986 }
1987
1988 /* Create kernel dispatch data structure for given KERNEL, along with
1989 the necessary device signals and memory allocations. */
1990
1991 static struct kernel_dispatch *
1992 create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
1993 int num_threads)
1994 {
1995 struct agent_info *agent = kernel->agent;
1996 struct kernel_dispatch *shadow
1997 = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));
1998
1999 shadow->agent = kernel->agent;
2000 shadow->object = kernel->object;
2001
2002 hsa_signal_t sync_signal;
2003 hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
2004 if (status != HSA_STATUS_SUCCESS)
2005 hsa_fatal ("Error creating the GCN sync signal", status);
2006
2007 shadow->signal = sync_signal.handle;
2008 shadow->private_segment_size = kernel->private_segment_size;
2009
2010 if (lowlat_size < 0)
2011 {
2012       /* Divide the LDS among the running teams.
2013 	 Allocate no less than the amount defined in the kernel metadata. */
2014 int teams_per_cu = num_teams / get_cu_count (agent);
2015 int LDS_per_team = (teams_per_cu ? 65536 / teams_per_cu : 65536);
2016 shadow->group_segment_size
2017 = (kernel->group_segment_size > LDS_per_team
2018 ? kernel->group_segment_size
2019 	     : LDS_per_team);
2020 }
2021 else if (lowlat_size < GCN_LOWLAT_HEAP+8)
2022 /* Ensure that there's space for the OpenMP libgomp data. */
2023 shadow->group_segment_size = GCN_LOWLAT_HEAP+8;
2024 else
2025 shadow->group_segment_size = (lowlat_size > 65536
2026 ? 65536
2027 : lowlat_size);
2028
2029 /* We expect kernels to request a single pointer, explicitly, and the
2030 rest of struct kernargs, implicitly. If they request anything else
2031 then something is wrong. */
2032 if (kernel->kernarg_segment_size > 8)
2033 {
2034 GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
2035 return NULL;
2036 }
2037
2038 status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
2039 sizeof (struct kernargs),
2040 &shadow->kernarg_address);
2041 if (status != HSA_STATUS_SUCCESS)
2042 hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
2043 struct kernargs *kernargs = shadow->kernarg_address;
2044
2045 /* Zero-initialize the output_data (minimum needed). */
2046 kernargs->abi.out_ptr = (int64_t)&kernargs->output_data;
2047 kernargs->output_data.next_output = 0;
2048 for (unsigned i = 0;
2049 i < (sizeof (kernargs->output_data.queue)
2050 / sizeof (kernargs->output_data.queue[0]));
2051 i++)
2052 kernargs->output_data.queue[i].written = 0;
2053 kernargs->output_data.consumed = 0;
2054
2055 /* Pass in the heap location. */
2056 kernargs->abi.heap_ptr = (int64_t)kernel->module->heap;
2057
2058 /* Create the ephemeral memory spaces. */
2059 configure_ephemeral_memories (kernel, &kernargs->abi, num_teams, num_threads);
2060
2061 /* Ensure we can recognize unset return values. */
2062 kernargs->output_data.return_value = 0xcafe0000;
2063
2064 return shadow;
2065 }
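     /* Illustration of the lowlat_size < 0 branch above, with hypothetical
	numbers: launching 240 teams on a 60-CU agent gives teams_per_cu == 4,
	so each team may use 65536 / 4 == 16384 bytes of LDS, unless the
	kernel's own group_segment_size is larger, in which case the larger
	value wins.  */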
2066
2067 static void
2068 process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
2069 uint64_t sizes, uint64_t kinds, uint64_t dev_num64)
2070 {
2071 int dev_num = dev_num64;
2072 GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
2073 NULL);
2074 }
2075
2076 /* Output any data written to console output from the kernel. It is expected
2077 that this function is polled during kernel execution.
2078
2079 We print all entries from the last item printed to the next entry without
2080 a "written" flag. If the "final" flag is set then it'll continue right to
2081 the end.
2082
2083 The print buffer is circular, but the from and to locations don't wrap when
2084 the buffer does, so the output limit is UINT_MAX. The target blocks on
2085 output when the buffer is full. */
2086
2087 static void
2088 console_output (struct kernel_info *kernel, struct kernargs *kernargs,
2089 bool final)
2090 {
2091 unsigned int limit = (sizeof (kernargs->output_data.queue)
2092 / sizeof (kernargs->output_data.queue[0]));
2093
2094 unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
2095 __ATOMIC_ACQUIRE);
2096 unsigned int to = kernargs->output_data.next_output;
2097
2098 if (from > to)
2099 {
2100 /* Overflow. */
2101 if (final)
2102 printf ("GCN print buffer overflowed.\n");
2103 return;
2104 }
2105
2106 unsigned int i;
2107 for (i = from; i < to; i++)
2108 {
2109 struct printf_data *data = &kernargs->output_data.queue[i%limit];
2110
2111 if (!data->written && !final)
2112 break;
2113
2114 switch (data->type)
2115 {
2116 case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
2117 case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
2118 case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
2119 case 3: printf ("%.128s%.128s", data->msg, data->text); break;
2120 case 4:
2121 process_reverse_offload (data->value_u64[0], data->value_u64[1],
2122 data->value_u64[2], data->value_u64[3],
2123 data->value_u64[4], data->value_u64[5]);
2124 break;
2125 default: printf ("GCN print buffer error!\n"); break;
2126 }
2127 data->written = 0;
2128 __atomic_store_n (&kernargs->output_data.consumed, i+1,
2129 __ATOMIC_RELEASE);
2130 }
2131 fflush (stdout);
2132 }
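     /* Illustration of the ring-buffer walk above: with, say, consumed == 5
	and next_output == 9, slots 5 % limit .. 8 % limit are printed in
	order; each slot's "written" flag is cleared and "consumed" is
	advanced, which (per the comment above) lets a target blocked on a
	full buffer continue.  An entry of type 4 is not console output at
	all but a reverse-offload request, forwarded to
	process_reverse_offload.  */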
2133
2134 /* Release data structure created for a kernel dispatch in SHADOW argument,
2135 and clean up the signal and memory allocations. */
2136
2137 static void
2138 release_kernel_dispatch (struct kernel_dispatch *shadow)
2139 {
2140 GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
2141
2142 struct kernargs *kernargs = shadow->kernarg_address;
2143 void *addr = (void *)kernargs->abi.arena_ptr;
2144 if (!addr)
2145 addr = (void *)kernargs->abi.stack_ptr;
2146 release_ephemeral_memories (shadow->agent, addr);
2147
2148 hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
2149
2150 hsa_signal_t s;
2151 s.handle = shadow->signal;
2152 hsa_fns.hsa_signal_destroy_fn (s);
2153
2154 free (shadow);
2155 }
2156
2157 /* Extract the properties from a kernel binary. */
2158
2159 static void
2160 init_kernel_properties (struct kernel_info *kernel)
2161 {
2162 hsa_status_t status;
2163 struct agent_info *agent = kernel->agent;
2164 hsa_executable_symbol_t kernel_symbol;
2165 char *buf = alloca (strlen (kernel->name) + 4);
2166 sprintf (buf, "%s.kd", kernel->name);
2167 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
2168 buf, agent->id,
2169 0, &kernel_symbol);
2170 if (status != HSA_STATUS_SUCCESS)
2171 {
2172 hsa_warn ("Could not find symbol for kernel in the code object", status);
2173 fprintf (stderr, "not found name: '%s'\n", buf);
2174 dump_executable_symbols (agent->executable);
2175 goto failure;
2176 }
2177 GCN_DEBUG ("Located kernel %s\n", kernel->name);
2178 status = hsa_fns.hsa_executable_symbol_get_info_fn
2179 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
2180 if (status != HSA_STATUS_SUCCESS)
2181 hsa_fatal ("Could not extract a kernel object from its symbol", status);
2182 status = hsa_fns.hsa_executable_symbol_get_info_fn
2183 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
2184 &kernel->kernarg_segment_size);
2185 if (status != HSA_STATUS_SUCCESS)
2186 hsa_fatal ("Could not get info about kernel argument size", status);
2187 status = hsa_fns.hsa_executable_symbol_get_info_fn
2188 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
2189 &kernel->group_segment_size);
2190 if (status != HSA_STATUS_SUCCESS)
2191 hsa_fatal ("Could not get info about kernel group segment size", status);
2192 status = hsa_fns.hsa_executable_symbol_get_info_fn
2193 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
2194 &kernel->private_segment_size);
2195 if (status != HSA_STATUS_SUCCESS)
2196 hsa_fatal ("Could not get info about kernel private segment size",
2197 status);
2198
2199 /* The kernel type is not known until something tries to launch it. */
2200 kernel->kind = KIND_UNKNOWN;
2201
2202 GCN_DEBUG ("Kernel structure for %s fully initialized with "
2203 "following segment sizes: \n", kernel->name);
2204 GCN_DEBUG (" group_segment_size: %u\n",
2205 (unsigned) kernel->group_segment_size);
2206 GCN_DEBUG (" private_segment_size: %u\n",
2207 (unsigned) kernel->private_segment_size);
2208 GCN_DEBUG (" kernarg_segment_size: %u\n",
2209 (unsigned) kernel->kernarg_segment_size);
2210 return;
2211
2212 failure:
2213 kernel->initialization_failed = true;
2214 }
2215
2216 /* Do all the work that is necessary before running KERNEL for the first time.
2217 The function assumes the program has been created, finalized and frozen by
2218 create_and_finalize_hsa_program. */
2219
2220 static void
2221 init_kernel (struct kernel_info *kernel)
2222 {
2223 if (pthread_mutex_lock (&kernel->init_mutex))
2224 GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2225 if (kernel->initialized)
2226 {
2227 if (pthread_mutex_unlock (&kernel->init_mutex))
2228 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2229 "mutex");
2230
2231 return;
2232 }
2233
2234 init_kernel_properties (kernel);
2235
2236 if (!kernel->initialization_failed)
2237 {
2238 GCN_DEBUG ("\n");
2239
2240 kernel->initialized = true;
2241 }
2242 if (pthread_mutex_unlock (&kernel->init_mutex))
2243 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2244 "mutex");
2245 }
2246
2247 /* Run KERNEL on its agent, pass VARS to it as arguments and take
2248 launch attributes from KLA.
2249
2250 MODULE_LOCKED indicates that the caller already holds the lock and
2251 run_kernel need not lock it again.
2252 If AQ is NULL then agent->sync_queue will be used. */
2253
2254 static void
2255 run_kernel (struct kernel_info *kernel, void *vars,
2256 struct GOMP_kernel_launch_attributes *kla,
2257 struct goacc_asyncqueue *aq, bool module_locked)
2258 {
2259 struct agent_info *agent = kernel->agent;
2260 GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
2261 kernel->description->vpgr_count);
2262
2263 /* Reduce the number of threads/workers if there are insufficient
2264 VGPRs available to run the kernels together. */
2265 if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
2266 {
2267 int max_vgprs = max_isa_vgprs (agent->device_isa);
2268 int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
2269 int max_threads = (max_vgprs / granulated_vgprs) * 4;
2270 if (kla->gdims[2] > max_threads)
2271 {
2272 GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
2273 " per team/gang - reducing to %d threads/workers.\n",
2274 kla->gdims[2], max_threads);
2275 kla->gdims[2] = max_threads;
2276 }
2277 }
2278
2279 GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
2280 (aq ? aq->id : 0));
2281 GCN_DEBUG ("GCN launch attribs: gdims:[");
2282 int i;
2283 for (i = 0; i < kla->ndim; ++i)
2284 {
2285 if (i)
2286 DEBUG_PRINT (", ");
2287 DEBUG_PRINT ("%u", kla->gdims[i]);
2288 }
2289 DEBUG_PRINT ("], normalized gdims:[");
2290 for (i = 0; i < kla->ndim; ++i)
2291 {
2292 if (i)
2293 DEBUG_PRINT (", ");
2294 DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
2295 }
2296 DEBUG_PRINT ("], wdims:[");
2297 for (i = 0; i < kla->ndim; ++i)
2298 {
2299 if (i)
2300 DEBUG_PRINT (", ");
2301 DEBUG_PRINT ("%u", kla->wdims[i]);
2302 }
2303 DEBUG_PRINT ("]\n");
2304 DEBUG_FLUSH ();
2305
2306 if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
2307 GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2308
2309 if (!agent->initialized)
2310 GOMP_PLUGIN_fatal ("Agent must be initialized");
2311
2312 if (!kernel->initialized)
2313 GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2314
2315 hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);
2316
2317 uint64_t index
2318 = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
2319 GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);
2320
2321 /* Wait until the queue is not full before writing the packet. */
2322 while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
2323 >= command_q->size)
2324 ;
2325
2326 /* Do not allow the dimensions to be overridden when running
2327 constructors or destructors. */
2328 int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
2329 int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;
2330
2331 hsa_kernel_dispatch_packet_t *packet;
2332 packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
2333 + index % command_q->size;
2334
2335 memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
2336 packet->grid_size_x = override_x ? : kla->gdims[0];
2337 packet->workgroup_size_x = get_group_size (kla->ndim,
2338 packet->grid_size_x,
2339 kla->wdims[0]);
2340
2341 if (kla->ndim >= 2)
2342 {
2343 packet->grid_size_y = kla->gdims[1];
2344 packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
2345 kla->wdims[1]);
2346 }
2347 else
2348 {
2349 packet->grid_size_y = 1;
2350 packet->workgroup_size_y = 1;
2351 }
2352
2353 if (kla->ndim == 3)
2354 {
2355 packet->grid_size_z = limit_worker_threads (override_z
2356 ? : kla->gdims[2]);
2357 packet->workgroup_size_z = get_group_size (kla->ndim,
2358 packet->grid_size_z,
2359 kla->wdims[2]);
2360 }
2361 else
2362 {
2363 packet->grid_size_z = 1;
2364 packet->workgroup_size_z = 1;
2365 }
2366
2367 GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2368 " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2369 packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
2370 packet->grid_size_x / packet->workgroup_size_x,
2371 packet->grid_size_y / packet->workgroup_size_y,
2372 packet->grid_size_z / packet->workgroup_size_z,
2373 packet->workgroup_size_x, packet->workgroup_size_y,
2374 packet->workgroup_size_z);
2375
2376 struct kernel_dispatch *shadow
2377 = create_kernel_dispatch (kernel, packet->grid_size_x,
2378 packet->grid_size_z);
2379 shadow->queue = command_q;
2380
2381 if (debug)
2382 {
2383 fprintf (stderr, "\nKernel has following dependencies:\n");
2384 print_kernel_dispatch (shadow, 2);
2385 }
2386
2387 packet->private_segment_size = shadow->private_segment_size;
2388 packet->group_segment_size = shadow->group_segment_size;
2389 packet->kernel_object = shadow->object;
2390 packet->kernarg_address = shadow->kernarg_address;
2391 hsa_signal_t s;
2392 s.handle = shadow->signal;
2393 packet->completion_signal = s;
2394 hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
2395 memcpy (shadow->kernarg_address, &vars, sizeof (vars));
2396
2397 GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2398
2399 uint16_t header;
2400 header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
2401 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
2402 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
2403
2404 GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
2405 agent->device_id);
2406
2407 packet_store_release ((uint32_t *) packet, header,
2408 (uint16_t) kla->ndim
2409 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
2410
2411 hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
2412 index);
2413
2414 GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2415
2416 /* Root signal waits with 1ms timeout. */
2417 while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
2418 1000 * 1000,
2419 HSA_WAIT_STATE_BLOCKED) != 0)
2420 {
2421 console_output (kernel, shadow->kernarg_address, false);
2422 }
2423 console_output (kernel, shadow->kernarg_address, true);
2424
2425 struct kernargs *kernargs = shadow->kernarg_address;
2426 unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
2427
2428 release_kernel_dispatch (shadow);
2429
2430 if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
2431 GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2432
2433 unsigned int upper = (return_value & ~0xffff) >> 16;
2434 if (upper == 0xcafe)
2435 ; // exit not called, normal termination.
2436 else if (upper == 0xffff)
2437 ; // exit called.
2438 else
2439 {
2440 GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2441 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2442 return_value);
2443 abort ();
2444 }
2445
2446 if (upper == 0xffff)
2447 {
2448 unsigned int signal = (return_value >> 8) & 0xff;
2449
2450 if (signal == SIGABRT)
2451 {
2452 GCN_WARNING ("GCN Kernel aborted\n");
2453 abort ();
2454 }
2455 else if (signal != 0)
2456 {
2457 GCN_WARNING ("GCN Kernel received unknown signal\n");
2458 abort ();
2459 }
2460
2461 GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
2462 exit (return_value & 0xff);
2463 }
2464 }
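     /* Summary of the return-value encoding checked above (derived from the
	code in this file, not a separate ABI document): the kernarg block is
	seeded with 0xcafe0000, so an upper half of 0xcafe means the kernel
	finished without calling exit.  An upper half of 0xffff means the
	exit path was taken; bits 8-15 then carry a signal number (SIGABRT
	aborts the host) and bits 0-7 carry the exit code, which is
	propagated via exit ().  For example, 0xffff0007 reports a normal
	exit with status 7.  */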
2465
2466 /* }}} */
2467 /* {{{ Load/Unload */
2468
2469 /* Initialize KERNEL from D and other parameters. Return true on success. */
2470
2471 static bool
2472 init_basic_kernel_info (struct kernel_info *kernel,
2473 struct hsa_kernel_description *d,
2474 struct agent_info *agent,
2475 struct module_info *module)
2476 {
2477 kernel->agent = agent;
2478 kernel->module = module;
2479 kernel->name = d->name;
2480 kernel->description = d;
2481 if (pthread_mutex_init (&kernel->init_mutex, NULL))
2482 {
2483 GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2484 return false;
2485 }
2486 return true;
2487 }
2488
2489 /* Check that the GCN ISA of the given image matches the ISA of the agent. */
2490
2491 static bool
2492 isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
2493 {
2494 int isa_field = elf_gcn_isa_field (image);
2495 const char* isa_s = isa_hsa_name (isa_field);
2496 if (!isa_s)
2497 {
2498 hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
2499 return false;
2500 }
2501
2502 if (isa_field != agent->device_isa)
2503 {
2504 char msg[120];
2505 const char *agent_isa_s = isa_hsa_name (agent->device_isa);
2506 const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
2507 assert (agent_isa_s);
2508 assert (agent_isa_gcc_s);
2509
2510 snprintf (msg, sizeof msg,
2511 "GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
2512 "Try to recompile with '-foffload-options=-march=%s'.\n",
2513 isa_s, agent_isa_s, agent_isa_gcc_s);
2514
2515 hsa_error (msg, HSA_STATUS_ERROR);
2516 return false;
2517 }
2518
2519 return true;
2520 }
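     /* Usage note: the recompile hint above uses the agent's GCC name, so,
	for instance, loading a gfx900 image on a gfx90a device would suggest
	rebuilding with '-foffload-options=-march=gfx90a' (names as returned
	by isa_gcc_name; this particular pairing is only an example).  */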
2521
2522 /* Create and finalize the program consisting of all loaded modules. */
2523
2524 static bool
2525 create_and_finalize_hsa_program (struct agent_info *agent)
2526 {
2527 hsa_status_t status;
2528 bool res = true;
2529 if (pthread_mutex_lock (&agent->prog_mutex))
2530 {
2531 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2532 return false;
2533 }
2534 if (agent->prog_finalized)
2535 goto final;
2536
2537 status
2538 = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
2539 HSA_EXECUTABLE_STATE_UNFROZEN,
2540 "", &agent->executable);
2541 if (status != HSA_STATUS_SUCCESS)
2542 {
2543 hsa_error ("Could not create GCN executable", status);
2544 goto fail;
2545 }
2546
2547 /* Load any GCN modules. */
2548 struct module_info *module = agent->module;
2549 if (module)
2550 {
2551 Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
2552
2553 if (!isa_matches_agent (agent, image))
2554 goto fail;
2555
2556 hsa_code_object_t co = { 0 };
2557 status = hsa_fns.hsa_code_object_deserialize_fn
2558 (module->image_desc->gcn_image->image,
2559 module->image_desc->gcn_image->size,
2560 NULL, &co);
2561 if (status != HSA_STATUS_SUCCESS)
2562 {
2563 hsa_error ("Could not deserialize GCN code object", status);
2564 goto fail;
2565 }
2566
2567 status = hsa_fns.hsa_executable_load_code_object_fn
2568 (agent->executable, agent->id, co, "");
2569 if (status != HSA_STATUS_SUCCESS)
2570 {
2571 hsa_error ("Could not load GCN code object", status);
2572 goto fail;
2573 }
2574
2575 if (!module->heap)
2576 {
2577 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
2578 gcn_kernel_heap_size,
2579 (void**)&module->heap);
2580 if (status != HSA_STATUS_SUCCESS)
2581 {
2582 hsa_error ("Could not allocate memory for GCN heap", status);
2583 goto fail;
2584 }
2585
2586 status = hsa_fns.hsa_memory_assign_agent_fn
2587 (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
2588 if (status != HSA_STATUS_SUCCESS)
2589 {
2590 hsa_error ("Could not assign GCN heap memory to device", status);
2591 goto fail;
2592 }
2593
2594 hsa_fns.hsa_memory_copy_fn (&module->heap->size,
2595 &gcn_kernel_heap_size,
2596 sizeof (gcn_kernel_heap_size));
2597 }
2598
2599 }
2600
2601 if (debug)
2602 dump_executable_symbols (agent->executable);
2603
2604 status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
2605 if (status != HSA_STATUS_SUCCESS)
2606 {
2607 hsa_error ("Could not freeze the GCN executable", status);
2608 goto fail;
2609 }
2610
2611 final:
2612 agent->prog_finalized = true;
2613
2614 if (pthread_mutex_unlock (&agent->prog_mutex))
2615 {
2616 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2617 res = false;
2618 }
2619
2620 return res;
2621
2622 fail:
2623 res = false;
2624 goto final;
2625 }
2626
2627 /* Free the HSA program in agent and everything associated with it and set
2628 agent->prog_finalized and the initialized flags of all kernels to false.
2629 Return TRUE on success. */
2630
2631 static bool
2632 destroy_hsa_program (struct agent_info *agent)
2633 {
2634 if (!agent->prog_finalized)
2635 return true;
2636
2637 hsa_status_t status;
2638
2639 GCN_DEBUG ("Destroying the current GCN program.\n");
2640
2641 status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
2642 if (status != HSA_STATUS_SUCCESS)
2643 return hsa_error ("Could not destroy GCN executable", status);
2644
2645 if (agent->module)
2646 {
2647 int i;
2648 for (i = 0; i < agent->module->kernel_count; i++)
2649 agent->module->kernels[i].initialized = false;
2650
2651 if (agent->module->heap)
2652 {
2653 hsa_fns.hsa_memory_free_fn (agent->module->heap);
2654 agent->module->heap = NULL;
2655 }
2656 }
2657 agent->prog_finalized = false;
2658 return true;
2659 }
2660
2661 /* Deinitialize all information associated with MODULE and kernels within
2662 it. Return TRUE on success. */
2663
2664 static bool
2665 destroy_module (struct module_info *module, bool locked)
2666 {
2667 /* Run destructors before destroying module. */
2668 struct GOMP_kernel_launch_attributes kla =
2669 { 3,
2670 /* Grid size. */
2671 { 1, 64, 1 },
2672 /* Work-group size. */
2673 { 1, 64, 1 }
2674 };
2675
2676 if (module->fini_array_func)
2677 {
2678 init_kernel (module->fini_array_func);
2679 run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
2680 }
2681 module->constructors_run_p = false;
2682
2683 int i;
2684 for (i = 0; i < module->kernel_count; i++)
2685 if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
2686 {
2687 GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2688 "mutex");
2689 return false;
2690 }
2691
2692 return true;
2693 }
2694
2695 /* }}} */
2696 /* {{{ Async */
2697
2698 /* Execute the queue entry at position INDEX in the async queue AQ. */
2699
2700 static void
2701 execute_queue_entry (struct goacc_asyncqueue *aq, int index)
2702 {
2703 struct queue_entry *entry = &aq->queue[index];
2704
2705 switch (entry->type)
2706 {
2707 case KERNEL_LAUNCH:
2708 if (DEBUG_QUEUES)
2709 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2710 aq->agent->device_id, aq->id, index);
2711 run_kernel (entry->u.launch.kernel,
2712 entry->u.launch.vars,
2713 &entry->u.launch.kla, aq, false);
2714 if (DEBUG_QUEUES)
2715 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2716 aq->agent->device_id, aq->id, index);
2717 break;
2718
2719 case CALLBACK:
2720 if (DEBUG_QUEUES)
2721 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2722 aq->agent->device_id, aq->id, index);
2723 entry->u.callback.fn (entry->u.callback.data);
2724 if (DEBUG_QUEUES)
2725 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2726 aq->agent->device_id, aq->id, index);
2727 break;
2728
2729 case ASYNC_WAIT:
2730 {
2731 /* FIXME: is it safe to access a placeholder that may already have
2732 been executed? */
2733 struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;
2734
2735 if (DEBUG_QUEUES)
2736 GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2737 aq->agent->device_id, aq->id, index);
2738
2739 pthread_mutex_lock (&placeholderp->mutex);
2740
2741 while (!placeholderp->executed)
2742 pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);
2743
2744 pthread_mutex_unlock (&placeholderp->mutex);
2745
2746 if (pthread_cond_destroy (&placeholderp->cond))
2747 GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2748
2749 if (pthread_mutex_destroy (&placeholderp->mutex))
2750 GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2751
2752 if (DEBUG_QUEUES)
2753 GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2754 "entry (%d) done\n", aq->agent->device_id, aq->id, index);
2755 }
2756 break;
2757
2758 case ASYNC_PLACEHOLDER:
2759 pthread_mutex_lock (&entry->u.placeholder.mutex);
2760 entry->u.placeholder.executed = 1;
2761 pthread_cond_signal (&entry->u.placeholder.cond);
2762 pthread_mutex_unlock (&entry->u.placeholder.mutex);
2763 break;
2764
2765 default:
2766 GOMP_PLUGIN_fatal ("Unknown queue element");
2767 }
2768 }
2769
2770 /* This function is run as a thread to service an async queue in the
2771 background. It runs continuously until the stop flag is set. */
2772
2773 static void *
2774 drain_queue (void *thread_arg)
2775 {
2776 struct goacc_asyncqueue *aq = thread_arg;
2777
2778 if (DRAIN_QUEUE_SYNCHRONOUS_P)
2779 {
2780 aq->drain_queue_stop = 2;
2781 return NULL;
2782 }
2783
2784 pthread_mutex_lock (&aq->mutex);
2785
2786 while (true)
2787 {
2788 if (aq->drain_queue_stop)
2789 break;
2790
2791 if (aq->queue_n > 0)
2792 {
2793 pthread_mutex_unlock (&aq->mutex);
2794 execute_queue_entry (aq, aq->queue_first);
2795
2796 pthread_mutex_lock (&aq->mutex);
2797 aq->queue_first = ((aq->queue_first + 1)
2798 % ASYNC_QUEUE_SIZE);
2799 aq->queue_n--;
2800
2801 if (DEBUG_THREAD_SIGNAL)
2802 GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2803 aq->agent->device_id, aq->id);
2804 pthread_cond_broadcast (&aq->queue_cond_out);
2805 pthread_mutex_unlock (&aq->mutex);
2806
2807 if (DEBUG_QUEUES)
2808 GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
2809 aq->id);
2810 pthread_mutex_lock (&aq->mutex);
2811 }
2812 else
2813 {
2814 if (DEBUG_THREAD_SLEEP)
2815 GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2816 aq->agent->device_id, aq->id);
2817 pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
2818 if (DEBUG_THREAD_SLEEP)
2819 GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2820 aq->agent->device_id, aq->id);
2821 }
2822 }
2823
2824 aq->drain_queue_stop = 2;
2825 if (DEBUG_THREAD_SIGNAL)
2826 GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2827 aq->agent->device_id, aq->id);
2828 pthread_cond_broadcast (&aq->queue_cond_out);
2829 pthread_mutex_unlock (&aq->mutex);
2830
2831 GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
2832 return NULL;
2833 }
2834
2835 /* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2836 is not usually the case. This is just a debug tool. */
2837
2838 static void
2839 drain_queue_synchronous (struct goacc_asyncqueue *aq)
2840 {
2841 pthread_mutex_lock (&aq->mutex);
2842
2843 while (aq->queue_n > 0)
2844 {
2845 execute_queue_entry (aq, aq->queue_first);
2846
2847 aq->queue_first = ((aq->queue_first + 1)
2848 % ASYNC_QUEUE_SIZE);
2849 aq->queue_n--;
2850 }
2851
2852 pthread_mutex_unlock (&aq->mutex);
2853 }
2854
2855 /* Block the current thread until an async queue is writable. The aq->mutex
2856 lock should be held on entry, and remains locked on exit. */
2857
2858 static void
2859 wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
2860 {
2861 if (aq->queue_n == ASYNC_QUEUE_SIZE)
2862 {
2863 /* Queue is full. Wait for it to not be full. */
2864 while (aq->queue_n == ASYNC_QUEUE_SIZE)
2865 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2866 }
2867 }
2868
2869 /* Request an asynchronous kernel launch on the specified queue. This
2870 may block if the queue is full, but returns without waiting for the
2871 kernel to run. */
2872
2873 static void
2874 queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
2875 void *vars, struct GOMP_kernel_launch_attributes *kla)
2876 {
2877 assert (aq->agent == kernel->agent);
2878
2879 pthread_mutex_lock (&aq->mutex);
2880
2881 wait_for_queue_nonfull (aq);
2882
2883 int queue_last = ((aq->queue_first + aq->queue_n)
2884 % ASYNC_QUEUE_SIZE);
2885 if (DEBUG_QUEUES)
2886 GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
2887 aq->id, queue_last);
2888
2889 aq->queue[queue_last].type = KERNEL_LAUNCH;
2890 aq->queue[queue_last].u.launch.kernel = kernel;
2891 aq->queue[queue_last].u.launch.vars = vars;
2892 aq->queue[queue_last].u.launch.kla = *kla;
2893
2894 aq->queue_n++;
2895
2896 if (DEBUG_THREAD_SIGNAL)
2897 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2898 aq->agent->device_id, aq->id);
2899 pthread_cond_signal (&aq->queue_cond_in);
2900
2901 pthread_mutex_unlock (&aq->mutex);
2902 }
2903
2904 /* Request an asynchronous callback on the specified queue. The callback
2905 function will be called, with the given opaque data, from the appropriate
2906 async thread, when all previous items on that queue are complete. */
2907
2908 static void
2909 queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
2910 void *data)
2911 {
2912 pthread_mutex_lock (&aq->mutex);
2913
2914 wait_for_queue_nonfull (aq);
2915
2916 int queue_last = ((aq->queue_first + aq->queue_n)
2917 % ASYNC_QUEUE_SIZE);
2918 if (DEBUG_QUEUES)
2919 GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
2920 aq->id, queue_last);
2921
2922 aq->queue[queue_last].type = CALLBACK;
2923 aq->queue[queue_last].u.callback.fn = fn;
2924 aq->queue[queue_last].u.callback.data = data;
2925
2926 aq->queue_n++;
2927
2928 if (DEBUG_THREAD_SIGNAL)
2929 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2930 aq->agent->device_id, aq->id);
2931 pthread_cond_signal (&aq->queue_cond_in);
2932
2933 pthread_mutex_unlock (&aq->mutex);
2934 }
2935
2936 /* Request that a given async thread wait for another thread (unspecified) to
2937 reach the given placeholder. The wait will occur when all previous entries
2938 on the queue are complete. A placeholder is effectively a kind of signal
2939 which simply sets a flag when encountered in a queue. */
2940
2941 static void
2942 queue_push_asyncwait (struct goacc_asyncqueue *aq,
2943 struct placeholder *placeholderp)
2944 {
2945 pthread_mutex_lock (&aq->mutex);
2946
2947 wait_for_queue_nonfull (aq);
2948
2949 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2950 if (DEBUG_QUEUES)
2951 GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
2952 aq->id, queue_last);
2953
2954 aq->queue[queue_last].type = ASYNC_WAIT;
2955 aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
2956
2957 aq->queue_n++;
2958
2959 if (DEBUG_THREAD_SIGNAL)
2960 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2961 aq->agent->device_id, aq->id);
2962 pthread_cond_signal (&aq->queue_cond_in);
2963
2964 pthread_mutex_unlock (&aq->mutex);
2965 }
2966
2967 /* Add a placeholder into an async queue. When the async thread reaches the
2968 placeholder it will set the "executed" flag to true and continue.
2969 Another thread may be waiting on this thread reaching the placeholder. */
2970
2971 static struct placeholder *
2972 queue_push_placeholder (struct goacc_asyncqueue *aq)
2973 {
2974 struct placeholder *placeholderp;
2975
2976 pthread_mutex_lock (&aq->mutex);
2977
2978 wait_for_queue_nonfull (aq);
2979
2980 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2981 if (DEBUG_QUEUES)
2982 GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
2983 aq->id, queue_last);
2984
2985 aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
2986 placeholderp = &aq->queue[queue_last].u.placeholder;
2987
2988 if (pthread_mutex_init (&placeholderp->mutex, NULL))
2989 {
2990 pthread_mutex_unlock (&aq->mutex);
2991 GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
2992 }
2993
2994 if (pthread_cond_init (&placeholderp->cond, NULL))
2995 {
2996 pthread_mutex_unlock (&aq->mutex);
2997 GOMP_PLUGIN_error ("Failed to initialize serialization cond");
2998 }
2999
3000 placeholderp->executed = 0;
3001
3002 aq->queue_n++;
3003
3004 if (DEBUG_THREAD_SIGNAL)
3005 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
3006 aq->agent->device_id, aq->id);
3007 pthread_cond_signal (&aq->queue_cond_in);
3008
3009 pthread_mutex_unlock (&aq->mutex);
3010
3011 return placeholderp;
3012 }
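     /* Sketch of how the two halves pair up (the real callers live elsewhere
	in this plugin; the queue names here are illustrative only):

	  struct placeholder *p = queue_push_placeholder (aq_producer);
	  queue_push_asyncwait (aq_consumer, p);

	aq_consumer's thread then stalls at its ASYNC_WAIT entry until
	aq_producer's thread reaches the ASYNC_PLACEHOLDER entry and signals
	it.  */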
3013
3014 /* Signal an asynchronous thread to terminate, and wait for it to do so. */
3015
3016 static void
3017 finalize_async_thread (struct goacc_asyncqueue *aq)
3018 {
3019 pthread_mutex_lock (&aq->mutex);
3020 if (aq->drain_queue_stop == 2)
3021 {
3022 pthread_mutex_unlock (&aq->mutex);
3023 return;
3024 }
3025
3026 aq->drain_queue_stop = 1;
3027
3028 if (DEBUG_THREAD_SIGNAL)
3029 GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
3030 aq->agent->device_id, aq->id);
3031 pthread_cond_signal (&aq->queue_cond_in);
3032
3033 while (aq->drain_queue_stop != 2)
3034 {
3035 if (DEBUG_THREAD_SLEEP)
3036 GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
3037 " to sleep\n", aq->agent->device_id, aq->id);
3038 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
3039 if (DEBUG_THREAD_SLEEP)
3040 GCN_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
3041 aq->agent->device_id, aq->id);
3042 }
3043
3044 GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
3045 aq->id);
3046 pthread_mutex_unlock (&aq->mutex);
3047
3048 int err = pthread_join (aq->thread_drain_queue, NULL);
3049 if (err != 0)
3050 GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
3051 aq->agent->device_id, aq->id, strerror (err));
3052 GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
3053 }
3054
3055 /* Set up an async queue for OpenMP. There will be only one. The
3056 implementation simply uses an OpenACC async queue.
3057 FIXME: is this thread-safe if two threads call this function? */
3058
3059 static void
3060 maybe_init_omp_async (struct agent_info *agent)
3061 {
3062 if (!agent->omp_async_queue)
3063 agent->omp_async_queue
3064 = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
3065 }
3066
3067 /* A wrapper that works around an issue in the HSA runtime with host-to-device
3068 copies from read-only pages. */
3069
3070 static void
3071 hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
3072 {
3073 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
3074
3075 if (status == HSA_STATUS_SUCCESS)
3076 return;
3077
3078 /* It appears that the copy fails if the source data is in a read-only page.
3079 We can't detect that easily, so try copying the data to a temporary buffer
3080 and doing the copy again if we got an error above. */
3081
3082 GCN_WARNING ("Read-only data transfer bug workaround triggered for "
3083 "[%p:+%d]\n", (void *) src, (int) len);
3084
3085 void *src_copy = malloc (len);
3086 memcpy (src_copy, src, len);
3087 status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
3088 free (src_copy);
3089 if (status != HSA_STATUS_SUCCESS)
3090 GOMP_PLUGIN_error ("memory copy failed");
3091 }
3092
3093 /* Copy data to or from a device. This is intended for use as an async
3094 callback event. */
3095
3096 static void
3097 copy_data (void *data_)
3098 {
3099 struct copy_data *data = (struct copy_data *)data_;
3100 GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
3101 data->aq->agent->device_id, data->aq->id, data->len, data->src,
3102 data->dst);
3103 hsa_memory_copy_wrapper (data->dst, data->src, data->len);
3104 free (data);
3105 }
3106
3107 /* Request an asynchronous data copy, to or from a device, on a given queue.
3108 The event will be registered as a callback. */
3109
3110 static void
3111 queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
3112 size_t len)
3113 {
3114 if (DEBUG_QUEUES)
3115 GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
3116 aq->agent->device_id, aq->id, len, src, dst);
3117 struct copy_data *data
3118 = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
3119 data->dst = dst;
3120 data->src = src;
3121 data->len = len;
3122 data->aq = aq;
3123 queue_push_callback (aq, copy_data, data);
3124 }
3125
3126 /* Return true if the given queue is currently empty. */
3127
3128 static int
3129 queue_empty (struct goacc_asyncqueue *aq)
3130 {
3131 pthread_mutex_lock (&aq->mutex);
3132 int res = aq->queue_n == 0 ? 1 : 0;
3133 pthread_mutex_unlock (&aq->mutex);
3134
3135 return res;
3136 }
3137
3138 /* Wait for a given queue to become empty. This implements an OpenACC wait
3139 directive. */
3140
3141 static void
3142 wait_queue (struct goacc_asyncqueue *aq)
3143 {
3144 if (DRAIN_QUEUE_SYNCHRONOUS_P)
3145 {
3146 drain_queue_synchronous (aq);
3147 return;
3148 }
3149
3150 pthread_mutex_lock (&aq->mutex);
3151
3152 while (aq->queue_n > 0)
3153 {
3154 if (DEBUG_THREAD_SLEEP)
3155 GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
3156 aq->agent->device_id, aq->id);
3157 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
3158 if (DEBUG_THREAD_SLEEP)
3159 GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq->agent->device_id,
3160 aq->id);
3161 }
3162
3163 pthread_mutex_unlock (&aq->mutex);
3164 GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
3165 }
3166
3167 /* }}} */
3168 /* {{{ OpenACC support */
3169
3170 /* Execute an OpenACC kernel, synchronously or asynchronously. */
3171
3172 static void
3173 gcn_exec (struct kernel_info *kernel,
3174 void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
3175 struct goacc_asyncqueue *aq)
3176 {
3177 if (!GOMP_OFFLOAD_can_run (kernel))
3178 GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3179
3180 /* If we get here then this must be an OpenACC kernel. */
3181 kernel->kind = KIND_OPENACC;
3182
3183 struct hsa_kernel_description *hsa_kernel_desc = NULL;
3184 for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
3185 {
3186 struct hsa_kernel_description *d
3187 = &kernel->module->image_desc->kernel_infos[i];
3188 if (d->name == kernel->name)
3189 {
3190 hsa_kernel_desc = d;
3191 break;
3192 }
3193 }
3194
3195 /* We may have statically-determined dimensions in
3196 hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3197 invocation at runtime in dims[]. We allow static dimensions to take
3198 priority over dynamic dimensions when present (non-zero). */
3199 if (hsa_kernel_desc->oacc_dims[0] > 0)
3200 dims[0] = hsa_kernel_desc->oacc_dims[0];
3201 if (hsa_kernel_desc->oacc_dims[1] > 0)
3202 dims[1] = hsa_kernel_desc->oacc_dims[1];
3203 if (hsa_kernel_desc->oacc_dims[2] > 0)
3204 dims[2] = hsa_kernel_desc->oacc_dims[2];
3205
3206 /* Ideally, when a dimension isn't explicitly specified, we should
3207 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
3208 In practice, we tune for peak performance on BabelStream, which
3209 for OpenACC is currently 32 threads per CU. */
3210 if (dims[0] == 0 && dims[1] == 0)
3211 {
3212 /* If any of the OpenACC dimensions remain 0 then we get to pick a
3213 number. There isn't really a correct answer for this without a clue
3214 about the problem size, so let's do a reasonable number of workers
3215 and gangs. */
3216
3217 dims[0] = get_cu_count (kernel->agent) * 4; /* Gangs. */
3218 dims[1] = 8; /* Workers. */
3219 }
3220 else if (dims[0] == 0 && dims[1] > 0)
3221 {
3222 /* Auto-scale the number of gangs with the requested number of workers. */
3223 dims[0] = get_cu_count (kernel->agent) * (32 / dims[1]);
3224 }
3225 else if (dims[0] > 0 && dims[1] == 0)
3226 {
3227 /* Auto-scale the number of workers with the requested number of gangs. */
3228 dims[1] = get_cu_count (kernel->agent) * 32 / dims[0];
3229 if (dims[1] == 0)
3230 dims[1] = 1;
3231 if (dims[1] > 16)
3232 dims[1] = 16;
3233 }
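       /* Illustration of the auto-scaling above, on a hypothetical 60-CU
	  device: requesting dims[1] == 8 workers gives dims[0] = 60 * (32/8)
	  == 240 gangs; requesting dims[0] == 120 gangs gives dims[1] =
	  60 * 32 / 120 == 16 workers, which is already at the clamp
	  limit of 16.  */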
3234
3235 /* The incoming dimensions are expressed in terms of gangs, workers, and
3236 vectors. The HSA dimensions are expressed in terms of "work-items",
3237 which means multiples of vector lanes.
3238
3239 The "grid size" specifies the size of the problem space, and the
3240 "work-group size" specifies how much of that we want a single compute
3241 unit to chew on at once.
3242
3243 The three dimensions do not really correspond to hardware, but the
3244 important thing is that the HSA runtime will launch as many
3245 work-groups as it takes to process the entire grid, and each
3246 work-group will contain as many wave-fronts as it takes to process
3247 the work-items in that group.
3248
3249 Essentially, as long as we set the Y dimension to 64 (the number of
3250 vector lanes in hardware), and the Z group size to the maximum (16),
3251 then we will get the gangs (X) and workers (Z) launched as we expect.
3252
3253 The reason for the apparent reversal of vector and worker dimension
3254 order is to do with the way the run-time distributes work-items across
3255 v1 and v2. */
3256 struct GOMP_kernel_launch_attributes kla =
3257 {3,
3258 /* Grid size. */
3259 {dims[0], 64, dims[1]},
3260 /* Work-group size. */
3261 {1, 64, 16}
3262 };
3263
3264 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3265 acc_prof_info *prof_info = thr->prof_info;
3266 acc_event_info enqueue_launch_event_info;
3267 acc_api_info *api_info = thr->api_info;
3268 bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
3269 if (profiling_dispatch_p)
3270 {
3271 prof_info->event_type = acc_ev_enqueue_launch_start;
3272
3273 enqueue_launch_event_info.launch_event.event_type
3274 = prof_info->event_type;
3275 enqueue_launch_event_info.launch_event.valid_bytes
3276 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
3277 enqueue_launch_event_info.launch_event.parent_construct
3278 = acc_construct_parallel;
3279 enqueue_launch_event_info.launch_event.implicit = 1;
3280 enqueue_launch_event_info.launch_event.tool_info = NULL;
3281 enqueue_launch_event_info.launch_event.kernel_name
3282 = (char *) kernel->name;
3283 enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
3284 enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
3285 enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
3286
3287 api_info->device_api = acc_device_api_other;
3288
3289 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3290 &enqueue_launch_event_info, api_info);
3291 }
3292
3293 if (!async)
3294 run_kernel (kernel, devaddrs, &kla, NULL, false);
3295 else
3296 queue_push_launch (aq, kernel, devaddrs, &kla);
3297
3298 if (profiling_dispatch_p)
3299 {
3300 prof_info->event_type = acc_ev_enqueue_launch_end;
3301 enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
3302 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3303 &enqueue_launch_event_info,
3304 api_info);
3305 }
3306 }
3307
3308 /* }}} */
3309 /* {{{ Generic Plugin API */
3310
3311 /* Return the name of the accelerator, which is "gcn". */
3312
3313 const char *
3314 GOMP_OFFLOAD_get_name (void)
3315 {
3316 return "gcn";
3317 }
3318
3319 /* Return the specific capabilities that the HSA accelerator has. */
3320
3321 unsigned int
3322 GOMP_OFFLOAD_get_caps (void)
3323 {
3324 /* FIXME: Enable shared memory for APU, but not discrete GPU. */
3325 return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3326 | GOMP_OFFLOAD_CAP_OPENACC_200;
3327 }
3328
3329 /* Identify as GCN accelerator. */
3330
3331 int
3332 GOMP_OFFLOAD_get_type (void)
3333 {
3334 return OFFLOAD_TARGET_TYPE_GCN;
3335 }
3336
3337 /* Return the libgomp version number we're compatible with. There is
3338 no requirement for cross-version compatibility. */
3339
3340 unsigned
3341 GOMP_OFFLOAD_version (void)
3342 {
3343 return GOMP_VERSION;
3344 }
3345
3346 /* Return the number of GCN devices on the system. */
3347
3348 int
3349 GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
3350 {
3351 if (!init_hsa_context (true))
3352 exit (EXIT_FAILURE);
3353   /* Return -1 if the omp_requires_mask cannot be fulfilled, but
3354      devices were present.  */
3355 if (hsa_context.agent_count > 0
3356 && ((omp_requires_mask
3357 & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
3358 | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
3359 return -1;
3360 return hsa_context.agent_count;
3361 }
3362
3363 /* Initialize device (agent) number N so that it can be used for computation.
3364 Return TRUE on success. */
3365
3366 bool
3367 GOMP_OFFLOAD_init_device (int n)
3368 {
3369 if (!init_hsa_context (false))
3370 return false;
3371 if (n >= hsa_context.agent_count)
3372 {
3373 GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
3374 return false;
3375 }
3376 struct agent_info *agent = &hsa_context.agents[n];
3377
3378 if (agent->initialized)
3379 return true;
3380
3381 agent->device_id = n;
3382
3383 if (pthread_rwlock_init (&agent->module_rwlock, NULL))
3384 {
3385 GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3386 return false;
3387 }
3388 if (pthread_mutex_init (&agent->prog_mutex, NULL))
3389 {
3390 GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3391 return false;
3392 }
3393 if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
3394 {
3395 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3396 return false;
3397 }
3398 if (pthread_mutex_init (&agent->ephemeral_memories_write_lock, NULL))
3399 {
3400 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3401 return false;
3402 }
3403 agent->async_queues = NULL;
3404 agent->omp_async_queue = NULL;
3405 agent->ephemeral_memories_list = NULL;
3406
3407 uint32_t queue_size;
3408 hsa_status_t status;
3409 status = hsa_fns.hsa_agent_get_info_fn (agent->id,
3410 HSA_AGENT_INFO_QUEUE_MAX_SIZE,
3411 &queue_size);
3412 if (status != HSA_STATUS_SUCCESS)
3413 return hsa_error ("Error requesting maximum queue size of the GCN agent",
3414 status);
3415
3416 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
3417 &agent->name);
3418 if (status != HSA_STATUS_SUCCESS)
3419 return hsa_error ("Error querying the name of the agent", status);
3420
3421 agent->device_isa = isa_code (agent->name);
3422 if (agent->device_isa == EF_AMDGPU_MACH_UNSUPPORTED)
3423 return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);
3424
3425 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
3426 &agent->vendor_name);
3427 if (status != HSA_STATUS_SUCCESS)
3428 return hsa_error ("Error querying the vendor name of the agent", status);
3429
3430 status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
3431 HSA_QUEUE_TYPE_MULTI,
3432 hsa_queue_callback, NULL, UINT32_MAX,
3433 UINT32_MAX, &agent->sync_queue);
3434 if (status != HSA_STATUS_SUCCESS)
3435 return hsa_error ("Error creating command queue", status);
3436
3437 agent->kernarg_region.handle = (uint64_t) -1;
3438 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3439 get_kernarg_memory_region,
3440 &agent->kernarg_region);
3441 if (status != HSA_STATUS_SUCCESS
3442 && status != HSA_STATUS_INFO_BREAK)
3443 hsa_error ("Scanning memory regions failed", status);
3444 if (agent->kernarg_region.handle == (uint64_t) -1)
3445 {
3446 GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3447 "arguments");
3448 return false;
3449 }
3450 GCN_DEBUG ("Selected kernel arguments memory region:\n");
3451 dump_hsa_region (agent->kernarg_region, NULL);
3452
3453 agent->data_region.handle = (uint64_t) -1;
3454 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3455 get_data_memory_region,
3456 &agent->data_region);
3457 if (status != HSA_STATUS_SUCCESS
3458 && status != HSA_STATUS_INFO_BREAK)
3459 hsa_error ("Scanning memory regions failed", status);
3460 if (agent->data_region.handle == (uint64_t) -1)
3461 {
3462 GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3463 "data");
3464 return false;
3465 }
3466 GCN_DEBUG ("Selected device data memory region:\n");
3467 dump_hsa_region (agent->data_region, NULL);
3468
3469 GCN_DEBUG ("GCN agent %d initialized\n", n);
3470
3471 agent->initialized = true;
3472 return true;
3473 }
3474
3475 /* Load GCN object-code module described by struct gcn_image_desc in
3476 TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
3477 If there are any constructors then run them. If not NULL, REV_FN_TABLE will
3478 contain the on-device addresses of the functions for reverse offload. To be
3479 freed by the caller. */
3480
3481 int
3482 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
3483 struct addr_pair **target_table,
3484 uint64_t **rev_fn_table,
3485 uint64_t *host_ind_fn_table)
3486 {
3487 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3488 {
3489 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3490 " (expected %u, received %u)",
3491 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3492 return -1;
3493 }
3494
3495 struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
3496 struct agent_info *agent;
3497 struct addr_pair *pair;
3498 struct module_info *module;
3499 struct kernel_info *kernel;
3500 int kernel_count = image_desc->kernel_count;
3501 unsigned ind_func_count = GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version)
3502 ? image_desc->ind_func_count : 0;
3503 unsigned var_count = image_desc->global_variable_count;
3504 /* Currently, "others" is a struct of ICVS. */
3505 int other_count = 1;
3506
3507 agent = get_agent_info (ord);
3508 if (!agent)
3509 return -1;
3510
3511 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3512 {
3513 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3514 return -1;
3515 }
3516 if (agent->prog_finalized
3517 && !destroy_hsa_program (agent))
3518 return -1;
3519
3520 GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
3521 GCN_DEBUG ("Encountered %d indirect functions in an image\n", ind_func_count);
3522 GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
3523 GCN_DEBUG ("Expect %d other variables in an image\n", other_count);
3524 pair = GOMP_PLUGIN_malloc ((kernel_count + var_count + other_count - 2)
3525 * sizeof (struct addr_pair));
3526 *target_table = pair;
3527 module = (struct module_info *)
3528 GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
3529 + kernel_count * sizeof (struct kernel_info));
3530 module->image_desc = image_desc;
3531 module->kernel_count = kernel_count;
3532 module->heap = NULL;
3533 module->constructors_run_p = false;
3534
3535 kernel = &module->kernels[0];
3536
3537 /* Allocate memory for kernel dependencies. */
3538 for (unsigned i = 0; i < kernel_count; i++)
3539 {
3540 struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
3541 if (!init_basic_kernel_info (kernel, d, agent, module))
3542 return -1;
3543 if (strcmp (d->name, "_init_array") == 0)
3544 module->init_array_func = kernel;
3545 else if (strcmp (d->name, "_fini_array") == 0)
3546 module->fini_array_func = kernel;
3547 else
3548 {
3549 pair->start = (uintptr_t) kernel;
3550 pair->end = (uintptr_t) (kernel + 1);
3551 pair++;
3552 }
3553 kernel++;
3554 }
3555
3556 agent->module = module;
3557 if (pthread_rwlock_unlock (&agent->module_rwlock))
3558 {
3559 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3560 return -1;
3561 }
3562
3563 if (!create_and_finalize_hsa_program (agent))
3564 return -1;
3565
3566 if (var_count > 0)
3567 {
3568 hsa_status_t status;
3569 hsa_executable_symbol_t var_symbol;
3570 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3571 ".offload_var_table",
3572 agent->id,
3573 0, &var_symbol);
3574
3575 if (status != HSA_STATUS_SUCCESS)
3576 hsa_fatal ("Could not find symbol for variable in the code object",
3577 status);
3578
3579 uint64_t var_table_addr;
3580 status = hsa_fns.hsa_executable_symbol_get_info_fn
3581 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3582 &var_table_addr);
3583 if (status != HSA_STATUS_SUCCESS)
3584 hsa_fatal ("Could not extract a variable from its symbol", status);
3585
3586 struct {
3587 uint64_t addr;
3588 uint64_t size;
3589 } var_table[var_count];
3590 GOMP_OFFLOAD_dev2host (agent->device_id, var_table,
3591 (void*)var_table_addr, sizeof (var_table));
3592
3593 for (unsigned i = 0; i < var_count; i++)
3594 {
3595 pair->start = var_table[i].addr;
3596 pair->end = var_table[i].addr + var_table[i].size;
3597 GCN_DEBUG ("Found variable at %p with size %lu\n",
3598 (void *)var_table[i].addr, var_table[i].size);
3599 pair++;
3600 }
3601 }
3602
3603 if (ind_func_count > 0)
3604 {
3605 hsa_status_t status;
3606
3607 /* Read indirect function table from image. */
3608 hsa_executable_symbol_t ind_funcs_symbol;
3609 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3610 ".offload_ind_func_table",
3611 agent->id,
3612 0, &ind_funcs_symbol);
3613
3614 if (status != HSA_STATUS_SUCCESS)
3615 hsa_fatal ("Could not find .offload_ind_func_table symbol in the "
3616 "code object", status);
3617
3618 uint64_t ind_funcs_table_addr;
3619 status = hsa_fns.hsa_executable_symbol_get_info_fn
3620 (ind_funcs_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3621 &ind_funcs_table_addr);
3622 if (status != HSA_STATUS_SUCCESS)
3623 hsa_fatal ("Could not extract a variable from its symbol", status);
3624
3625 uint64_t ind_funcs_table[ind_func_count];
3626 GOMP_OFFLOAD_dev2host (agent->device_id, ind_funcs_table,
3627 (void*) ind_funcs_table_addr,
3628 sizeof (ind_funcs_table));
3629
3630 /* Build host->target address map for indirect functions. */
3631 uint64_t ind_fn_map[ind_func_count * 2 + 1];
3632 for (unsigned i = 0; i < ind_func_count; i++)
3633 {
3634 ind_fn_map[i * 2] = host_ind_fn_table[i];
3635 ind_fn_map[i * 2 + 1] = ind_funcs_table[i];
3636 GCN_DEBUG ("Indirect function %d: %lx->%lx\n",
3637 i, host_ind_fn_table[i], ind_funcs_table[i]);
3638 }
3639 ind_fn_map[ind_func_count * 2] = 0;
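      /* ind_fn_map is now a flat array of host/device address pairs with a
         zero terminator, e.g. for two indirect functions (illustrative names
         only): { host_fn0, dev_fn0, host_fn1, dev_fn1, 0 }.  The array is
         copied to the device and its address stored in
         GOMP_INDIRECT_ADDR_MAP below.  */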
3640
3641 /* Write the map onto the target. */
3642 void *map_target_addr
3643 = GOMP_OFFLOAD_alloc (agent->device_id, sizeof (ind_fn_map));
3644 GCN_DEBUG ("Allocated indirect map at %p\n", map_target_addr);
3645
3646 GOMP_OFFLOAD_host2dev (agent->device_id, map_target_addr,
3647 (void*) ind_fn_map,
3648 sizeof (ind_fn_map));
3649
3650 /* Write address of the map onto the target. */
3651 hsa_executable_symbol_t symbol;
3652
3653 status
3654 = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3655 XSTRING (GOMP_INDIRECT_ADDR_MAP),
3656 agent->id, 0, &symbol);
3657 if (status != HSA_STATUS_SUCCESS)
3658 hsa_fatal ("Could not find GOMP_INDIRECT_ADDR_MAP in code object",
3659 status);
3660
3661 uint64_t varptr;
3662 uint32_t varsize;
3663
3664 status = hsa_fns.hsa_executable_symbol_get_info_fn
3665 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3666 &varptr);
3667 if (status != HSA_STATUS_SUCCESS)
3668 hsa_fatal ("Could not extract a variable from its symbol", status);
3669 status = hsa_fns.hsa_executable_symbol_get_info_fn
3670 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
3671 &varsize);
3672 if (status != HSA_STATUS_SUCCESS)
3673 hsa_fatal ("Could not extract a variable size from its symbol",
3674 status);
3675
3676 GCN_DEBUG ("Found GOMP_INDIRECT_ADDR_MAP at %lx with size %d\n",
3677 varptr, varsize);
3678
3679 GOMP_OFFLOAD_host2dev (agent->device_id, (void *) varptr,
3680 &map_target_addr,
3681 sizeof (map_target_addr));
3682 }
3683
3684 GCN_DEBUG ("Looking for variable %s\n", XSTRING (GOMP_ADDITIONAL_ICVS));
3685
3686 hsa_status_t status;
3687 hsa_executable_symbol_t var_symbol;
3688 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3689 XSTRING (GOMP_ADDITIONAL_ICVS),
3690 agent->id, 0, &var_symbol);
3691 if (status == HSA_STATUS_SUCCESS)
3692 {
3693 uint64_t varptr;
3694 uint32_t varsize;
3695
3696 status = hsa_fns.hsa_executable_symbol_get_info_fn
3697 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3698 &varptr);
3699 if (status != HSA_STATUS_SUCCESS)
3700 hsa_fatal ("Could not extract a variable from its symbol", status);
3701 status = hsa_fns.hsa_executable_symbol_get_info_fn
3702 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
3703 &varsize);
3704 if (status != HSA_STATUS_SUCCESS)
3705 hsa_fatal ("Could not extract a variable size from its symbol",
3706 status);
3707
3708 pair->start = varptr;
3709 pair->end = varptr + varsize;
3710 }
3711 else
3712 {
3713 /* The variable was not in this image. */
3714 GCN_DEBUG ("Variable not found in image: %s\n",
3715 XSTRING (GOMP_ADDITIONAL_ICVS));
3716 pair->start = pair->end = 0;
3717 }
3718
3719 /* Ensure that constructors are run first. */
3720 struct GOMP_kernel_launch_attributes kla =
3721 { 3,
3722 /* Grid size. */
3723 { 1, 64, 1 },
3724 /* Work-group size. */
3725 { 1, 64, 1 }
3726 };
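  /* I.e. launch a single work-group of 64 work-items -- one GCN
     wavefront.  */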
3727
3728 if (module->init_array_func)
3729 {
3730 init_kernel (module->init_array_func);
3731 run_kernel (module->init_array_func, NULL, &kla, NULL, false);
3732 }
3733 module->constructors_run_p = true;
3734
3735 /* Don't report kernels that libgomp need not know about. */
3736 if (module->init_array_func)
3737 kernel_count--;
3738 if (module->fini_array_func)
3739 kernel_count--;
3740
3741 if (rev_fn_table != NULL && kernel_count == 0)
3742 *rev_fn_table = NULL;
3743 else if (rev_fn_table != NULL)
3744 {
3745 hsa_status_t status;
3746 hsa_executable_symbol_t var_symbol;
3747 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3748 ".offload_func_table",
3749 agent->id, 0, &var_symbol);
3750 if (status != HSA_STATUS_SUCCESS)
3751 hsa_fatal ("Could not find symbol for variable in the code object",
3752 status);
3753 uint64_t fn_table_addr;
3754 status = hsa_fns.hsa_executable_symbol_get_info_fn
3755 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3756 &fn_table_addr);
3757 if (status != HSA_STATUS_SUCCESS)
3758 hsa_fatal ("Could not extract a variable from its symbol", status);
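      /* ".offload_func_table" holds one on-device function address per
         kernel; copy them into a fresh array for libgomp to use when
         resolving reverse-offload calls.  */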
3759 *rev_fn_table = GOMP_PLUGIN_malloc (kernel_count * sizeof (uint64_t));
3760 GOMP_OFFLOAD_dev2host (agent->device_id, *rev_fn_table,
3761 (void*) fn_table_addr,
3762 kernel_count * sizeof (uint64_t));
3763 }
3764
3765 return kernel_count + var_count + other_count;
3766 }
3767
3768 /* Unload GCN object-code module described by struct gcn_image_desc in
3769 TARGET_DATA from agent number N. Return TRUE on success. */
3770
3771 bool
3772 GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
3773 {
3774 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3775 {
3776 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3777 " (expected %u, received %u)",
3778 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3779 return false;
3780 }
3781
3782 struct agent_info *agent;
3783 agent = get_agent_info (n);
3784 if (!agent)
3785 return false;
3786
3787 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3788 {
3789 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3790 return false;
3791 }
3792
3793 if (!agent->module || agent->module->image_desc != target_data)
3794 {
3795 GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
3796 "loaded before");
3797 return false;
3798 }
3799
3800 if (!destroy_module (agent->module, true))
3801 return false;
3802 free (agent->module);
3803 agent->module = NULL;
3804 if (!destroy_hsa_program (agent))
3805 return false;
3806 if (pthread_rwlock_unlock (&agent->module_rwlock))
3807 {
3808 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3809 return false;
3810 }
3811 return true;
3812 }
3813
3814 /* Deinitialize all information and status associated with agent number N. We
3815 do not attempt any synchronization, assuming the user and libgomp will not
3816 attempt deinitialization of a device that is in any way being used at the
3817 same time. Return TRUE on success. */
3818
3819 bool
3820 GOMP_OFFLOAD_fini_device (int n)
3821 {
3822 struct agent_info *agent = get_agent_info (n);
3823 if (!agent)
3824 return false;
3825
3826 if (!agent->initialized)
3827 return true;
3828
3829 if (agent->omp_async_queue)
3830 {
3831 GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
3832 agent->omp_async_queue = NULL;
3833 }
3834
3835 if (agent->module)
3836 {
3837 if (!destroy_module (agent->module, false))
3838 return false;
3839 free (agent->module);
3840 agent->module = NULL;
3841 }
3842
3843 if (!destroy_ephemeral_memories (agent))
3844 return false;
3845
3846 if (!destroy_hsa_program (agent))
3847 return false;
3848
3849 hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
3850 if (status != HSA_STATUS_SUCCESS)
3851 return hsa_error ("Error destroying command queue", status);
3852
3853 if (pthread_mutex_destroy (&agent->prog_mutex))
3854 {
3855 GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
3856 return false;
3857 }
3858 if (pthread_rwlock_destroy (&agent->module_rwlock))
3859 {
3860 GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
3861 return false;
3862 }
3863
3864 if (pthread_mutex_destroy (&agent->async_queues_mutex))
3865 {
3866 GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
3867 return false;
3868 }
3869 if (pthread_mutex_destroy (&agent->ephemeral_memories_write_lock))
3870 {
3871 GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
3872 return false;
3873 }
3874 agent->initialized = false;
3875 return true;
3876 }
3877
3878 /* Return true if the HSA runtime can run function FN_PTR. */
3879
3880 bool
3881 GOMP_OFFLOAD_can_run (void *fn_ptr)
3882 {
3883 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3884
3885 init_kernel (kernel);
3886 if (kernel->initialization_failed)
3887 GOMP_PLUGIN_fatal ("kernel initialization failed");
3888
3889 return true;
3890 }
3891
3892 /* Allocate memory on device N. */
3893
3894 void *
3895 GOMP_OFFLOAD_alloc (int n, size_t size)
3896 {
3897 struct agent_info *agent = get_agent_info (n);
3898 return alloc_by_agent (agent, size);
3899 }
3900
3901 /* Free memory from device N. */
3902
3903 bool
3904 GOMP_OFFLOAD_free (int device, void *ptr)
3905 {
3906 GCN_DEBUG ("Freeing memory on device %d\n", device);
3907
3908 hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
3909 if (status != HSA_STATUS_SUCCESS)
3910 {
3911 hsa_error ("Could not free device memory", status);
3912 return false;
3913 }
3914
3915 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3916 bool profiling_dispatch_p
3917 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
3918 if (profiling_dispatch_p)
3919 {
3920 acc_prof_info *prof_info = thr->prof_info;
3921 acc_event_info data_event_info;
3922 acc_api_info *api_info = thr->api_info;
3923
3924 prof_info->event_type = acc_ev_free;
3925
3926 data_event_info.data_event.event_type = prof_info->event_type;
3927 data_event_info.data_event.valid_bytes
3928 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
3929 data_event_info.data_event.parent_construct
3930 = acc_construct_parallel;
3931 data_event_info.data_event.implicit = 1;
3932 data_event_info.data_event.tool_info = NULL;
3933 data_event_info.data_event.var_name = NULL;
3934 data_event_info.data_event.bytes = 0;
3935 data_event_info.data_event.host_ptr = NULL;
3936 data_event_info.data_event.device_ptr = (void *) ptr;
3937
3938 api_info->device_api = acc_device_api_other;
3939
3940 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
3941 api_info);
3942 }
3943
3944 return true;
3945 }
3946
3947 /* Copy data from DEVICE to host. */
3948
3949 bool
3950 GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
3951 {
3952 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
3953 src, dst);
3954 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3955 if (status != HSA_STATUS_SUCCESS)
3956 GOMP_PLUGIN_error ("memory copy failed");
3957 return true;
3958 }
3959
3960 /* Copy data from host to DEVICE. */
3961
3962 bool
3963 GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
3964 {
3965 GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
3966 device, dst);
3967 hsa_memory_copy_wrapper (dst, src, n);
3968 return true;
3969 }
3970
3971 /* Copy data within DEVICE. Do the copy asynchronously, if appropriate. */
3972
3973 bool
3974 GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
3975 {
3976 struct gcn_thread *thread_data = gcn_thread ();
3977
3978 if (thread_data && !async_synchronous_p (thread_data->async))
3979 {
3980 struct agent_info *agent = get_agent_info (device);
3981 maybe_init_omp_async (agent);
3982 queue_push_copy (agent->omp_async_queue, dst, src, n);
3983 return true;
3984 }
3985
3986 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
3987 device, src, device, dst);
3988 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3989 if (status != HSA_STATUS_SUCCESS)
3990 GOMP_PLUGIN_error ("memory copy failed");
3991 return true;
3992 }
3993
3994 /* Here <quantity>_size refers to <quantity> multiplied by size -- i.e.
3995 measured in bytes. So we have:
3996
3997 dim1_size: number of bytes to copy on innermost dimension ("row")
3998 dim0_len: number of rows to copy
3999 dst: base pointer for destination of copy
4000 dst_offset1_size: innermost row offset (for dest), in bytes
4001 dst_offset0_len: offset, number of rows (for dest)
4002 dst_dim1_size: whole-array dest row length, in bytes (pitch)
4003 src: base pointer for source of copy
4004 src_offset1_size: innermost row offset (for source), in bytes
4005 src_offset0_len: offset, number of rows (for source)
4006 src_dim1_size: whole-array source row length, in bytes (pitch)
4007 */
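/* Illustrative example (values chosen arbitrarily, dev_ord standing for a
   device ordinal): copying a 16-byte-wide, 8-row sub-block that starts at
   byte offset 4 of row 2 in a host array with a 64-byte row pitch to the
   same position in a device array with a 128-byte pitch would be

     GOMP_OFFLOAD_memcpy2d (dev_ord, -1, 16, 8,
                            dst, 4, 2, 128,
                            src, 4, 2, 64);

   where -1 denotes host memory.  */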
4008
4009 int
4010 GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
4011 size_t dim0_len, void *dst, size_t dst_offset1_size,
4012 size_t dst_offset0_len, size_t dst_dim1_size,
4013 const void *src, size_t src_offset1_size,
4014 size_t src_offset0_len, size_t src_dim1_size)
4015 {
4016 if (!hsa_fns.hsa_amd_memory_lock_fn
4017 || !hsa_fns.hsa_amd_memory_unlock_fn
4018 || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
4019 return -1;
4020
4021 /* GCN hardware requires 4-byte alignment for base addresses & pitches. Bail
4022 out quietly if we have anything oddly-aligned rather than letting the
4023 driver raise an error. */
4024 if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
4025 return -1;
4026
4027 if ((dst_dim1_size & 3) != 0 || (src_dim1_size & 3) != 0)
4028 return -1;
4029
4030 /* Only handle host to device or device to host transfers here. */
4031 if ((dst_ord == -1 && src_ord == -1)
4032 || (dst_ord != -1 && src_ord != -1))
4033 return -1;
4034
4035 hsa_amd_copy_direction_t dir
4036 = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
4037 hsa_agent_t copy_agent;
4038
4039 /* We need to pin (lock) host memory before we start the transfer. Try to
4040 lock the minimum size necessary, i.e. using partial first/last rows of the
4041 whole array. Something like this:
4042
4043 rows -->
4044 ..............
4045 c | ..#######+++++ <- first row apart from {src,dst}_offset1_size
4046 o | ++#######+++++ <- whole row
4047 l | ++#######+++++ <- "
4048 s v ++#######..... <- last row apart from trailing remainder
4049 ..............
4050
4051 We could split very large transfers into several rectangular copies, but
4052 that is unimplemented for now. */
4053
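  /* I.e. pin from the first byte of the sub-block up to the last byte of its
     final row: pitch * (rows - 1) + row_bytes.  */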
4054 size_t bounded_size_host, first_elem_offset_host;
4055 void *host_ptr;
4056 if (dir == hsaHostToDevice)
4057 {
4058 bounded_size_host = src_dim1_size * (dim0_len - 1) + dim1_size;
4059 first_elem_offset_host = src_offset0_len * src_dim1_size
4060 + src_offset1_size;
4061 host_ptr = (void *) src;
4062 struct agent_info *agent = get_agent_info (dst_ord);
4063 copy_agent = agent->id;
4064 }
4065 else
4066 {
4067 bounded_size_host = dst_dim1_size * (dim0_len - 1) + dim1_size;
4068 first_elem_offset_host = dst_offset0_len * dst_dim1_size
4069 + dst_offset1_size;
4070 host_ptr = dst;
4071 struct agent_info *agent = get_agent_info (src_ord);
4072 copy_agent = agent->id;
4073 }
4074
4075 void *agent_ptr;
4076
4077 hsa_status_t status
4078 = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
4079 bounded_size_host, NULL, 0, &agent_ptr);
4080 /* We can't lock the host memory: don't give up though, we might still be
4081 able to use the slow path in our caller. So, don't make this an
4082 error. */
4083 if (status != HSA_STATUS_SUCCESS)
4084 return -1;
4085
4086 hsa_pitched_ptr_t dstpp, srcpp;
4087 hsa_dim3_t dst_offsets, src_offsets, ranges;
4088
4089 int retval = 1;
4090
4091 hsa_signal_t completion_signal;
4092 status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
4093 if (status != HSA_STATUS_SUCCESS)
4094 {
4095 retval = -1;
4096 goto unlock;
4097 }
4098
4099 if (dir == hsaHostToDevice)
4100 {
4101 srcpp.base = agent_ptr - first_elem_offset_host;
4102 dstpp.base = dst;
4103 }
4104 else
4105 {
4106 srcpp.base = (void *) src;
4107 dstpp.base = agent_ptr - first_elem_offset_host;
4108 }
4109
4110 srcpp.pitch = src_dim1_size;
4111 srcpp.slice = 0;
4112
4113 src_offsets.x = src_offset1_size;
4114 src_offsets.y = src_offset0_len;
4115 src_offsets.z = 0;
4116
4117 dstpp.pitch = dst_dim1_size;
4118 dstpp.slice = 0;
4119
4120 dst_offsets.x = dst_offset1_size;
4121 dst_offsets.y = dst_offset0_len;
4122 dst_offsets.z = 0;
4123
4124 ranges.x = dim1_size;
4125 ranges.y = dim0_len;
4126 ranges.z = 1;
4127
4128 status
4129 = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
4130 &src_offsets, &ranges,
4131 copy_agent, dir, 0, NULL,
4132 completion_signal);
4133 /* If the rectangular copy fails, we might still be able to use the slow
4134 path. We need to unlock the host memory though, so don't return
4135 immediately. */
4136 if (status != HSA_STATUS_SUCCESS)
4137 retval = -1;
4138 else
4139 hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
4140 HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
4141 HSA_WAIT_STATE_ACTIVE);
4142
4143 hsa_fns.hsa_signal_destroy_fn (completion_signal);
4144
4145 unlock:
4146 status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
4147 if (status != HSA_STATUS_SUCCESS)
4148 hsa_fatal ("Could not unlock host memory", status);
4149
4150 return retval;
4151 }
4152
4153 /* As above, <quantity>_size refers to <quantity> multiplied by size -- i.e.
4154 measured in bytes. So we have:
4155
4156 dim2_size: number of bytes to copy on innermost dimension ("row")
4157 dim1_len: number of rows per slice to copy
4158 dim0_len: number of slices to copy
4159 dst: base pointer for destination of copy
4160 dst_offset2_size: innermost row offset (for dest), in bytes
4161 dst_offset1_len: offset, number of rows (for dest)
4162 dst_offset0_len: offset, number of slices (for dest)
4163 dst_dim2_size: whole-array dest row length, in bytes (pitch)
4164 dst_dim1_len: whole-array number of rows in slice (for dest)
4165 src: base pointer for source of copy
4166 src_offset2_size: innermost row offset (for source), in bytes
4167 src_offset1_len: offset, number of rows (for source)
4168 src_offset0_len: offset, number of slices (for source)
4169 src_dim2_size: whole-array source row length, in bytes (pitch)
4170 src_dim1_len: whole-array number of rows in slice (for source)
4171 */
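/* Illustrative example (arbitrary values, dev_ord standing for a device
   ordinal): copying an 8-byte-wide, 4-row, 2-slice sub-block starting at
   byte offset 4, row 1, slice 0 of a host array with a 32-byte row pitch and
   16 rows per slice to the same position in a device array with a 64-byte
   pitch and 16 rows per slice would be

     GOMP_OFFLOAD_memcpy3d (dev_ord, -1, 8, 4, 2,
                            dst, 4, 1, 0, 64, 16,
                            src, 4, 1, 0, 32, 16);

   with -1 again denoting host memory.  */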
4172
4173 int
4174 GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
4175 size_t dim1_len, size_t dim0_len, void *dst,
4176 size_t dst_offset2_size, size_t dst_offset1_len,
4177 size_t dst_offset0_len, size_t dst_dim2_size,
4178 size_t dst_dim1_len, const void *src,
4179 size_t src_offset2_size, size_t src_offset1_len,
4180 size_t src_offset0_len, size_t src_dim2_size,
4181 size_t src_dim1_len)
4182 {
4183 if (!hsa_fns.hsa_amd_memory_lock_fn
4184 || !hsa_fns.hsa_amd_memory_unlock_fn
4185 || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
4186 return -1;
4187
4188 /* GCN hardware requires 4-byte alignment for base addresses & pitches. Bail
4189 out quietly if we have anything oddly-aligned rather than letting the
4190 driver raise an error. */
4191 if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
4192 return -1;
4193
4194 if ((dst_dim2_size & 3) != 0 || (src_dim2_size & 3) != 0)
4195 return -1;
4196
4197 /* Only handle host to device or device to host transfers here. */
4198 if ((dst_ord == -1 && src_ord == -1)
4199 || (dst_ord != -1 && src_ord != -1))
4200 return -1;
4201
4202 hsa_amd_copy_direction_t dir
4203 = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
4204 hsa_agent_t copy_agent;
4205
4206 /* We need to pin (lock) host memory before we start the transfer. Try to
4207 lock the minimum size necessary, i.e. using partial first/last slices of
4208 the whole 3D array. Something like this:
4209
4210         slice 0:             slice 1:             slice 2:
4211          __________           __________           __________
4212      ^  /+++++++++/ :        /+++++++++/ :        /         /
4213 column /+++##++++/| |       /+++##++++/| |       /+++##    /       # = subarray
4214    /  /+++##++++/ | |      /+++##++++/ | |      /+++##++++/        + = area to pin
4215   /  /_________/  :       /_________/  :        /_________/
4216               row --->
4217
4218 We could split very large transfers into several rectangular copies, but
4219 that is unimplemented for now. */
4220
4221 size_t bounded_size_host, first_elem_offset_host;
4222 void *host_ptr;
4223 if (dir == hsaHostToDevice)
4224 {
4225 size_t slice_bytes = src_dim2_size * src_dim1_len;
4226 bounded_size_host = slice_bytes * (dim0_len - 1)
4227 + src_dim2_size * (dim1_len - 1)
4228 + dim2_size;
4229 first_elem_offset_host = src_offset0_len * slice_bytes
4230 + src_offset1_len * src_dim2_size
4231 + src_offset2_size;
4232 host_ptr = (void *) src;
4233 struct agent_info *agent = get_agent_info (dst_ord);
4234 copy_agent = agent->id;
4235 }
4236 else
4237 {
4238 size_t slice_bytes = dst_dim2_size * dst_dim1_len;
4239 bounded_size_host = slice_bytes * (dim0_len - 1)
4240 + dst_dim2_size * (dim1_len - 1)
4241 + dim2_size;
4242 first_elem_offset_host = dst_offset0_len * slice_bytes
4243 + dst_offset1_len * dst_dim2_size
4244 + dst_offset2_size;
4245 host_ptr = dst;
4246 struct agent_info *agent = get_agent_info (src_ord);
4247 copy_agent = agent->id;
4248 }
4249
4250 void *agent_ptr;
4251
4252 hsa_status_t status
4253 = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
4254 bounded_size_host, NULL, 0, &agent_ptr);
4255 /* We can't lock the host memory: don't give up though, we might still be
4256 able to use the slow path in our caller (maybe even with iterated memcpy2d
4257 calls). So, don't make this an error. */
4258 if (status != HSA_STATUS_SUCCESS)
4259 return -1;
4260
4261 hsa_pitched_ptr_t dstpp, srcpp;
4262 hsa_dim3_t dst_offsets, src_offsets, ranges;
4263
4264 int retval = 1;
4265
4266 hsa_signal_t completion_signal;
4267 status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
4268 if (status != HSA_STATUS_SUCCESS)
4269 {
4270 retval = -1;
4271 goto unlock;
4272 }
4273
4274 if (dir == hsaHostToDevice)
4275 {
4276 srcpp.base = agent_ptr - first_elem_offset_host;
4277 dstpp.base = dst;
4278 }
4279 else
4280 {
4281 srcpp.base = (void *) src;
4282 dstpp.base = agent_ptr - first_elem_offset_host;
4283 }
4284
4285 /* Pitch is measured in bytes. */
4286 srcpp.pitch = src_dim2_size;
4287 /* Slice is also measured in bytes (i.e. total per-slice). */
4288 srcpp.slice = src_dim2_size * src_dim1_len;
4289
4290 src_offsets.x = src_offset2_size;
4291 src_offsets.y = src_offset1_len;
4292 src_offsets.z = src_offset0_len;
4293
4294 /* As above. */
4295 dstpp.pitch = dst_dim2_size;
4296 dstpp.slice = dst_dim2_size * dst_dim1_len;
4297
4298 dst_offsets.x = dst_offset2_size;
4299 dst_offsets.y = dst_offset1_len;
4300 dst_offsets.z = dst_offset0_len;
4301
4302 ranges.x = dim2_size;
4303 ranges.y = dim1_len;
4304 ranges.z = dim0_len;
4305
4306 status
4307 = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
4308 &src_offsets, &ranges,
4309 copy_agent, dir, 0, NULL,
4310 completion_signal);
4311 /* If the rectangular copy fails, we might still be able to use the slow
4312 path. We need to unlock the host memory though, so don't return
4313 immediately. */
4314 if (status != HSA_STATUS_SUCCESS)
4315 retval = -1;
4316 else
4317 {
4318 hsa_signal_value_t sv
4319 = hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
4320 HSA_SIGNAL_CONDITION_LT, 1,
4321 UINT64_MAX,
4322 HSA_WAIT_STATE_ACTIVE);
4323 if (sv < 0)
4324 {
4325 GCN_WARNING ("async copy rect failure");
4326 retval = -1;
4327 }
4328 }
4329
4330 hsa_fns.hsa_signal_destroy_fn (completion_signal);
4331
4332 unlock:
4333 status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
4334 if (status != HSA_STATUS_SUCCESS)
4335 hsa_fatal ("Could not unlock host memory", status);
4336
4337 return retval;
4338 }
4339
4340 /* }}} */
4341 /* {{{ OpenMP Plugin API */
4342
4343 /* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
4344 in VARS as a parameter. The kernel is identified by FN_PTR which must point
4345 to a kernel_info structure, and must have previously been loaded to the
4346 specified device. */
4347
4348 void
4349 GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
4350 {
4351 struct agent_info *agent = get_agent_info (device);
4352 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
4353 struct GOMP_kernel_launch_attributes def;
4354 struct GOMP_kernel_launch_attributes *kla;
4355 assert (agent == kernel->agent);
4356
4357 /* If we get here then the kernel must be OpenMP. */
4358 kernel->kind = KIND_OPENMP;
4359
4360 if (!parse_target_attributes (args, &def, &kla, agent))
4361 {
4362 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
4363 return;
4364 }
4365 run_kernel (kernel, vars, kla, NULL, false);
4366 }
4367
4368 /* Run an asynchronous OpenMP kernel on DEVICE. This is similar to
4369 GOMP_OFFLOAD_run except that the launch is queued and there is a call to
4370 GOMP_PLUGIN_target_task_completion when it has finished. */
4371
4372 void
4373 GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
4374 void **args, void *async_data)
4375 {
4376 GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
4377 struct agent_info *agent = get_agent_info (device);
4378 struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
4379 struct GOMP_kernel_launch_attributes def;
4380 struct GOMP_kernel_launch_attributes *kla;
4381 assert (agent == kernel->agent);
4382
4383 /* If we get here then the kernel must be OpenMP. */
4384 kernel->kind = KIND_OPENMP;
4385
4386 if (!parse_target_attributes (args, &def, &kla, agent))
4387 {
4388 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
4389 return;
4390 }
4391
4392 maybe_init_omp_async (agent);
4393 queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
4394 queue_push_callback (agent->omp_async_queue,
4395 GOMP_PLUGIN_target_task_completion, async_data);
4396 }
4397
4398 /* }}} */
4399 /* {{{ OpenACC Plugin API */
4400
4401 /* Run a synchronous OpenACC kernel. The device number is inferred from the
4402 already-loaded KERNEL. */
4403
4404 void
4405 GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *),
4406 size_t mapnum __attribute__((unused)),
4407 void **hostaddrs __attribute__((unused)),
4408 void **devaddrs, unsigned *dims,
4409 void *targ_mem_desc)
4410 {
4411 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
4412
4413 gcn_exec (kernel, devaddrs, dims, targ_mem_desc, false, NULL);
4414 }
4415
4416 /* Run an asynchronous OpenACC kernel on the specified queue. */
4417
4418 void
4419 GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *),
4420 size_t mapnum __attribute__((unused)),
4421 void **hostaddrs __attribute__((unused)),
4422 void **devaddrs,
4423 unsigned *dims, void *targ_mem_desc,
4424 struct goacc_asyncqueue *aq)
4425 {
4426 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
4427
4428 gcn_exec (kernel, devaddrs, dims, targ_mem_desc, true, aq);
4429 }
4430
4431 /* Create a new asynchronous thread and queue for running future kernels. */
4432
4433 struct goacc_asyncqueue *
4434 GOMP_OFFLOAD_openacc_async_construct (int device)
4435 {
4436 struct agent_info *agent = get_agent_info (device);
4437
4438 pthread_mutex_lock (&agent->async_queues_mutex);
4439
4440 struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
4441 aq->agent = get_agent_info (device);
4442 aq->prev = NULL;
4443 aq->next = agent->async_queues;
4444 if (aq->next)
4445 {
4446 aq->next->prev = aq;
4447 aq->id = aq->next->id + 1;
4448 }
4449 else
4450 aq->id = 1;
4451 agent->async_queues = aq;
4452
4453 aq->queue_first = 0;
4454 aq->queue_n = 0;
4455 aq->drain_queue_stop = 0;
4456
4457 if (pthread_mutex_init (&aq->mutex, NULL))
4458 {
4459 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
4460 return false;
4461 }
4462 if (pthread_cond_init (&aq->queue_cond_in, NULL))
4463 {
4464 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
4465 return false;
4466 }
4467 if (pthread_cond_init (&aq->queue_cond_out, NULL))
4468 {
4469 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
4470 return false;
4471 }
4472
4473 hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
4474 ASYNC_QUEUE_SIZE,
4475 HSA_QUEUE_TYPE_MULTI,
4476 hsa_queue_callback, NULL,
4477 UINT32_MAX, UINT32_MAX,
4478 &aq->hsa_queue);
4479 if (status != HSA_STATUS_SUCCESS)
4480 hsa_fatal ("Error creating command queue", status);
4481
4482 int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
4483 if (err != 0)
4484 GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
4485 strerror (err));
4486 GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
4487 aq->id);
4488
4489 pthread_mutex_unlock (&agent->async_queues_mutex);
4490
4491 return aq;
4492 }
4493
4494 /* Destroy an existing asynchronous thread and queue. Waits for any
4495 currently-running task to complete, but cancels any queued tasks. */
4496
4497 bool
4498 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
4499 {
4500 struct agent_info *agent = aq->agent;
4501
4502 finalize_async_thread (aq);
4503
4504 pthread_mutex_lock (&agent->async_queues_mutex);
4505
4506 int err;
4507 if ((err = pthread_mutex_destroy (&aq->mutex)))
4508 {
4509 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
4510 goto fail;
4511 }
4512 if (pthread_cond_destroy (&aq->queue_cond_in))
4513 {
4514 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
4515 goto fail;
4516 }
4517 if (pthread_cond_destroy (&aq->queue_cond_out))
4518 {
4519 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
4520 goto fail;
4521 }
4522 hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
4523 if (status != HSA_STATUS_SUCCESS)
4524 {
4525 hsa_error ("Error destroying command queue", status);
4526 goto fail;
4527 }
4528
4529 if (aq->prev)
4530 aq->prev->next = aq->next;
4531 if (aq->next)
4532 aq->next->prev = aq->prev;
4533 if (agent->async_queues == aq)
4534 agent->async_queues = aq->next;
4535
4536 GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);
4537
4538 free (aq);
4539 pthread_mutex_unlock (&agent->async_queues_mutex);
4540 return true;
4541
4542 fail:
4543 pthread_mutex_unlock (&agent->async_queues_mutex);
4544 return false;
4545 }
4546
4547 /* Return true if the specified async queue is currently empty. */
4548
4549 int
4550 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
4551 {
4552 return queue_empty (aq);
4553 }
4554
4555 /* Block until the specified queue has executed all its tasks and the
4556 queue is empty. */
4557
4558 bool
4559 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
4560 {
4561 wait_queue (aq);
4562 return true;
4563 }
4564
4565 /* Add a serialization point across two async queues. Any new tasks added to
4566 AQ2, after this call, will not run until all tasks on AQ1, at the time
4567 of this call, have completed. */
4568
4569 bool
4570 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
4571 struct goacc_asyncqueue *aq2)
4572 {
4573 /* For serialize, stream aq2 waits for aq1 to complete work that has been
4574 scheduled to run on it up to this point. */
4575 if (aq1 != aq2)
4576 {
4577 struct placeholder *placeholderp = queue_push_placeholder (aq1);
4578 queue_push_asyncwait (aq2, placeholderp);
4579 }
4580 return true;
4581 }
4582
4583 /* Add an opaque callback to the given async queue. */
4584
4585 void
4586 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
4587 void (*fn) (void *), void *data)
4588 {
4589 queue_push_callback (aq, fn, data);
4590 }
4591
4592 /* Queue up an asynchronous data copy from host to DEVICE. */
4593
4594 bool
4595 GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
4596 size_t n, struct goacc_asyncqueue *aq)
4597 {
4598 struct agent_info *agent = get_agent_info (device);
4599 assert (agent == aq->agent);
4600 queue_push_copy (aq, dst, src, n);
4601 return true;
4602 }
4603
4604 /* Queue up an asynchronous data copy from DEVICE to host. */
4605
4606 bool
4607 GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
4608 size_t n, struct goacc_asyncqueue *aq)
4609 {
4610 struct agent_info *agent = get_agent_info (device);
4611 assert (agent == aq->agent);
4612 queue_push_copy (aq, dst, src, n);
4613 return true;
4614 }
4615
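/* Return the value of OpenACC device property PROP for device number
   DEVICE.  */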
4616 union goacc_property_value
4617 GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
4618 {
4619 struct agent_info *agent = get_agent_info (device);
4620
4621 union goacc_property_value propval = { .val = 0 };
4622
4623 switch (prop)
4624 {
4625 case GOACC_PROPERTY_FREE_MEMORY:
4626 /* Not supported. */
4627 break;
4628 case GOACC_PROPERTY_MEMORY:
4629 {
4630 size_t size;
4631 hsa_region_t region = agent->data_region;
4632 hsa_status_t status =
4633 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
4634 if (status == HSA_STATUS_SUCCESS)
4635 propval.val = size;
4636 break;
4637 }
4638 case GOACC_PROPERTY_NAME:
4639 propval.ptr = agent->name;
4640 break;
4641 case GOACC_PROPERTY_VENDOR:
4642 propval.ptr = agent->vendor_name;
4643 break;
4644 case GOACC_PROPERTY_DRIVER:
4645 propval.ptr = hsa_context.driver_version_s;
4646 break;
4647 }
4648
4649 return propval;
4650 }
4651
4652 /* Set up plugin-specific thread-local-data (host-side). */
4653
4654 void *
4655 GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
4656 {
4657 struct gcn_thread *thread_data
4658 = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));
4659
4660 thread_data->async = GOMP_ASYNC_SYNC;
4661
4662 return (void *) thread_data;
4663 }
4664
4665 /* Clean up plugin-specific thread-local-data. */
4666
4667 void
4668 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
4669 {
4670 free (data);
4671 }
4672
4673 /* }}} */
4674