Home | History | Annotate | Line # | Download | only in plugin
plugin-gcn.c revision 1.1.1.3
      1 /* Plugin for AMD GCN execution.
      2 
      3    Copyright (C) 2013-2024 Free Software Foundation, Inc.
      4 
      5    Contributed by Mentor Embedded
      6 
      7    This file is part of the GNU Offloading and Multi Processing Library
      8    (libgomp).
      9 
     10    Libgomp is free software; you can redistribute it and/or modify it
     11    under the terms of the GNU General Public License as published by
     12    the Free Software Foundation; either version 3, or (at your option)
     13    any later version.
     14 
     15    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
     16    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
     17    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
     18    more details.
     19 
     20    Under Section 7 of GPL version 3, you are granted additional
     21    permissions described in the GCC Runtime Library Exception, version
     22    3.1, as published by the Free Software Foundation.
     23 
     24    You should have received a copy of the GNU General Public License and
     25    a copy of the GCC Runtime Library Exception along with this program;
     26    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     27    <http://www.gnu.org/licenses/>.  */
     28 
     29 /* {{{ Includes and defines  */
     30 
     31 #include "config.h"
     32 #include "symcat.h"
     33 #include <stdio.h>
     34 #include <stdlib.h>
     35 #include <string.h>
     36 #include <pthread.h>
     37 #include <inttypes.h>
     38 #include <stdbool.h>
     39 #include <limits.h>
     40 #include <hsa.h>
     41 #include <hsa_ext_amd.h>
     42 #include <dlfcn.h>
     43 #include <signal.h>
     44 #include "libgomp-plugin.h"
     45 #include "config/gcn/libgomp-gcn.h"  /* For struct output.  */
     46 #include "gomp-constants.h"
     47 #include <elf.h>
     48 #include "oacc-plugin.h"
     49 #include "oacc-int.h"
     50 #include <assert.h>
     51 
     52 /* These probably won't be in elf.h for a while.  */
     53 #ifndef R_AMDGPU_NONE
     54 #define R_AMDGPU_NONE		0
     55 #define R_AMDGPU_ABS32_LO	1	/* (S + A) & 0xFFFFFFFF  */
     56 #define R_AMDGPU_ABS32_HI	2	/* (S + A) >> 32  */
     57 #define R_AMDGPU_ABS64		3	/* S + A  */
     58 #define R_AMDGPU_REL32		4	/* S + A - P  */
     59 #define R_AMDGPU_REL64		5	/* S + A - P  */
     60 #define R_AMDGPU_ABS32		6	/* S + A  */
     61 #define R_AMDGPU_GOTPCREL	7	/* G + GOT + A - P  */
     62 #define R_AMDGPU_GOTPCREL32_LO	8	/* (G + GOT + A - P) & 0xFFFFFFFF  */
     63 #define R_AMDGPU_GOTPCREL32_HI	9	/* (G + GOT + A - P) >> 32  */
     64 #define R_AMDGPU_REL32_LO	10	/* (S + A - P) & 0xFFFFFFFF  */
     65 #define R_AMDGPU_REL32_HI	11	/* (S + A - P) >> 32  */
     66 #define R_AMDGPU_RELATIVE64	13	/* B + A  */
     67 #endif
     68 
     69 /* GCN specific definitions for asynchronous queues.  */
     70 
     71 #define ASYNC_QUEUE_SIZE 64
     72 #define DRAIN_QUEUE_SYNCHRONOUS_P false
     73 #define DEBUG_QUEUES 0
     74 #define DEBUG_THREAD_SLEEP 0
     75 #define DEBUG_THREAD_SIGNAL 0
     76 
     77 /* Defaults.  */
     78 #define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100MB.  */
     79 
     80 /* Secure getenv() which returns NULL if running as SUID/SGID.  */
     81 #ifndef HAVE_SECURE_GETENV
     82 #ifdef HAVE___SECURE_GETENV
     83 #define secure_getenv __secure_getenv
     84 #elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
     85   && defined(HAVE_GETGID) && defined(HAVE_GETEGID)
     86 
     87 #include <unistd.h>
     88 
/* Fallback implementation of secure_getenv() for targets that lack it but
   provide the means to compare real and effective user/group IDs.  */

static char *
secure_getenv (const char *name)
{
  /* Only consult the environment when the real and effective IDs match,
     i.e. the process is not running SUID/SGID.  */
  bool ids_match = (getuid () == geteuid ()) && (getgid () == getegid ());
  return ids_match ? getenv (name) : NULL;
}
    100 
    101 #else
    102 #define secure_getenv getenv
    103 #endif
    104 #endif
    105 
    106 /* }}}  */
    107 /* {{{ Types  */
    108 
    109 /* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.  */
    110 
    111 struct gcn_thread
    112 {
    113   /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
    114   int async;
    115 };
    116 
    117 /* As an HSA runtime is dlopened, following structure defines function
    118    pointers utilized by the HSA plug-in.  */
    119 
    120 struct hsa_runtime_fn_info
    121 {
    122   /* HSA runtime.  */
    123   hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
    124 					const char **status_string);
    125   hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
    126 					  void *value);
    127   hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
    128 					 hsa_agent_info_t attribute,
    129 					 void *value);
    130   hsa_status_t (*hsa_isa_get_info_fn)(hsa_isa_t isa,
    131 				      hsa_isa_info_t attribute,
    132 				      uint32_t index,
    133 				      void *value);
    134   hsa_status_t (*hsa_init_fn) (void);
    135   hsa_status_t (*hsa_iterate_agents_fn)
    136     (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
    137   hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
    138 					  hsa_region_info_t attribute,
    139 					  void *value);
    140   hsa_status_t (*hsa_queue_create_fn)
    141     (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
    142      void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
    143      void *data, uint32_t private_segment_size,
    144      uint32_t group_segment_size, hsa_queue_t **queue);
    145   hsa_status_t (*hsa_agent_iterate_regions_fn)
    146     (hsa_agent_t agent,
    147      hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
    148   hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
    149   hsa_status_t (*hsa_executable_create_fn)
    150     (hsa_profile_t profile, hsa_executable_state_t executable_state,
    151      const char *options, hsa_executable_t *executable);
    152   hsa_status_t (*hsa_executable_global_variable_define_fn)
    153     (hsa_executable_t executable, const char *variable_name, void *address);
    154   hsa_status_t (*hsa_executable_load_code_object_fn)
    155     (hsa_executable_t executable, hsa_agent_t agent,
    156      hsa_code_object_t code_object, const char *options);
    157   hsa_status_t (*hsa_executable_freeze_fn)(hsa_executable_t executable,
    158 					   const char *options);
    159   hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
    160 					uint32_t num_consumers,
    161 					const hsa_agent_t *consumers,
    162 					hsa_signal_t *signal);
    163   hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
    164 					  void **ptr);
    165   hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
    166 					      hsa_access_permission_t access);
    167   hsa_status_t (*hsa_memory_copy_fn)(void *dst, const void *src, size_t size);
    168   hsa_status_t (*hsa_memory_free_fn) (void *ptr);
    169   hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
    170   hsa_status_t (*hsa_executable_get_symbol_fn)
    171     (hsa_executable_t executable, const char *module_name,
    172      const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
    173      hsa_executable_symbol_t *symbol);
    174   hsa_status_t (*hsa_executable_symbol_get_info_fn)
    175     (hsa_executable_symbol_t executable_symbol,
    176      hsa_executable_symbol_info_t attribute, void *value);
    177   hsa_status_t (*hsa_executable_iterate_symbols_fn)
    178     (hsa_executable_t executable,
    179      hsa_status_t (*callback)(hsa_executable_t executable,
    180 			      hsa_executable_symbol_t symbol, void *data),
    181      void *data);
    182   uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
    183 						    uint64_t value);
    184   uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
    185   void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
    186 				       hsa_signal_value_t value);
    187   void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
    188 				       hsa_signal_value_t value);
    189   hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
    190     (hsa_signal_t signal, hsa_signal_condition_t condition,
    191      hsa_signal_value_t compare_value, uint64_t timeout_hint,
    192      hsa_wait_state_t wait_state_hint);
    193   hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
    194   hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
    195 
    196   hsa_status_t (*hsa_code_object_deserialize_fn)
    197     (void *serialized_code_object, size_t serialized_code_object_size,
    198      const char *options, hsa_code_object_t *code_object);
    199   hsa_status_t (*hsa_amd_memory_lock_fn)
    200     (void *host_ptr, size_t size, hsa_agent_t *agents, int num_agent,
    201      void **agent_ptr);
    202   hsa_status_t (*hsa_amd_memory_unlock_fn) (void *host_ptr);
    203   hsa_status_t (*hsa_amd_memory_async_copy_rect_fn)
    204     (const hsa_pitched_ptr_t *dst, const hsa_dim3_t *dst_offset,
    205      const hsa_pitched_ptr_t *src, const hsa_dim3_t *src_offset,
    206      const hsa_dim3_t *range, hsa_agent_t copy_agent,
    207      hsa_amd_copy_direction_t dir, uint32_t num_dep_signals,
    208      const hsa_signal_t *dep_signals, hsa_signal_t completion_signal);
    209 };
    210 
/* Structure describing the run-time and grid properties of an HSA kernel
   launch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */
    213 
    214 struct GOMP_kernel_launch_attributes
    215 {
    216   /* Number of dimensions the workload has.  Maximum number is 3.  */
    217   uint32_t ndim;
    218   /* Size of the grid in the three respective dimensions.  */
    219   uint32_t gdims[3];
    220   /* Size of work-groups in the respective dimensions.  */
    221   uint32_t wdims[3];
    222 };
    223 
    224 /* Collection of information needed for a dispatch of a kernel from a
    225    kernel.  */
    226 
    227 struct kernel_dispatch
    228 {
    229   struct agent_info *agent;
    230   /* Pointer to a command queue associated with a kernel dispatch agent.  */
    231   void *queue;
    232   /* Pointer to a memory space used for kernel arguments passing.  */
    233   void *kernarg_address;
    234   /* Kernel object.  */
    235   uint64_t object;
    236   /* Synchronization signal used for dispatch synchronization.  */
    237   uint64_t signal;
    238   /* Private segment size.  */
    239   uint32_t private_segment_size;
    240   /* Group segment size.  */
    241   uint32_t group_segment_size;
    242 };
    243 
    244 /* Structure of the kernargs segment, supporting console output.
    245 
    246    This needs to match the definitions in Newlib, and the expectations
    247    in libgomp target code.  */
    248 
    249 struct kernargs {
    250   struct kernargs_abi abi;
    251 
    252   /* Output data.  */
    253   struct output output_data;
    254 };
    255 
    256 /* A queue entry for a future asynchronous launch.  */
    257 
    258 struct kernel_launch
    259 {
    260   struct kernel_info *kernel;
    261   void *vars;
    262   struct GOMP_kernel_launch_attributes kla;
    263 };
    264 
    265 /* A queue entry for a future callback.  */
    266 
    267 struct callback
    268 {
    269   void (*fn)(void *);
    270   void *data;
    271 };
    272 
    273 /* A data struct for the copy_data callback.  */
    274 
    275 struct copy_data
    276 {
    277   void *dst;
    278   const void *src;
    279   size_t len;
    280   struct goacc_asyncqueue *aq;
    281 };
    282 
    283 /* A queue entry for a placeholder.  These correspond to a wait event.  */
    284 
    285 struct placeholder
    286 {
    287   int executed;
    288   pthread_cond_t cond;
    289   pthread_mutex_t mutex;
    290 };
    291 
    292 /* A queue entry for a wait directive.  */
    293 
    294 struct asyncwait_info
    295 {
    296   struct placeholder *placeholderp;
    297 };
    298 
    299 /* Encode the type of an entry in an async queue.  */
    300 
    301 enum entry_type
    302 {
    303   KERNEL_LAUNCH,
    304   CALLBACK,
    305   ASYNC_WAIT,
    306   ASYNC_PLACEHOLDER
    307 };
    308 
    309 /* An entry in an async queue.  */
    310 
    311 struct queue_entry
    312 {
    313   enum entry_type type;
    314   union {
    315     struct kernel_launch launch;
    316     struct callback callback;
    317     struct asyncwait_info asyncwait;
    318     struct placeholder placeholder;
    319   } u;
    320 };
    321 
    322 /* An async queue header.
    323 
    324    OpenMP may create one of these.
    325    OpenACC may create many.  */
    326 
    327 struct goacc_asyncqueue
    328 {
    329   struct agent_info *agent;
    330   hsa_queue_t *hsa_queue;
    331 
    332   pthread_t thread_drain_queue;
    333   pthread_mutex_t mutex;
    334   pthread_cond_t queue_cond_in;
    335   pthread_cond_t queue_cond_out;
    336   struct queue_entry queue[ASYNC_QUEUE_SIZE];
    337   int queue_first;
    338   int queue_n;
    339   int drain_queue_stop;
    340 
    341   int id;
    342   struct goacc_asyncqueue *prev;
    343   struct goacc_asyncqueue *next;
    344 };
    345 
    346 /* Mkoffload uses this structure to describe a kernel.
    347 
    348    OpenMP kernel dimensions are passed at runtime.
    349    OpenACC kernel dimensions are passed at compile time, here.  */
    350 
    351 struct hsa_kernel_description
    352 {
    353   const char *name;
    354   int oacc_dims[3];  /* Only present for GCN kernels.  */
    355   int sgpr_count;
    356   int vpgr_count;
    357 };
    358 
    359 /* Mkoffload uses this structure to describe an offload variable.  */
    360 
    361 struct global_var_info
    362 {
    363   const char *name;
    364   void *address;
    365 };
    366 
/* Mkoffload uses this structure to describe all the kernels in a
   loadable module.  These are passed to libgomp via static constructors.  */
    369 
    370 struct gcn_image_desc
    371 {
    372   struct gcn_image {
    373     size_t size;
    374     void *image;
    375   } *gcn_image;
    376   const unsigned kernel_count;
    377   struct hsa_kernel_description *kernel_infos;
    378   const unsigned ind_func_count;
    379   const unsigned global_variable_count;
    380 };
    381 
    382 /* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
    383    support.
    384    See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table */
    385 
    386 typedef enum {
    387   EF_AMDGPU_MACH_UNSUPPORTED = -1,
    388   EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
    389   EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
    390   EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
    391   EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030,
    392   EF_AMDGPU_MACH_AMDGCN_GFX90a = 0x03f,
    393   EF_AMDGPU_MACH_AMDGCN_GFX90c = 0x032,
    394   EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036,
    395   EF_AMDGPU_MACH_AMDGCN_GFX1036 = 0x045,
    396   EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041,
    397   EF_AMDGPU_MACH_AMDGCN_GFX1103 = 0x044
    398 } EF_AMDGPU_MACH;
    399 
    400 const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
    401 typedef EF_AMDGPU_MACH gcn_isa;
    402 
    403 /* Description of an HSA GPU agent (device) and the program associated with
    404    it.  */
    405 
    406 struct agent_info
    407 {
    408   /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
    409   hsa_agent_t id;
    410   /* The user-visible device number.  */
    411   int device_id;
    412   /* Whether the agent has been initialized.  The fields below are usable only
    413      if it has been.  */
    414   bool initialized;
    415 
    416   /* The instruction set architecture of the device. */
    417   gcn_isa device_isa;
    418   /* Name of the agent. */
    419   char name[64];
    420   /* Name of the vendor of the agent. */
    421   char vendor_name[64];
    422   /* Command queues of the agent.  */
    423   hsa_queue_t *sync_queue;
    424   struct goacc_asyncqueue *async_queues, *omp_async_queue;
    425   pthread_mutex_t async_queues_mutex;
    426 
    427   /* The HSA memory region from which to allocate kernel arguments.  */
    428   hsa_region_t kernarg_region;
    429 
    430   /* The HSA memory region from which to allocate device data.  */
    431   hsa_region_t data_region;
    432 
    433   /* Allocated ephemeral memories (team arena and stack space).  */
    434   struct ephemeral_memories_list *ephemeral_memories_list;
    435   pthread_mutex_t ephemeral_memories_write_lock;
    436 
    437   /* Read-write lock that protects kernels which are running or about to be run
    438      from interference with loading and unloading of images.  Needs to be
    439      locked for reading while a kernel is being run, and for writing if the
    440      list of modules is manipulated (and thus the HSA program invalidated).  */
    441   pthread_rwlock_t module_rwlock;
    442 
    443   /* The module associated with this kernel.  */
    444   struct module_info *module;
    445 
    446   /* Mutex enforcing that only one thread will finalize the HSA program.  A
    447      thread should have locked agent->module_rwlock for reading before
    448      acquiring it.  */
    449   pthread_mutex_t prog_mutex;
    450   /* Flag whether the HSA program that consists of all the modules has been
    451      finalized.  */
    452   bool prog_finalized;
    453   /* HSA executable - the finalized program that is used to locate kernels.  */
    454   hsa_executable_t executable;
    455 };
    456 
    457 /* Information required to identify, finalize and run any given kernel.  */
    458 
    459 enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};
    460 
    461 struct kernel_info
    462 {
    463   /* Name of the kernel, required to locate it within the GCN object-code
    464      module.  */
    465   const char *name;
    466   /* The specific agent the kernel has been or will be finalized for and run
    467      on.  */
    468   struct agent_info *agent;
    469   /* The specific module where the kernel takes place.  */
    470   struct module_info *module;
    471   /* Information provided by mkoffload associated with the kernel.  */
    472   struct hsa_kernel_description *description;
    473   /* Mutex enforcing that at most once thread ever initializes a kernel for
    474      use.  A thread should have locked agent->module_rwlock for reading before
    475      acquiring it.  */
    476   pthread_mutex_t init_mutex;
    477   /* Flag indicating whether the kernel has been initialized and all fields
    478      below it contain valid data.  */
    479   bool initialized;
    480   /* Flag indicating that the kernel has a problem that blocks an execution.  */
    481   bool initialization_failed;
    482   /* The object to be put into the dispatch queue.  */
    483   uint64_t object;
    484   /* Required size of kernel arguments.  */
    485   uint32_t kernarg_segment_size;
    486   /* Required size of group segment.  */
    487   uint32_t group_segment_size;
    488   /* Required size of private segment.  */
    489   uint32_t private_segment_size;
    490   /* Set up for OpenMP or OpenACC?  */
    491   enum offload_kind kind;
    492 };
    493 
    494 /* Information about a particular GCN module, its image and kernels.  */
    495 
    496 struct module_info
    497 {
    498   /* The description with which the program has registered the image.  */
    499   struct gcn_image_desc *image_desc;
    500   /* GCN heap allocation.  */
    501   struct heap *heap;
    502   /* Physical boundaries of the loaded module.  */
    503   Elf64_Addr phys_address_start;
    504   Elf64_Addr phys_address_end;
    505 
    506   bool constructors_run_p;
    507   struct kernel_info *init_array_func, *fini_array_func;
    508 
    509   /* Number of kernels in this module.  */
    510   int kernel_count;
    511   /* An array of kernel_info structures describing each kernel in this
    512      module.  */
    513   struct kernel_info kernels[];
    514 };
    515 
    516 /* A linked list of memory arenas allocated on the device.
    517    These are used by OpenMP, as a means to optimize per-team malloc,
    518    and for host-accessible stack space.  */
    519 
    520 struct ephemeral_memories_list
    521 {
    522   struct ephemeral_memories_list *next;
    523 
    524   /* The size is determined by the number of teams and threads.  */
    525   size_t size;
    526   /* The device address allocated memory.  */
    527   void *address;
    528   /* A flag to prevent two asynchronous kernels trying to use the same memory.
    529      The mutex is locked until the kernel exits.  */
    530   pthread_mutex_t in_use;
    531 };
    532 
    533 /* Information about the whole HSA environment and all of its agents.  */
    534 
    535 struct hsa_context_info
    536 {
    537   /* Whether the structure has been initialized.  */
    538   bool initialized;
    539   /* Number of usable GPU HSA agents in the system.  */
    540   int agent_count;
    541   /* Array of agent_info structures describing the individual HSA agents.  */
    542   struct agent_info *agents;
    543   /* Driver version string. */
    544   char driver_version_s[30];
    545 };
    546 
    547 /* }}}  */
    548 /* {{{ Global variables  */
    549 
    550 /* Information about the whole HSA environment and all of its agents.  */
    551 
    552 static struct hsa_context_info hsa_context;
    553 
    554 /* HSA runtime functions that are initialized in init_hsa_context.  */
    555 
    556 static struct hsa_runtime_fn_info hsa_fns;
    557 
/* Heap space, allocated target-side, provided for use of newlib malloc.
   Each module should have its own heap allocated.
   Beware that heap usage increases with OpenMP teams.  See also arenas.  */
    561 
    562 static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;
    563 
    564 /* Ephemeral memory sizes for each kernel launch.  */
    565 
    566 static int team_arena_size = DEFAULT_TEAM_ARENA_SIZE;
    567 static int stack_size = DEFAULT_GCN_STACK_SIZE;
    568 static int lowlat_size = -1;
    569 
    570 /* Flag to decide whether print to stderr information about what is going on.
    571    Set in init_debug depending on environment variables.  */
    572 
    573 static bool debug;
    574 
    575 /* Flag to decide if the runtime should suppress a possible fallback to host
    576    execution.  */
    577 
    578 static bool suppress_host_fallback;
    579 
    580 /* Flag to locate HSA runtime shared library that is dlopened
    581    by this plug-in.  */
    582 
    583 static const char *hsa_runtime_lib;
    584 
    585 /* Flag to decide if the runtime should support also CPU devices (can be
    586    a simulator).  */
    587 
    588 static bool support_cpu_devices;
    589 
    590 /* Runtime dimension overrides.  Zero indicates default.  */
    591 
    592 static int override_x_dim = 0;
    593 static int override_z_dim = 0;
    594 
    595 /* }}}  */
    596 /* {{{ Debug & Diagnostic  */
    597 
/* Print a message to stderr if GCN_DEBUG value is set to true.
   Note: no trailing semicolon after "while (false)" — the do/while(0)
   idiom requires the caller to supply the semicolon, otherwise
   "if (c) DEBUG_PRINT (...); else ..." fails to compile.  */

#define DEBUG_PRINT(...) \
  do \
  { \
    if (debug) \
      { \
	fprintf (stderr, __VA_ARGS__); \
      } \
  } \
  while (false)
    609 
    610 /* Flush stderr if GCN_DEBUG value is set to true.  */
    611 
    612 #define DEBUG_FLUSH()				\
    613   do {						\
    614     if (debug)					\
    615       fflush (stderr);				\
    616   } while (false)
    617 
    618 /* Print a logging message with PREFIX to stderr if GCN_DEBUG value
    619    is set to true.  */
    620 
    621 #define DEBUG_LOG(prefix, ...)			\
    622   do						\
    623     {						\
    624       DEBUG_PRINT (prefix);			\
    625       DEBUG_PRINT (__VA_ARGS__);			\
    626       DEBUG_FLUSH ();				\
    627     } while (false)
    628 
    629 /* Print a debugging message to stderr.  */
    630 
    631 #define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)
    632 
    633 /* Print a warning message to stderr.  */
    634 
    635 #define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
    636 
    637 /* Print HSA warning STR with an HSA STATUS code.  */
    638 
    639 static void
    640 hsa_warn (const char *str, hsa_status_t status)
    641 {
    642   if (!debug)
    643     return;
    644 
    645   const char *hsa_error_msg = "[unknown]";
    646   hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
    647 
    648   fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
    649 	   hsa_error_msg);
    650 }
    651 
    652 /* Report a fatal error STR together with the HSA error corresponding to STATUS
    653    and terminate execution of the current process.  */
    654 
    655 static void
    656 hsa_fatal (const char *str, hsa_status_t status)
    657 {
    658   const char *hsa_error_msg = "[unknown]";
    659   hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
    660   GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
    661 		     hsa_error_msg);
    662 }
    663 
    664 /* Like hsa_fatal, except only report error message, and return FALSE
    665    for propagating error processing to outside of plugin.  */
    666 
    667 static bool
    668 hsa_error (const char *str, hsa_status_t status)
    669 {
    670   const char *hsa_error_msg = "[unknown]";
    671   hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
    672   GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
    673 		     hsa_error_msg);
    674   return false;
    675 }
    676 
    677 /* Dump information about the available hardware.  */
    678 
    679 static void
    680 dump_hsa_system_info (void)
    681 {
    682   hsa_status_t status;
    683 
    684   hsa_endianness_t endianness;
    685   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
    686 					   &endianness);
    687   if (status == HSA_STATUS_SUCCESS)
    688     switch (endianness)
    689       {
    690       case HSA_ENDIANNESS_LITTLE:
    691 	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
    692 	break;
    693       case HSA_ENDIANNESS_BIG:
    694 	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
    695 	break;
    696       default:
    697 	GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
    698       }
    699   else
    700     GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");
    701 
    702   uint8_t extensions[128];
    703   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
    704 					   &extensions);
    705   if (status == HSA_STATUS_SUCCESS)
    706     {
    707       if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
    708 	GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
    709     }
    710   else
    711     GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
    712 }
    713 
    714 /* Dump information about the available hardware.  */
    715 
    716 static void
    717 dump_machine_model (hsa_machine_model_t machine_model, const char *s)
    718 {
    719   switch (machine_model)
    720     {
    721     case HSA_MACHINE_MODEL_SMALL:
    722       GCN_DEBUG ("%s: SMALL\n", s);
    723       break;
    724     case HSA_MACHINE_MODEL_LARGE:
    725       GCN_DEBUG ("%s: LARGE\n", s);
    726       break;
    727     default:
    728       GCN_WARNING ("%s: UNKNOWN\n", s);
    729       break;
    730     }
    731 }
    732 
    733 /* Dump information about the available hardware.  */
    734 
    735 static void
    736 dump_profile (hsa_profile_t profile, const char *s)
    737 {
    738   switch (profile)
    739     {
    740     case HSA_PROFILE_FULL:
    741       GCN_DEBUG ("%s: FULL\n", s);
    742       break;
    743     case HSA_PROFILE_BASE:
    744       GCN_DEBUG ("%s: BASE\n", s);
    745       break;
    746     default:
    747       GCN_WARNING ("%s: UNKNOWN\n", s);
    748       break;
    749     }
    750 }
    751 
    752 /* Dump information about a device memory region.  */
    753 
    754 static hsa_status_t
    755 dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
    756 {
    757   hsa_status_t status;
    758 
    759   hsa_region_segment_t segment;
    760   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
    761 					   &segment);
    762   if (status == HSA_STATUS_SUCCESS)
    763     {
    764       if (segment == HSA_REGION_SEGMENT_GLOBAL)
    765 	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
    766       else if (segment == HSA_REGION_SEGMENT_READONLY)
    767 	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
    768       else if (segment == HSA_REGION_SEGMENT_PRIVATE)
    769 	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
    770       else if (segment == HSA_REGION_SEGMENT_GROUP)
    771 	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
    772       else
    773 	GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
    774     }
    775   else
    776     GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");
    777 
    778   if (segment == HSA_REGION_SEGMENT_GLOBAL)
    779     {
    780       uint32_t flags;
    781       status
    782 	= hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
    783 					  &flags);
    784       if (status == HSA_STATUS_SUCCESS)
    785 	{
    786 	  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
    787 	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
    788 	  if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
    789 	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
    790 	  if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
    791 	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
    792 	}
    793       else
    794 	GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
    795     }
    796 
    797   size_t size;
    798   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
    799   if (status == HSA_STATUS_SUCCESS)
    800     GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
    801   else
    802     GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");
    803 
    804   status
    805     = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
    806 				      &size);
    807   if (status == HSA_STATUS_SUCCESS)
    808     GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
    809   else
    810     GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
    811 
    812   bool alloc_allowed;
    813   status
    814     = hsa_fns.hsa_region_get_info_fn (region,
    815 				      HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
    816 				      &alloc_allowed);
    817   if (status == HSA_STATUS_SUCCESS)
    818     GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
    819   else
    820     GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
    821 
    822   if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
    823     return HSA_STATUS_SUCCESS;
    824 
    825   status
    826     = hsa_fns.hsa_region_get_info_fn (region,
    827 				      HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
    828 				      &size);
    829   if (status == HSA_STATUS_SUCCESS)
    830     GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
    831   else
    832     GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
    833 
    834   size_t align;
    835   status
    836     = hsa_fns.hsa_region_get_info_fn (region,
    837 				      HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
    838 				      &align);
    839   if (status == HSA_STATUS_SUCCESS)
    840     GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
    841   else
    842     GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
    843 
    844   return HSA_STATUS_SUCCESS;
    845 }
    846 
    847 /* Dump information about all the device memory regions.  */
    848 
    849 static void
    850 dump_hsa_regions (hsa_agent_t agent)
    851 {
    852   hsa_status_t status;
    853   status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
    854 						 dump_hsa_region,
    855 						 NULL);
    856   if (status != HSA_STATUS_SUCCESS)
    857     hsa_error ("Dumping hsa regions failed", status);
    858 }
    859 
    860 /* Dump information about the available devices.  */
    861 
static hsa_status_t
dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
{
  hsa_status_t status;

  /* Reused for both the agent name and the vendor name; the HSA runtime
     writes a NUL-terminated string into it.  */
  char buf[64];
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");

  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");

  /* Machine model (small/large) and profile (base/full) are decoded by
     helper dump routines defined elsewhere in this file.  */
  hsa_machine_model_t machine_model;
  status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
				     &machine_model);
  if (status == HSA_STATUS_SUCCESS)
    dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
  else
    GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");

  hsa_profile_t profile;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
					  &profile);
  if (status == HSA_STATUS_SUCCESS)
    dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
  else
    GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");

  hsa_device_type_t device_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
					  &device_type);
  if (status == HSA_STATUS_SUCCESS)
    {
      switch (device_type)
	{
	case HSA_DEVICE_TYPE_CPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
	  break;
	case HSA_DEVICE_TYPE_GPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
	  break;
	case HSA_DEVICE_TYPE_DSP:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
	  break;
	default:
	  GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
	  break;
	}
    }
  else
    GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");

  /* AMD vendor extension attribute (from hsa_ext_amd.h).  */
  uint32_t cu_count;
  status = hsa_fns.hsa_agent_get_info_fn
    (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
  else
    GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");

  uint32_t size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
					  &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");

  /* NOTE(review): the HSA spec describes WORKGROUP_MAX_DIM as an array of
     three values; only the first is read here — confirm this is intended
     to print just dimension 0.  */
  uint32_t max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
					  &max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");

  uint32_t max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
					  &max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");

  uint32_t grid_max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
					  &grid_max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");

  uint32_t grid_max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
					  &grid_max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");

  /* Finally dump every memory region attached to this agent.  */
  dump_hsa_regions (agent);

  return HSA_STATUS_SUCCESS;
}
    977 
    978 /* Forward reference.  */
    979 
    980 static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);
    981 
    982 /* Helper function for dump_executable_symbols.  */
    983 
    984 static hsa_status_t
    985 dump_executable_symbol (hsa_executable_t executable,
    986 			hsa_executable_symbol_t symbol,
    987 			void *data __attribute__((unused)))
    988 {
    989   char *name = get_executable_symbol_name (symbol);
    990 
    991   if (name)
    992     {
    993       GCN_DEBUG ("executable symbol: %s\n", name);
    994       free (name);
    995     }
    996 
    997   return HSA_STATUS_SUCCESS;
    998 }
    999 
   1000 /* Dump all global symbol in an executable.  */
   1001 
   1002 static void
   1003 dump_executable_symbols (hsa_executable_t executable)
   1004 {
   1005   hsa_status_t status;
   1006   status
   1007     = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
   1008 						 dump_executable_symbol,
   1009 						 NULL);
   1010   if (status != HSA_STATUS_SUCCESS)
   1011     hsa_fatal ("Could not dump HSA executable symbols", status);
   1012 }
   1013 
   1014 /* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */
   1015 
   1016 static void
   1017 print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
   1018 {
   1019   struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;
   1020 
   1021   fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
   1022   fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
   1023   fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
   1024   fprintf (stderr, "%*sheap address: %p\n", indent, "",
   1025 	   (void*)kernargs->abi.heap_ptr);
   1026   fprintf (stderr, "%*sarena address: %p (%d bytes per workgroup)\n", indent,
   1027 	   "", (void*)kernargs->abi.arena_ptr,
   1028 	   kernargs->abi.arena_size_per_team);
   1029   fprintf (stderr, "%*sstack address: %p (%d bytes per wavefront)\n", indent,
   1030 	   "", (void*)kernargs->abi.stack_ptr,
   1031 	   kernargs->abi.stack_size_per_thread);
   1032   fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
   1033   fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
   1034 	   dispatch->private_segment_size);
   1035   fprintf (stderr, "%*sgroup_segment_size: %u (low-latency pool)\n", indent,
   1036 	   "", dispatch->group_segment_size);
   1037   fprintf (stderr, "\n");
   1038 }
   1039 
   1040 /* }}}  */
   1041 /* {{{ Utility functions  */
   1042 
   1043 /* Cast the thread local storage to gcn_thread.  */
   1044 
static inline struct gcn_thread *
gcn_thread (void)
{
  /* libgomp stores our per-thread data as an opaque pointer; recover it.  */
  void *tls = GOMP_PLUGIN_acc_thread ();
  return (struct gcn_thread *) tls;
}
   1050 
   1051 /* Initialize debug and suppress_host_fallback according to the environment.  */
   1052 
   1053 static void
   1054 init_environment_variables (void)
   1055 {
   1056   if (secure_getenv ("GCN_DEBUG"))
   1057     debug = true;
   1058   else
   1059     debug = false;
   1060 
   1061   if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
   1062     suppress_host_fallback = true;
   1063   else
   1064     suppress_host_fallback = false;
   1065 
   1066   hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
   1067   if (hsa_runtime_lib == NULL)
   1068     hsa_runtime_lib = "libhsa-runtime64.so.1";
   1069 
   1070   support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
   1071 
   1072   const char *x = secure_getenv ("GCN_NUM_TEAMS");
   1073   if (!x)
   1074     x = secure_getenv ("GCN_NUM_GANGS");
   1075   if (x)
   1076     override_x_dim = atoi (x);
   1077 
   1078   const char *z = secure_getenv ("GCN_NUM_THREADS");
   1079   if (!z)
   1080     z = secure_getenv ("GCN_NUM_WORKERS");
   1081   if (z)
   1082     override_z_dim = atoi (z);
   1083 
   1084   const char *heap = secure_getenv ("GCN_HEAP_SIZE");
   1085   if (heap)
   1086     {
   1087       size_t tmp = atol (heap);
   1088       if (tmp)
   1089 	gcn_kernel_heap_size = tmp;
   1090     }
   1091 
   1092   const char *arena = secure_getenv ("GCN_TEAM_ARENA_SIZE");
   1093   if (arena)
   1094     {
   1095       int tmp = atoi (arena);
   1096       if (tmp)
   1097 	team_arena_size = tmp;;
   1098     }
   1099 
   1100   const char *stack = secure_getenv ("GCN_STACK_SIZE");
   1101   if (stack)
   1102     {
   1103       int tmp = atoi (stack);
   1104       if (tmp)
   1105 	stack_size = tmp;;
   1106     }
   1107 
   1108   const char *lowlat = secure_getenv ("GOMP_GCN_LOWLAT_POOL");
   1109   if (lowlat)
   1110     lowlat_size = atoi (lowlat);
   1111 }
   1112 
   1113 /* Return malloc'd string with name of SYMBOL.  */
   1114 
   1115 static char *
   1116 get_executable_symbol_name (hsa_executable_symbol_t symbol)
   1117 {
   1118   hsa_status_t status;
   1119   char *res;
   1120   uint32_t len;
   1121   const hsa_executable_symbol_info_t info_name_length
   1122     = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;
   1123 
   1124   status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
   1125 						      &len);
   1126   if (status != HSA_STATUS_SUCCESS)
   1127     {
   1128       hsa_error ("Could not get length of symbol name", status);
   1129       return NULL;
   1130     }
   1131 
   1132   res = GOMP_PLUGIN_malloc (len + 1);
   1133 
   1134   const hsa_executable_symbol_info_t info_name
   1135     = HSA_EXECUTABLE_SYMBOL_INFO_NAME;
   1136 
   1137   status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);
   1138 
   1139   if (status != HSA_STATUS_SUCCESS)
   1140     {
   1141       hsa_error ("Could not get symbol name", status);
   1142       free (res);
   1143       return NULL;
   1144     }
   1145 
   1146   res[len] = '\0';
   1147 
   1148   return res;
   1149 }
   1150 
   1151 /* Get the number of GPU Compute Units.  */
   1152 
   1153 static int
   1154 get_cu_count (struct agent_info *agent)
   1155 {
   1156   uint32_t cu_count;
   1157   hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
   1158     (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
   1159   if (status == HSA_STATUS_SUCCESS)
   1160     return cu_count;
   1161   else
   1162     return 64;  /* The usual number for older devices.  */
   1163 }
   1164 
   1165 /* Calculate the maximum grid size for OMP threads / OACC workers.
   1166    This depends on the kernel's resource usage levels.  */
   1167 
static int
limit_worker_threads (int threads)
{
  /* FIXME Do something more intelligent here.
     GCN can always run 4 threads within a Compute Unit, but
     more than that depends on register usage.  */
  const int max_threads = 16;
  return threads > max_threads ? max_threads : threads;
}
   1178 
   1179 /* This sets the maximum number of teams to twice the number of GPU Compute
   1180    Units to avoid memory waste and corresponding memory access faults.  */
   1181 
static int
limit_teams (int teams, struct agent_info *agent)
{
  /* Cap at twice the number of Compute Units on this device.  */
  int max_teams = 2 * get_cu_count (agent);
  return teams > max_teams ? max_teams : teams;
}
   1190 
   1191 /* Parse the target attributes INPUT provided by the compiler and return true
   1192    if we should run anything all.  If INPUT is NULL, fill DEF with default
   1193    values, then store INPUT or DEF into *RESULT.
   1194 
   1195    This is used for OpenMP only.  */
   1196 
static bool
parse_target_attributes (void **input,
			 struct GOMP_kernel_launch_attributes *def,
			 struct GOMP_kernel_launch_attributes **result,
			 struct agent_info *agent)
{
  if (!input)
    GOMP_PLUGIN_fatal ("No target arguments provided");

  /* Scan the NULL-terminated argument list.  Each entry is either an
     encoded id+value, or an id followed by a separate value word when
     GOMP_TARGET_ARG_SUBSEQUENT_PARAM is set.  */
  bool grid_attrs_found = false;
  bool gcn_dims_found = false;
  int gcn_teams = 0;
  int gcn_threads = 0;
  while (*input)
    {
      intptr_t id = (intptr_t) *input++, val;

      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *input++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;

      /* Clamp to int range; values below are used as ints.  */
      val = (val > INT_MAX) ? INT_MAX : val;

      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
	  && ((id & GOMP_TARGET_ARG_ID_MASK)
	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
	{
	  /* An explicit launch-attributes block follows; *input now
	     points at it (handled at the bottom of the function).  */
	  grid_attrs_found = true;
	  break;
	}
      else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
	       == GOMP_TARGET_ARG_DEVICE_ALL)
	{
	  /* Device-independent teams/thread-limit clauses; clamp them to
	     what the hardware can usefully run.  */
	  gcn_dims_found = true;
	  switch (id & GOMP_TARGET_ARG_ID_MASK)
	    {
	    case GOMP_TARGET_ARG_NUM_TEAMS:
	      gcn_teams = limit_teams (val, agent);
	      break;
	    case GOMP_TARGET_ARG_THREAD_LIMIT:
	      gcn_threads = limit_worker_threads (val);
	      break;
	    default:
	      ;
	    }
	}
    }

  if (gcn_dims_found)
    {
      /* Vega 10 (gfx900) needs fewer default threads per team; see the
	 warnings below.  Only applies when nothing overrides the count.  */
      bool gfx900_workaround_p = false;

      if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
	  && gcn_threads == 0 && override_z_dim == 0)
	{
	  gfx900_workaround_p = true;
	  GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
		       "threads to at most 4 per team.\n");
	  GCN_WARNING (" - If this is not a Vega 10 device, please use "
		       "GCN_NUM_THREADS=16\n");
	}

      /* Ideally, when a dimension isn't explicitly specified, we should
	 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
	 In practice, we tune for peak performance on BabelStream, which
	 for OpenACC is currently 32 threads per CU.  */
      def->ndim = 3;
      if (gcn_teams <= 0 && gcn_threads <= 0)
	{
	  /* Set up a reasonable number of teams and threads.  */
	  gcn_threads = gfx900_workaround_p ? 4 : 16; // 8;
	  def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
	  def->gdims[2] = gcn_threads;
	}
      else if (gcn_teams <= 0 && gcn_threads > 0)
	{
	  /* Auto-scale the number of teams with the number of threads.  */
	  def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
	  def->gdims[2] = gcn_threads;
	}
      else if (gcn_teams > 0 && gcn_threads <= 0)
	{
	  int max_threads = gfx900_workaround_p ? 4 : 16;

	  /* Auto-scale the number of threads with the number of teams.  */
	  def->gdims[0] = gcn_teams;
	  def->gdims[2] = 16; // get_cu_count (agent) * 40 / gcn_teams;
	  if (def->gdims[2] == 0)
	    def->gdims[2] = 1;
	  else if (def->gdims[2] > max_threads)
	    def->gdims[2] = max_threads;
	}
      else
	{
	  /* Both specified: use them as-is.  */
	  def->gdims[0] = gcn_teams;
	  def->gdims[2] = gcn_threads;
	}
      def->gdims[1] = 64; /* Each thread is 64 work items wide.  */
      def->wdims[0] = 1;  /* Single team per work-group.  */
      def->wdims[1] = 64;
      def->wdims[2] = 16;
      *result = def;
      return true;
    }
  else if (!grid_attrs_found)
    {
      /* No launch information at all: run a single work item.  */
      def->ndim = 1;
      def->gdims[0] = 1;
      def->gdims[1] = 1;
      def->gdims[2] = 1;
      def->wdims[0] = 1;
      def->wdims[1] = 1;
      def->wdims[2] = 1;
      *result = def;
      GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
      return true;
    }

  /* Explicit launch attributes were supplied by the compiler; validate
     them.  A zero grid dimension means "nothing to run" (return false).  */
  struct GOMP_kernel_launch_attributes *kla;
  kla = (struct GOMP_kernel_launch_attributes *) *input;
  *result = kla;
  if (kla->ndim == 0 || kla->ndim > 3)
    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);

  GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
  unsigned i;
  for (i = 0; i < kla->ndim; i++)
    {
      GCN_DEBUG ("  Dimension %u: grid size %u and group size %u\n", i,
		 kla->gdims[i], kla->wdims[i]);
      if (kla->gdims[i] == 0)
	return false;
    }
  return true;
}
   1333 
   1334 /* Return the group size given the requested GROUP size, GRID size and number
   1335    of grid dimensions NDIM.  */
   1336 
static uint32_t
get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
{
  /* When no group size was requested, pick a default by dimensionality.  */
  if (group == 0)
    {
      /* TODO: Provide a default via environment or device characteristics.  */
      switch (ndim)
	{
	case 1:
	  group = 64;
	  break;
	case 2:
	  group = 8;
	  break;
	default:
	  group = 4;
	  break;
	}
    }

  /* A group can never be larger than the grid it tiles.  */
  return group > grid ? grid : group;
}
   1355 
   1356 /* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */
   1357 
static void
packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
{
  /* Pack HEADER into the low half-word and REST into the high half-word,
     then publish both with release semantics in a single 32-bit store.  */
  uint32_t word = (uint32_t) header | ((uint32_t) rest << 16);
  __atomic_store_n (packet, word, __ATOMIC_RELEASE);
}
   1363 
   1364 /* A never-called callback for the HSA command queues.  These signal events
   1365    that we don't use, so we trigger an error.
   1366 
   1367    This "queue" is not to be confused with the async queues, below.  */
   1368 
static void
hsa_queue_callback (hsa_status_t status,
		hsa_queue_t *queue __attribute__ ((unused)),
		void *data __attribute__ ((unused)))
{
  /* Any invocation means the queue reported an error we cannot handle;
     abort with the HSA status decoded.  */
  hsa_fatal ("Asynchronous queue error", status);
}
   1376 
   1377 /* }}}  */
   1378 /* {{{ HSA initialization  */
   1379 
   1380 /* Populate hsa_fns with the function addresses from libhsa-runtime64.so.  */
   1381 
static bool
init_hsa_runtime_functions (void)
{
  /* Resolve a mandatory entry point; a missing symbol is fatal.  */
#define DLSYM_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function); \
  if (hsa_fns.function##_fn == NULL) \
    GOMP_PLUGIN_fatal ("'%s' is missing '%s'", hsa_runtime_lib, #function);
  /* Resolve an optional entry point; the pointer stays NULL if absent,
     and callers must check before use.  */
#define DLSYM_OPT_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function);

  /* Returning false here (library not found) lets the caller fall back
     to host execution instead of aborting.  */
  void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
  if (handle == NULL)
    return false;

  DLSYM_FN (hsa_status_string)
  DLSYM_FN (hsa_system_get_info)
  DLSYM_FN (hsa_agent_get_info)
  DLSYM_FN (hsa_init)
  DLSYM_FN (hsa_iterate_agents)
  DLSYM_FN (hsa_region_get_info)
  DLSYM_FN (hsa_queue_create)
  DLSYM_FN (hsa_agent_iterate_regions)
  DLSYM_FN (hsa_executable_destroy)
  DLSYM_FN (hsa_executable_create)
  DLSYM_FN (hsa_executable_global_variable_define)
  DLSYM_FN (hsa_executable_load_code_object)
  DLSYM_FN (hsa_executable_freeze)
  DLSYM_FN (hsa_signal_create)
  DLSYM_FN (hsa_memory_allocate)
  DLSYM_FN (hsa_memory_assign_agent)
  DLSYM_FN (hsa_memory_copy)
  DLSYM_FN (hsa_memory_free)
  DLSYM_FN (hsa_signal_destroy)
  DLSYM_FN (hsa_executable_get_symbol)
  DLSYM_FN (hsa_executable_symbol_get_info)
  DLSYM_FN (hsa_executable_iterate_symbols)
  DLSYM_FN (hsa_queue_add_write_index_release)
  DLSYM_FN (hsa_queue_load_read_index_acquire)
  DLSYM_FN (hsa_signal_wait_acquire)
  DLSYM_FN (hsa_signal_store_relaxed)
  DLSYM_FN (hsa_signal_store_release)
  DLSYM_FN (hsa_signal_load_acquire)
  DLSYM_FN (hsa_queue_destroy)
  DLSYM_FN (hsa_code_object_deserialize)
  /* AMD vendor extensions; not present in every runtime build.  */
  DLSYM_OPT_FN (hsa_amd_memory_lock)
  DLSYM_OPT_FN (hsa_amd_memory_unlock)
  DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect)
  return true;
#undef DLSYM_OPT_FN
#undef DLSYM_FN
}
   1433 
   1434 static gcn_isa isa_code (const char *isa);
   1435 
   1436 /* Return true if the agent is a GPU and can accept of concurrent submissions
   1437    from different threads.  */
   1438 
   1439 static bool
   1440 suitable_hsa_agent_p (hsa_agent_t agent)
   1441 {
   1442   hsa_device_type_t device_type;
   1443   hsa_status_t status
   1444     = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
   1445 				     &device_type);
   1446   if (status != HSA_STATUS_SUCCESS)
   1447     return false;
   1448 
   1449   switch (device_type)
   1450     {
   1451     case HSA_DEVICE_TYPE_GPU:
   1452       {
   1453 	char name[64];
   1454 	hsa_status_t status
   1455 	  = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME, name);
   1456 	if (status != HSA_STATUS_SUCCESS
   1457 	    || isa_code (name) == EF_AMDGPU_MACH_UNSUPPORTED)
   1458 	  {
   1459 	    GCN_DEBUG ("Ignoring unsupported agent '%s'\n",
   1460 		       status == HSA_STATUS_SUCCESS ? name : "invalid");
   1461 	    return false;
   1462 	  }
   1463       }
   1464       break;
   1465     case HSA_DEVICE_TYPE_CPU:
   1466       if (!support_cpu_devices)
   1467 	return false;
   1468       break;
   1469     default:
   1470       return false;
   1471     }
   1472 
   1473   uint32_t features = 0;
   1474   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
   1475 					  &features);
   1476   if (status != HSA_STATUS_SUCCESS
   1477       || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
   1478     return false;
   1479   hsa_queue_type_t queue_type;
   1480   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
   1481 					  &queue_type);
   1482   if (status != HSA_STATUS_SUCCESS
   1483       || (queue_type != HSA_QUEUE_TYPE_MULTI))
   1484     return false;
   1485 
   1486   return true;
   1487 }
   1488 
   1489 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
   1490    agent_count in hsa_context.  */
   1491 
   1492 static hsa_status_t
   1493 count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
   1494 {
   1495   if (suitable_hsa_agent_p (agent))
   1496     hsa_context.agent_count++;
   1497   return HSA_STATUS_SUCCESS;
   1498 }
   1499 
   1500 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
   1501    id to the describing structure in the hsa context.  The index of the
   1502    structure is pointed to by DATA, increment it afterwards.  */
   1503 
   1504 static hsa_status_t
   1505 assign_agent_ids (hsa_agent_t agent, void *data)
   1506 {
   1507   if (suitable_hsa_agent_p (agent))
   1508     {
   1509       int *agent_index = (int *) data;
   1510       hsa_context.agents[*agent_index].id = agent;
   1511       ++*agent_index;
   1512     }
   1513   return HSA_STATUS_SUCCESS;
   1514 }
   1515 
   1516 /* Initialize hsa_context if it has not already been done.
   1517    If !PROBE: returns TRUE on success.
   1518    If PROBE: returns TRUE on success or if the plugin/device shall be silently
   1519    ignored, and otherwise emits an error and returns FALSE.  */
   1520 
   1521 static bool
   1522 init_hsa_context (bool probe)
   1523 {
   1524   hsa_status_t status;
   1525   int agent_index = 0;
   1526 
   1527   if (hsa_context.initialized)
   1528     return true;
   1529   init_environment_variables ();
   1530   if (!init_hsa_runtime_functions ())
   1531     {
   1532       const char *msg = "Run-time could not be dynamically opened";
   1533       if (suppress_host_fallback)
   1534 	GOMP_PLUGIN_fatal ("%s\n", msg);
   1535       else
   1536 	GCN_WARNING ("%s\n", msg);
   1537       return probe ? true : false;
   1538     }
   1539   status = hsa_fns.hsa_init_fn ();
   1540   if (status != HSA_STATUS_SUCCESS)
   1541     return hsa_error ("Run-time could not be initialized", status);
   1542   GCN_DEBUG ("HSA run-time initialized for GCN\n");
   1543 
   1544   if (debug)
   1545     dump_hsa_system_info ();
   1546 
   1547   status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
   1548   if (status != HSA_STATUS_SUCCESS)
   1549     return hsa_error ("GCN GPU devices could not be enumerated", status);
   1550   GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);
   1551 
   1552   hsa_context.agents
   1553     = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
   1554 				  * sizeof (struct agent_info));
   1555   status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
   1556   if (status != HSA_STATUS_SUCCESS)
   1557     return hsa_error ("Scanning compute agents failed", status);
   1558   if (agent_index != hsa_context.agent_count)
   1559     {
   1560       GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
   1561       return false;
   1562     }
   1563 
   1564   if (debug)
   1565     {
   1566       status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
   1567       if (status != HSA_STATUS_SUCCESS)
   1568 	GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
   1569     }
   1570 
   1571   uint16_t minor, major;
   1572   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
   1573 					   &minor);
   1574   if (status != HSA_STATUS_SUCCESS)
   1575     GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
   1576   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
   1577 					   &major);
   1578   if (status != HSA_STATUS_SUCCESS)
   1579     GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");
   1580 
   1581   size_t len = sizeof hsa_context.driver_version_s;
   1582   int printed = snprintf (hsa_context.driver_version_s, len,
   1583 			  "HSA Runtime %hu.%hu", (unsigned short int)major,
   1584 			  (unsigned short int)minor);
   1585   if (printed >= len)
   1586     GCN_WARNING ("HSA runtime version string was truncated."
   1587 		 "Version %hu.%hu is too long.", (unsigned short int)major,
   1588 		 (unsigned short int)minor);
   1589 
   1590   hsa_context.initialized = true;
   1591   return true;
   1592 }
   1593 
   1594 /* Verify that hsa_context has already been initialized and return the
   1595    agent_info structure describing device number N.  Return NULL on error.  */
   1596 
   1597 static struct agent_info *
   1598 get_agent_info (int n)
   1599 {
   1600   if (!hsa_context.initialized)
   1601     {
   1602       GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
   1603       return NULL;
   1604     }
   1605   if (n >= hsa_context.agent_count)
   1606     {
   1607       GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
   1608       return NULL;
   1609     }
   1610   if (!hsa_context.agents[n].initialized)
   1611     {
   1612       GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
   1613       return NULL;
   1614     }
   1615   return &hsa_context.agents[n];
   1616 }
   1617 
   1618 /* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.
   1619 
   1620    Selects (breaks at) a suitable region of type KIND.  */
   1621 
   1622 static hsa_status_t
   1623 get_memory_region (hsa_region_t region, hsa_region_t *retval,
   1624 		   hsa_region_global_flag_t kind)
   1625 {
   1626   hsa_status_t status;
   1627   hsa_region_segment_t segment;
   1628 
   1629   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
   1630 					   &segment);
   1631   if (status != HSA_STATUS_SUCCESS)
   1632     return status;
   1633   if (segment != HSA_REGION_SEGMENT_GLOBAL)
   1634     return HSA_STATUS_SUCCESS;
   1635 
   1636   uint32_t flags;
   1637   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
   1638 					   &flags);
   1639   if (status != HSA_STATUS_SUCCESS)
   1640     return status;
   1641   if (flags & kind)
   1642     {
   1643       *retval = region;
   1644       return HSA_STATUS_INFO_BREAK;
   1645     }
   1646   return HSA_STATUS_SUCCESS;
   1647 }
   1648 
   1649 /* Callback of hsa_agent_iterate_regions.
   1650 
   1651    Selects a kernargs memory region.  */
   1652 
   1653 static hsa_status_t
   1654 get_kernarg_memory_region (hsa_region_t region, void *data)
   1655 {
   1656   return get_memory_region (region, (hsa_region_t *)data,
   1657 			    HSA_REGION_GLOBAL_FLAG_KERNARG);
   1658 }
   1659 
   1660 /* Callback of hsa_agent_iterate_regions.
   1661 
   1662    Selects a coarse-grained memory region suitable for the heap and
   1663    offload data.  */
   1664 
   1665 static hsa_status_t
   1666 get_data_memory_region (hsa_region_t region, void *data)
   1667 {
   1668   return get_memory_region (region, (hsa_region_t *)data,
   1669 			    HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
   1670 }
   1671 
/* Extract the AMDGPU machine/ISA bits from the ELF header flags of the
   offload IMAGE.  */

static int
elf_gcn_isa_field (Elf64_Ehdr *image)
{
  return image->e_flags & EF_AMDGPU_MACH_MASK;
}
   1677 
/* Canonical ISA name strings, as reported by the HSA runtime in
   HSA_AGENT_INFO_NAME and matched by isa_code/isa_hsa_name below.  */
const static char *gcn_gfx803_s = "gfx803";
const static char *gcn_gfx900_s = "gfx900";
const static char *gcn_gfx906_s = "gfx906";
const static char *gcn_gfx908_s = "gfx908";
const static char *gcn_gfx90a_s = "gfx90a";
const static char *gcn_gfx90c_s = "gfx90c";
const static char *gcn_gfx1030_s = "gfx1030";
const static char *gcn_gfx1036_s = "gfx1036";
const static char *gcn_gfx1100_s = "gfx1100";
const static char *gcn_gfx1103_s = "gfx1103";
/* Number of characters compared by isa_code's strncmp calls; NOTE(review):
   this is 7, which matches "gfx1030"-style names but only the first seven
   characters of any longer name — confirm all supported names fit.  */
const static int gcn_isa_name_len = 7;
   1689 
   1690 /* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
   1691    support the ISA. */
   1692 
   1693 static const char*
   1694 isa_hsa_name (int isa) {
   1695   switch(isa)
   1696     {
   1697     case EF_AMDGPU_MACH_AMDGCN_GFX803:
   1698       return gcn_gfx803_s;
   1699     case EF_AMDGPU_MACH_AMDGCN_GFX900:
   1700       return gcn_gfx900_s;
   1701     case EF_AMDGPU_MACH_AMDGCN_GFX906:
   1702       return gcn_gfx906_s;
   1703     case EF_AMDGPU_MACH_AMDGCN_GFX908:
   1704       return gcn_gfx908_s;
   1705     case EF_AMDGPU_MACH_AMDGCN_GFX90a:
   1706       return gcn_gfx90a_s;
   1707     case EF_AMDGPU_MACH_AMDGCN_GFX90c:
   1708       return gcn_gfx90c_s;
   1709     case EF_AMDGPU_MACH_AMDGCN_GFX1030:
   1710       return gcn_gfx1030_s;
   1711     case EF_AMDGPU_MACH_AMDGCN_GFX1036:
   1712       return gcn_gfx1036_s;
   1713     case EF_AMDGPU_MACH_AMDGCN_GFX1100:
   1714       return gcn_gfx1100_s;
   1715     case EF_AMDGPU_MACH_AMDGCN_GFX1103:
   1716       return gcn_gfx1103_s;
   1717     }
   1718   return NULL;
   1719 }
   1720 
   1721 /* Returns the user-facing name that GCC uses to identify the architecture (e.g.
   1722    with -march) or NULL if we do not support the ISA.
   1723    Keep in sync with /gcc/config/gcn/gcn.{c,opt}.  */
   1724 
   1725 static const char*
   1726 isa_gcc_name (int isa) {
   1727   switch(isa)
   1728     {
   1729     case EF_AMDGPU_MACH_AMDGCN_GFX803:
   1730       return "fiji";
   1731     default:
   1732       return isa_hsa_name (isa);
   1733     }
   1734 }
   1735 
   1736 /* Returns the code which is used in the GCN object code to identify the ISA with
   1737    the given name (as used by the HSA runtime).  */
   1738 
   1739 static gcn_isa
   1740 isa_code(const char *isa) {
   1741   if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
   1742     return EF_AMDGPU_MACH_AMDGCN_GFX803;
   1743 
   1744   if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
   1745     return EF_AMDGPU_MACH_AMDGCN_GFX900;
   1746 
   1747   if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
   1748     return EF_AMDGPU_MACH_AMDGCN_GFX906;
   1749 
   1750   if (!strncmp (isa, gcn_gfx908_s, gcn_isa_name_len))
   1751     return EF_AMDGPU_MACH_AMDGCN_GFX908;
   1752 
   1753   if (!strncmp (isa, gcn_gfx90a_s, gcn_isa_name_len))
   1754     return EF_AMDGPU_MACH_AMDGCN_GFX90a;
   1755 
   1756   if (!strncmp (isa, gcn_gfx90c_s, gcn_isa_name_len))
   1757     return EF_AMDGPU_MACH_AMDGCN_GFX90c;
   1758 
   1759   if (!strncmp (isa, gcn_gfx1030_s, gcn_isa_name_len))
   1760     return EF_AMDGPU_MACH_AMDGCN_GFX1030;
   1761 
   1762   if (!strncmp (isa, gcn_gfx1036_s, gcn_isa_name_len))
   1763     return EF_AMDGPU_MACH_AMDGCN_GFX1036;
   1764 
   1765   if (!strncmp (isa, gcn_gfx1100_s, gcn_isa_name_len))
   1766     return EF_AMDGPU_MACH_AMDGCN_GFX1100;
   1767 
   1768   if (!strncmp (isa, gcn_gfx1103_s, gcn_isa_name_len))
   1769     return EF_AMDGPU_MACH_AMDGCN_GFX1103;
   1770 
   1771   return EF_AMDGPU_MACH_UNSUPPORTED;
   1772 }
   1773 
   1774 /* CDNA2 devices have twice as many VGPRs compared to older devices.  */
   1775 
   1776 static int
   1777 max_isa_vgprs (int isa)
   1778 {
   1779   switch (isa)
   1780     {
   1781     case EF_AMDGPU_MACH_AMDGCN_GFX803:
   1782     case EF_AMDGPU_MACH_AMDGCN_GFX900:
   1783     case EF_AMDGPU_MACH_AMDGCN_GFX906:
   1784     case EF_AMDGPU_MACH_AMDGCN_GFX908:
   1785       return 256;
   1786     case EF_AMDGPU_MACH_AMDGCN_GFX90a:
   1787       return 512;
   1788     case EF_AMDGPU_MACH_AMDGCN_GFX90c:
   1789       return 256;
   1790     case EF_AMDGPU_MACH_AMDGCN_GFX1030:
   1791     case EF_AMDGPU_MACH_AMDGCN_GFX1036:
   1792       return 512;  /* 512 SIMD32 = 256 wavefrontsize64.  */
   1793     case EF_AMDGPU_MACH_AMDGCN_GFX1100:
   1794     case EF_AMDGPU_MACH_AMDGCN_GFX1103:
   1795       return 1536; /* 1536 SIMD32 = 768 wavefrontsize64.  */
   1796     }
   1797   GOMP_PLUGIN_fatal ("unhandled ISA in max_isa_vgprs");
   1798 }
   1799 
   1800 /* }}}  */
   1801 /* {{{ Run  */
   1802 
   1803 /* Create or reuse a team arena and stack space.
   1804 
   1805    Team arenas are used by OpenMP to avoid calling malloc multiple times
   1806    while setting up each team.  This is purely a performance optimization.
   1807 
   1808    The stack space is used by all kernels.  We must allocate it in such a
   way that the reverse offload implementation can access the data.
   1810 
   1811    Allocating this memory costs performance, so this function will reuse an
   1812    existing allocation if a large enough one is idle.
   1813    The memory lock is released, but not deallocated, when the kernel exits.  */
   1814 
static void
configure_ephemeral_memories (struct kernel_info *kernel,
			      struct kernargs_abi *kernargs, int num_teams,
			      int num_threads)
{
  struct agent_info *agent = kernel->agent;
  struct ephemeral_memories_list **next_ptr = &agent->ephemeral_memories_list;
  struct ephemeral_memories_list *item;

  /* Only OpenMP kernels get a team arena; OpenACC and unknown kernels
     allocate stack space only.  */
  int actual_arena_size = (kernel->kind == KIND_OPENMP
			   ? team_arena_size : 0);
  int actual_arena_total_size = actual_arena_size * num_teams;
  /* One contiguous allocation: all team arenas first, then all stacks.  */
  size_t size = (actual_arena_total_size
		 + num_teams * num_threads * stack_size);

  /* Scan the list for an idle allocation that is large enough.  trylock
     claims the entry atomically; a busy entry is simply skipped.
     next_ptr trails the scan so it ends up pointing at the list tail
     when nothing is found.  */
  for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
    {
      if (item->size < size)
	continue;

      if (pthread_mutex_trylock (&item->in_use) == 0)
	break;
    }

  if (!item)
    {
      /* No reusable allocation: create a new list entry.  */
      GCN_DEBUG ("Creating a new %sstack for %d teams with %d threads"
		 " (%zd bytes)\n", (actual_arena_size ? "arena and " : ""),
		 num_teams, num_threads, size);

      /* The write lock serializes list appends against other threads
	 doing the same.  */
      if (pthread_mutex_lock (&agent->ephemeral_memories_write_lock))
	{
	  GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
	  return;
	}
      item = malloc (sizeof (*item));
      item->size = size;
      item->next = NULL;
      *next_ptr = item;

      /* Take the in_use lock before publishing is complete; it is held
	 until release_ephemeral_memories is called after the kernel
	 exits.  */
      if (pthread_mutex_init (&item->in_use, NULL))
	{
	  GOMP_PLUGIN_error ("Failed to initialize a GCN memory write mutex");
	  return;
	}
      if (pthread_mutex_lock (&item->in_use))
	{
	  GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
	  return;
	}
      if (pthread_mutex_unlock (&agent->ephemeral_memories_write_lock))
	{
	  GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
	  return;
	}

      /* Allocate the device memory and make it writable by the agent.  */
      hsa_status_t status;
      status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, size,
					       &item->address);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
      status = hsa_fns.hsa_memory_assign_agent_fn (item->address, agent->id,
						   HSA_ACCESS_PERMISSION_RW);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not assign arena & stack memory to device", status);
    }

  /* Publish the layout to the kernel via the kernargs ABI block: the
     arenas occupy the start of the allocation, the stacks follow.  */
  kernargs->arena_ptr = (actual_arena_total_size
			 ? (uint64_t)item->address
			 : 0);
  kernargs->stack_ptr = (uint64_t)item->address + actual_arena_total_size;
  kernargs->arena_size_per_team = actual_arena_size;
  kernargs->stack_size_per_thread = stack_size;
}
   1889 
   1890 /* Mark an ephemeral memory space available for reuse.  */
   1891 
   1892 static void
   1893 release_ephemeral_memories (struct agent_info* agent, void *address)
   1894 {
   1895   struct ephemeral_memories_list *item;
   1896 
   1897   for (item = agent->ephemeral_memories_list; item; item = item->next)
   1898     {
   1899       if (item->address == address)
   1900 	{
   1901 	  if (pthread_mutex_unlock (&item->in_use))
   1902 	    GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
   1903 	  return;
   1904 	}
   1905     }
   1906   GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
   1907 }
   1908 
   1909 /* Clean up all the allocated team arenas.  */
   1910 
   1911 static bool
   1912 destroy_ephemeral_memories (struct agent_info *agent)
   1913 {
   1914   struct ephemeral_memories_list *item, *next;
   1915 
   1916   for (item = agent->ephemeral_memories_list; item; item = next)
   1917     {
   1918       next = item->next;
   1919       hsa_fns.hsa_memory_free_fn (item->address);
   1920       if (pthread_mutex_destroy (&item->in_use))
   1921 	{
   1922 	  GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
   1923 	  return false;
   1924 	}
   1925       free (item);
   1926     }
   1927   agent->ephemeral_memories_list = NULL;
   1928 
   1929   return true;
   1930 }
   1931 
   1932 /* Allocate memory on a specified device.  */
   1933 
   1934 static void *
   1935 alloc_by_agent (struct agent_info *agent, size_t size)
   1936 {
   1937   GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);
   1938 
   1939   void *ptr;
   1940   hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
   1941 							size, &ptr);
   1942   if (status != HSA_STATUS_SUCCESS)
   1943     {
   1944       hsa_error ("Could not allocate device memory", status);
   1945       return NULL;
   1946     }
   1947 
   1948   status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
   1949 					       HSA_ACCESS_PERMISSION_RW);
   1950   if (status != HSA_STATUS_SUCCESS)
   1951     {
   1952       hsa_error ("Could not assign data memory to device", status);
   1953       return NULL;
   1954     }
   1955 
   1956   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
   1957   bool profiling_dispatch_p
   1958     = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
   1959   if (profiling_dispatch_p)
   1960     {
   1961       acc_prof_info *prof_info = thr->prof_info;
   1962       acc_event_info data_event_info;
   1963       acc_api_info *api_info = thr->api_info;
   1964 
   1965       prof_info->event_type = acc_ev_alloc;
   1966 
   1967       data_event_info.data_event.event_type = prof_info->event_type;
   1968       data_event_info.data_event.valid_bytes
   1969 	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
   1970       data_event_info.data_event.parent_construct
   1971 	= acc_construct_parallel;
   1972       data_event_info.data_event.implicit = 1;
   1973       data_event_info.data_event.tool_info = NULL;
   1974       data_event_info.data_event.var_name = NULL;
   1975       data_event_info.data_event.bytes = size;
   1976       data_event_info.data_event.host_ptr = NULL;
   1977       data_event_info.data_event.device_ptr = (void *) ptr;
   1978 
   1979       api_info->device_api = acc_device_api_other;
   1980 
   1981       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
   1982 					    api_info);
   1983     }
   1984 
   1985   return ptr;
   1986 }
   1987 
   1988 /* Create kernel dispatch data structure for given KERNEL, along with
   1989    the necessary device signals and memory allocations.  */
   1990 
   1991 static struct kernel_dispatch *
   1992 create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
   1993 			int num_threads)
   1994 {
   1995   struct agent_info *agent = kernel->agent;
   1996   struct kernel_dispatch *shadow
   1997     = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));
   1998 
   1999   shadow->agent = kernel->agent;
   2000   shadow->object = kernel->object;
   2001 
   2002   hsa_signal_t sync_signal;
   2003   hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
   2004   if (status != HSA_STATUS_SUCCESS)
   2005     hsa_fatal ("Error creating the GCN sync signal", status);
   2006 
   2007   shadow->signal = sync_signal.handle;
   2008   shadow->private_segment_size = kernel->private_segment_size;
   2009 
   2010   if (lowlat_size < 0)
   2011     {
   2012       /* Divide the LDS between the number of running teams.
   2013 	 Allocate not less than is defined in the kernel metadata.  */
   2014       int teams_per_cu = num_teams / get_cu_count (agent);
   2015       int LDS_per_team = (teams_per_cu ? 65536 / teams_per_cu : 65536);
   2016       shadow->group_segment_size
   2017 	= (kernel->group_segment_size > LDS_per_team
   2018 	   ? kernel->group_segment_size
   2019 	   : LDS_per_team);;
   2020     }
   2021   else if (lowlat_size < GCN_LOWLAT_HEAP+8)
   2022     /* Ensure that there's space for the OpenMP libgomp data.  */
   2023     shadow->group_segment_size = GCN_LOWLAT_HEAP+8;
   2024   else
   2025     shadow->group_segment_size = (lowlat_size > 65536
   2026 				  ? 65536
   2027 				  : lowlat_size);
   2028 
   2029   /* We expect kernels to request a single pointer, explicitly, and the
   2030      rest of struct kernargs, implicitly.  If they request anything else
   2031      then something is wrong.  */
   2032   if (kernel->kernarg_segment_size > 8)
   2033     {
   2034       GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
   2035       return NULL;
   2036     }
   2037 
   2038   status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
   2039 					   sizeof (struct kernargs),
   2040 					   &shadow->kernarg_address);
   2041   if (status != HSA_STATUS_SUCCESS)
   2042     hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
   2043   struct kernargs *kernargs = shadow->kernarg_address;
   2044 
   2045   /* Zero-initialize the output_data (minimum needed).  */
   2046   kernargs->abi.out_ptr = (int64_t)&kernargs->output_data;
   2047   kernargs->output_data.next_output = 0;
   2048   for (unsigned i = 0;
   2049        i < (sizeof (kernargs->output_data.queue)
   2050 	    / sizeof (kernargs->output_data.queue[0]));
   2051        i++)
   2052     kernargs->output_data.queue[i].written = 0;
   2053   kernargs->output_data.consumed = 0;
   2054 
   2055   /* Pass in the heap location.  */
   2056   kernargs->abi.heap_ptr = (int64_t)kernel->module->heap;
   2057 
   2058   /* Create the ephemeral memory spaces.  */
   2059   configure_ephemeral_memories (kernel, &kernargs->abi, num_teams, num_threads);
   2060 
   2061   /* Ensure we can recognize unset return values.  */
   2062   kernargs->output_data.return_value = 0xcafe0000;
   2063 
   2064   return shadow;
   2065 }
   2066 
   2067 static void
   2068 process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
   2069 			 uint64_t sizes, uint64_t kinds, uint64_t dev_num64)
   2070 {
   2071   int dev_num = dev_num64;
   2072   GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
   2073 			  NULL);
   2074 }
   2075 
   2076 /* Output any data written to console output from the kernel.  It is expected
   2077    that this function is polled during kernel execution.
   2078 
   2079    We print all entries from the last item printed to the next entry without
   2080    a "written" flag.  If the "final" flag is set then it'll continue right to
   2081    the end.
   2082 
   2083    The print buffer is circular, but the from and to locations don't wrap when
   2084    the buffer does, so the output limit is UINT_MAX.  The target blocks on
   2085    output when the buffer is full.  */
   2086 
static void
console_output (struct kernel_info *kernel, struct kernargs *kernargs,
		bool final)
{
  /* Number of slots in the circular queue.  */
  unsigned int limit = (sizeof (kernargs->output_data.queue)
			/ sizeof (kernargs->output_data.queue[0]));

  /* FROM..TO is the window of unprinted entries; the indices only ever
     grow (they wrap via "% limit" at access time, not here).  */
  unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
				       __ATOMIC_ACQUIRE);
  unsigned int to = kernargs->output_data.next_output;

  if (from > to)
    {
      /* Overflow.  */
      if (final)
	printf ("GCN print buffer overflowed.\n");
      return;
    }

  unsigned int i;
  for (i = from; i < to; i++)
    {
      struct printf_data *data = &kernargs->output_data.queue[i%limit];

      /* An unwritten slot means the device hasn't finished it yet;
	 stop unless this is the final drain after kernel exit.  */
      if (!data->written && !final)
	break;

      /* TYPE selects the payload encoding written by the device.
	 Type 4 is a reverse-offload request rather than text.  */
      switch (data->type)
	{
	case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
	case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
	case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
	case 3: printf ("%.128s%.128s", data->msg, data->text); break;
	case 4:
	  process_reverse_offload (data->value_u64[0], data->value_u64[1],
				   data->value_u64[2], data->value_u64[3],
				   data->value_u64[4], data->value_u64[5]);
	  break;
	default: printf ("GCN print buffer error!\n"); break;
	}
      /* Clear the written flag before publishing the new consumed index
	 (release store), handing the slot back to the device.  */
      data->written = 0;
      __atomic_store_n (&kernargs->output_data.consumed, i+1,
			__ATOMIC_RELEASE);
    }
  fflush (stdout);
}
   2133 
   2134 /* Release data structure created for a kernel dispatch in SHADOW argument,
   2135    and clean up the signal and memory allocations.  */
   2136 
   2137 static void
   2138 release_kernel_dispatch (struct kernel_dispatch *shadow)
   2139 {
   2140   GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
   2141 
   2142   struct kernargs *kernargs = shadow->kernarg_address;
   2143   void *addr = (void *)kernargs->abi.arena_ptr;
   2144   if (!addr)
   2145     addr = (void *)kernargs->abi.stack_ptr;
   2146   release_ephemeral_memories (shadow->agent, addr);
   2147 
   2148   hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
   2149 
   2150   hsa_signal_t s;
   2151   s.handle = shadow->signal;
   2152   hsa_fns.hsa_signal_destroy_fn (s);
   2153 
   2154   free (shadow);
   2155 }
   2156 
   2157 /* Extract the properties from a kernel binary.  */
   2158 
static void
init_kernel_properties (struct kernel_info *kernel)
{
  hsa_status_t status;
  struct agent_info *agent = kernel->agent;
  hsa_executable_symbol_t kernel_symbol;
  /* +4 covers the ".kd" suffix plus the terminating NUL.  */
  char *buf = alloca (strlen (kernel->name) + 4);
  sprintf (buf, "%s.kd", kernel->name);
  /* Kernel descriptors are named "<kernel>.kd" in the executable.  */
  status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						 buf, agent->id,
						 0, &kernel_symbol);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_warn ("Could not find symbol for kernel in the code object", status);
      fprintf (stderr, "not found name: '%s'\n", buf);
      dump_executable_symbols (agent->executable);
      goto failure;
    }
  GCN_DEBUG ("Located kernel %s\n", kernel->name);
  /* Query the dispatch handle and the three segment sizes the AQL
     packet will need.  Failures past this point are fatal, unlike the
     missing-symbol case above.  */
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not extract a kernel object from its symbol", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
     &kernel->kernarg_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel argument size", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
     &kernel->group_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel group segment size", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
     &kernel->private_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel private segment size",
	       status);

  /* The kernel type is not known until something tries to launch it.  */
  kernel->kind = KIND_UNKNOWN;

  GCN_DEBUG ("Kernel structure for %s fully initialized with "
	     "following segment sizes: \n", kernel->name);
  GCN_DEBUG ("  group_segment_size: %u\n",
	     (unsigned) kernel->group_segment_size);
  GCN_DEBUG ("  private_segment_size: %u\n",
	     (unsigned) kernel->private_segment_size);
  GCN_DEBUG ("  kernarg_segment_size: %u\n",
	     (unsigned) kernel->kernarg_segment_size);
  return;

failure:
  /* Mark the kernel unusable; init_kernel checks this flag.  */
  kernel->initialization_failed = true;
}
   2215 
   2216 /* Do all the work that is necessary before running KERNEL for the first time.
   2217    The function assumes the program has been created, finalized and frozen by
   2218    create_and_finalize_hsa_program.  */
   2219 
   2220 static void
   2221 init_kernel (struct kernel_info *kernel)
   2222 {
   2223   if (pthread_mutex_lock (&kernel->init_mutex))
   2224     GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
   2225   if (kernel->initialized)
   2226     {
   2227       if (pthread_mutex_unlock (&kernel->init_mutex))
   2228 	GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
   2229 			   "mutex");
   2230 
   2231       return;
   2232     }
   2233 
   2234   init_kernel_properties (kernel);
   2235 
   2236   if (!kernel->initialization_failed)
   2237     {
   2238       GCN_DEBUG ("\n");
   2239 
   2240       kernel->initialized = true;
   2241     }
   2242   if (pthread_mutex_unlock (&kernel->init_mutex))
   2243     GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
   2244 		       "mutex");
   2245 }
   2246 
   2247 /* Run KERNEL on its agent, pass VARS to it as arguments and take
   2248    launch attributes from KLA.
   2249 
   2250    MODULE_LOCKED indicates that the caller already holds the lock and
   2251    run_kernel need not lock it again.
   2252    If AQ is NULL then agent->sync_queue will be used.  */
   2253 
static void
run_kernel (struct kernel_info *kernel, void *vars,
	    struct GOMP_kernel_launch_attributes *kla,
	    struct goacc_asyncqueue *aq, bool module_locked)
{
  struct agent_info *agent = kernel->agent;
  GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
	     kernel->description->vpgr_count);

  /* Reduce the number of threads/workers if there are insufficient
     VGPRs available to run the kernels together.  */
  if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
    {
      int max_vgprs = max_isa_vgprs (agent->device_isa);
      /* VGPRs are allocated in granules of 4.  */
      int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
      int max_threads = (max_vgprs / granulated_vgprs) * 4;
      if (kla->gdims[2] > max_threads)
	{
	  GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
		       " per team/gang - reducing to %d threads/workers.\n",
		       kla->gdims[2], max_threads);
	  kla->gdims[2] = max_threads;
	}
    }

  GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
	     (aq ? aq->id : 0));
  GCN_DEBUG ("GCN launch attribs: gdims:[");
  int i;
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->gdims[i]);
    }
  DEBUG_PRINT ("], normalized gdims:[");
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
    }
  DEBUG_PRINT ("], wdims:[");
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->wdims[i]);
    }
  DEBUG_PRINT ("]\n");
  DEBUG_FLUSH ();

  if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
    GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");

  if (!agent->initialized)
    GOMP_PLUGIN_fatal ("Agent must be initialized");

  if (!kernel->initialized)
    GOMP_PLUGIN_fatal ("Called kernel must be initialized");

  /* Without an async queue, dispatch on the agent's synchronous queue.  */
  hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);

  /* Reserve a packet slot in the queue.  */
  uint64_t index
    = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
  GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);

  /* Wait until the queue is not full before writing the packet.   */
  while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
	 >= command_q->size)
    ;

  /* Do not allow the dimensions to be overridden when running
     constructors or destructors.  */
  int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
  int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;

  hsa_kernel_dispatch_packet_t *packet;
  packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
	   + index % command_q->size;

  /* Skip the first 4 bytes (header + setup); those are only written at
     the very end, via packet_store_release, which publishes the packet
     to the device.  */
  memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
  /* "x ? : y" is the GNU elvis operator: use the override if nonzero.  */
  packet->grid_size_x = override_x ? : kla->gdims[0];
  packet->workgroup_size_x = get_group_size (kla->ndim,
					     packet->grid_size_x,
					     kla->wdims[0]);

  if (kla->ndim >= 2)
    {
      packet->grid_size_y = kla->gdims[1];
      packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
						 kla->wdims[1]);
    }
  else
    {
      packet->grid_size_y = 1;
      packet->workgroup_size_y = 1;
    }

  if (kla->ndim == 3)
    {
      packet->grid_size_z = limit_worker_threads (override_z
						  ? : kla->gdims[2]);
      packet->workgroup_size_z = get_group_size (kla->ndim,
						 packet->grid_size_z,
						 kla->wdims[2]);
    }
  else
    {
      packet->grid_size_z = 1;
      packet->workgroup_size_z = 1;
    }

  GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
	     " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
	     packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
	     packet->grid_size_x / packet->workgroup_size_x,
	     packet->grid_size_y / packet->workgroup_size_y,
	     packet->grid_size_z / packet->workgroup_size_z,
	     packet->workgroup_size_x, packet->workgroup_size_y,
	     packet->workgroup_size_z);

  /* Allocate signal, kernargs and ephemeral memories for this launch.  */
  struct kernel_dispatch *shadow
    = create_kernel_dispatch (kernel, packet->grid_size_x,
			      packet->grid_size_z);
  shadow->queue = command_q;

  if (debug)
    {
      fprintf (stderr, "\nKernel has following dependencies:\n");
      print_kernel_dispatch (shadow, 2);
    }

  packet->private_segment_size = shadow->private_segment_size;
  packet->group_segment_size = shadow->group_segment_size;
  packet->kernel_object = shadow->object;
  packet->kernarg_address = shadow->kernarg_address;
  hsa_signal_t s;
  s.handle = shadow->signal;
  packet->completion_signal = s;
  hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
  /* The kernel's single explicit argument is the VARS pointer.  */
  memcpy (shadow->kernarg_address, &vars, sizeof (vars));

  GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");

  uint16_t header;
  header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;

  GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
	     agent->device_id);

  /* Atomically store header+setup, making the packet valid, then ring
     the doorbell so the device picks it up.  */
  packet_store_release ((uint32_t *) packet, header,
			(uint16_t) kla->ndim
			<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);

  hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
				       index);

  GCN_DEBUG ("Kernel dispatched, waiting for completion\n");

  /* Root signal waits with 1ms timeout.  */
  while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
					     1000 * 1000,
					     HSA_WAIT_STATE_BLOCKED) != 0)
    {
      /* Drain the device's print buffer while the kernel runs.  */
      console_output (kernel, shadow->kernarg_address, false);
    }
  console_output (kernel, shadow->kernarg_address, true);

  /* Capture the return value before the kernargs memory is freed.  */
  struct kernargs *kernargs = shadow->kernarg_address;
  unsigned int return_value = (unsigned int)kernargs->output_data.return_value;

  release_kernel_dispatch (shadow);

  if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
    GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");

  /* The top 16 bits encode how the kernel terminated; 0xcafe is the
     sentinel set by create_kernel_dispatch, 0xffff means the kernel
     called exit.  */
  unsigned int upper = (return_value & ~0xffff) >> 16;
  if (upper == 0xcafe)
    ; // exit not called, normal termination.
  else if (upper == 0xffff)
    ; // exit called.
  else
    {
      GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
			 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
			 return_value);
      abort ();
    }

  if (upper == 0xffff)
    {
      /* Bits 8-15 carry a signal number, bits 0-7 the exit status.  */
      unsigned int signal = (return_value >> 8) & 0xff;

      if (signal == SIGABRT)
	{
	  GCN_WARNING ("GCN Kernel aborted\n");
	  abort ();
	}
      else if (signal != 0)
	{
	  GCN_WARNING ("GCN Kernel received unknown signal\n");
	  abort ();
	}

      GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
      exit (return_value & 0xff);
    }
}
   2465 
   2466 /* }}}  */
   2467 /* {{{ Load/Unload  */
   2468 
   2469 /* Initialize KERNEL from D and other parameters.  Return true on success. */
   2470 
   2471 static bool
   2472 init_basic_kernel_info (struct kernel_info *kernel,
   2473 			struct hsa_kernel_description *d,
   2474 			struct agent_info *agent,
   2475 			struct module_info *module)
   2476 {
   2477   kernel->agent = agent;
   2478   kernel->module = module;
   2479   kernel->name = d->name;
   2480   kernel->description = d;
   2481   if (pthread_mutex_init (&kernel->init_mutex, NULL))
   2482     {
   2483       GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
   2484       return false;
   2485     }
   2486   return true;
   2487 }
   2488 
   2489 /* Check that the GCN ISA of the given image matches the ISA of the agent. */
   2490 
   2491 static bool
   2492 isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
   2493 {
   2494   int isa_field = elf_gcn_isa_field (image);
   2495   const char* isa_s = isa_hsa_name (isa_field);
   2496   if (!isa_s)
   2497     {
   2498       hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
   2499       return false;
   2500     }
   2501 
   2502   if (isa_field != agent->device_isa)
   2503     {
   2504       char msg[120];
   2505       const char *agent_isa_s = isa_hsa_name (agent->device_isa);
   2506       const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
   2507       assert (agent_isa_s);
   2508       assert (agent_isa_gcc_s);
   2509 
   2510       snprintf (msg, sizeof msg,
   2511 		"GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
   2512 		"Try to recompile with '-foffload-options=-march=%s'.\n",
   2513 		isa_s, agent_isa_s, agent_isa_gcc_s);
   2514 
   2515       hsa_error (msg, HSA_STATUS_ERROR);
   2516       return false;
   2517     }
   2518 
   2519   return true;
   2520 }
   2521 
/* Create and finalize the program consisting of all loaded modules.  Creates
   the HSA executable for AGENT, loads the GCN code object of the agent's
   module (if any), allocates and publishes the device heap, and freezes the
   executable.  Serialized by AGENT->prog_mutex; returns true on success.
   NOTE: AGENT->prog_finalized is set to true even on the failure path (see
   "fail:" below), so a failed finalization is not retried on later calls.  */

static bool
create_and_finalize_hsa_program (struct agent_info *agent)
{
  hsa_status_t status;
  bool res = true;
  if (pthread_mutex_lock (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
      return false;
    }
  /* Another thread may have finalized the program already.  */
  if (agent->prog_finalized)
    goto final;

  status
    = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
					HSA_EXECUTABLE_STATE_UNFROZEN,
					"", &agent->executable);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not create GCN executable", status);
      goto fail;
    }

  /* Load any GCN modules.  */
  struct module_info *module = agent->module;
  if (module)
    {
      Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;

      /* Refuse to load a code object built for a different GPU ISA.  */
      if (!isa_matches_agent (agent, image))
	goto fail;

      hsa_code_object_t co = { 0 };
      status = hsa_fns.hsa_code_object_deserialize_fn
	(module->image_desc->gcn_image->image,
	 module->image_desc->gcn_image->size,
	 NULL, &co);
      if (status != HSA_STATUS_SUCCESS)
	{
	  hsa_error ("Could not deserialize GCN code object", status);
	  goto fail;
	}

      status = hsa_fns.hsa_executable_load_code_object_fn
	(agent->executable, agent->id, co, "");
      if (status != HSA_STATUS_SUCCESS)
	{
	  hsa_error ("Could not load GCN code object", status);
	  goto fail;
	}

      /* Allocate the device-side heap once per module and make it
	 writable by the agent.  */
      if (!module->heap)
	{
	  status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
						   gcn_kernel_heap_size,
						   (void**)&module->heap);
	  if (status != HSA_STATUS_SUCCESS)
	    {
	      hsa_error ("Could not allocate memory for GCN heap", status);
	      goto fail;
	    }

	  status = hsa_fns.hsa_memory_assign_agent_fn
			(module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
	  if (status != HSA_STATUS_SUCCESS)
	    {
	      hsa_error ("Could not assign GCN heap memory to device", status);
	      goto fail;
	    }

	  /* Publish the heap size in the heap's own header so device code
	     can see how much is available.  */
	  hsa_fns.hsa_memory_copy_fn (&module->heap->size,
				      &gcn_kernel_heap_size,
				      sizeof (gcn_kernel_heap_size));
	}

    }

  if (debug)
    dump_executable_symbols (agent->executable);

  /* Freezing makes the executable ready for symbol queries and dispatch.  */
  status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not freeze the GCN executable", status);
      goto fail;
    }

final:
  agent->prog_finalized = true;

  if (pthread_mutex_unlock (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
      res = false;
    }

  return res;

fail:
  res = false;
  goto final;
}
   2626 
   2627 /* Free the HSA program in agent and everything associated with it and set
   2628    agent->prog_finalized and the initialized flags of all kernels to false.
   2629    Return TRUE on success.  */
   2630 
   2631 static bool
   2632 destroy_hsa_program (struct agent_info *agent)
   2633 {
   2634   if (!agent->prog_finalized)
   2635     return true;
   2636 
   2637   hsa_status_t status;
   2638 
   2639   GCN_DEBUG ("Destroying the current GCN program.\n");
   2640 
   2641   status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
   2642   if (status != HSA_STATUS_SUCCESS)
   2643     return hsa_error ("Could not destroy GCN executable", status);
   2644 
   2645   if (agent->module)
   2646     {
   2647       int i;
   2648       for (i = 0; i < agent->module->kernel_count; i++)
   2649 	agent->module->kernels[i].initialized = false;
   2650 
   2651       if (agent->module->heap)
   2652 	{
   2653 	  hsa_fns.hsa_memory_free_fn (agent->module->heap);
   2654 	  agent->module->heap = NULL;
   2655 	}
   2656     }
   2657   agent->prog_finalized = false;
   2658   return true;
   2659 }
   2660 
   2661 /* Deinitialize all information associated with MODULE and kernels within
   2662    it.  Return TRUE on success.  */
   2663 
   2664 static bool
   2665 destroy_module (struct module_info *module, bool locked)
   2666 {
   2667   /* Run destructors before destroying module.  */
   2668   struct GOMP_kernel_launch_attributes kla =
   2669     { 3,
   2670       /* Grid size.  */
   2671       { 1, 64, 1 },
   2672       /* Work-group size.  */
   2673       { 1, 64, 1 }
   2674     };
   2675 
   2676   if (module->fini_array_func)
   2677     {
   2678       init_kernel (module->fini_array_func);
   2679       run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
   2680     }
   2681   module->constructors_run_p = false;
   2682 
   2683   int i;
   2684   for (i = 0; i < module->kernel_count; i++)
   2685     if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
   2686       {
   2687 	GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
   2688 			   "mutex");
   2689 	return false;
   2690       }
   2691 
   2692   return true;
   2693 }
   2694 
   2695 /* }}}  */
   2696 /* {{{ Async  */
   2697 
/* Execute the queue entry at position INDEX of async queue AQ.  Dispatches
   on the entry type: kernel launches and callbacks are run directly;
   ASYNC_WAIT blocks until another queue's placeholder has been executed;
   ASYNC_PLACEHOLDER flags this queue's own placeholder as executed and
   wakes any waiter.  */

static void
execute_queue_entry (struct goacc_asyncqueue *aq, int index)
{
  struct queue_entry *entry = &aq->queue[index];

  switch (entry->type)
    {
    case KERNEL_LAUNCH:
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
		   aq->agent->device_id, aq->id, index);
      run_kernel (entry->u.launch.kernel,
		  entry->u.launch.vars,
		  &entry->u.launch.kla, aq, false);
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
		   aq->agent->device_id, aq->id, index);
      break;

    case CALLBACK:
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
		   aq->agent->device_id, aq->id, index);
      entry->u.callback.fn (entry->u.callback.data);
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
		   aq->agent->device_id, aq->id, index);
      break;

    case ASYNC_WAIT:
      {
	/* FIXME: is it safe to access a placeholder that may already have
	   been executed?  */
        struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;

	if (DEBUG_QUEUES)
          GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
		     aq->agent->device_id, aq->id, index);

	pthread_mutex_lock (&placeholderp->mutex);

	/* Sleep until the other queue's thread marks the placeholder as
	   executed (see the ASYNC_PLACEHOLDER case below).  */
	while (!placeholderp->executed)
          pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);

	pthread_mutex_unlock (&placeholderp->mutex);

	/* The placeholder has served its purpose; tear down its
	   synchronization objects.  */
	if (pthread_cond_destroy (&placeholderp->cond))
	  GOMP_PLUGIN_error ("Failed to destroy serialization cond");

	if (pthread_mutex_destroy (&placeholderp->mutex))
	  GOMP_PLUGIN_error ("Failed to destroy serialization mutex");

	if (DEBUG_QUEUES)
          GCN_DEBUG ("Async thread %d:%d: Executing async wait "
		     "entry (%d) done\n", aq->agent->device_id, aq->id, index);
      }
      break;

    case ASYNC_PLACEHOLDER:
      /* Signal any thread blocked in the ASYNC_WAIT case above.  */
      pthread_mutex_lock (&entry->u.placeholder.mutex);
      entry->u.placeholder.executed = 1;
      pthread_cond_signal (&entry->u.placeholder.cond);
      pthread_mutex_unlock (&entry->u.placeholder.mutex);
      break;

    default:
      GOMP_PLUGIN_fatal ("Unknown queue element");
    }
}
   2769 
/* This function is run as a thread to service an async queue in the
   background.  It runs continuously until the stop flag is set.
   THREAD_ARG is the queue's struct goacc_asyncqueue.
   aq->drain_queue_stop protocol: 0 = running, 1 = stop requested
   (by finalize_async_thread), 2 = thread has stopped.  */

static void *
drain_queue (void *thread_arg)
{
  struct goacc_asyncqueue *aq = thread_arg;

  if (DRAIN_QUEUE_SYNCHRONOUS_P)
    {
      /* Debug mode: entries are drained synchronously by wait_queue
	 instead, so mark this thread as already stopped and exit.  */
      aq->drain_queue_stop = 2;
      return NULL;
    }

  pthread_mutex_lock (&aq->mutex);

  while (true)
    {
      if (aq->drain_queue_stop)
	break;

      if (aq->queue_n > 0)
	{
	  /* Execute the head entry with the mutex dropped so that new
	     entries can be pushed concurrently; the queue indices are
	     only updated afterwards, under the lock again.  */
	  pthread_mutex_unlock (&aq->mutex);
	  execute_queue_entry (aq, aq->queue_first);

	  pthread_mutex_lock (&aq->mutex);
	  aq->queue_first = ((aq->queue_first + 1)
			     % ASYNC_QUEUE_SIZE);
	  aq->queue_n--;

	  /* Wake any thread waiting for the queue to shrink
	     (wait_for_queue_nonfull / wait_queue).  */
	  if (DEBUG_THREAD_SIGNAL)
	    GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
		       aq->agent->device_id, aq->id);
	  pthread_cond_broadcast (&aq->queue_cond_out);
	  pthread_mutex_unlock (&aq->mutex);

	  if (DEBUG_QUEUES)
	    GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
		       aq->id);
	  pthread_mutex_lock (&aq->mutex);
	}
      else
	{
	  /* Queue is empty: sleep until a producer signals cond_in.  */
	  if (DEBUG_THREAD_SLEEP)
	    GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
		       aq->agent->device_id, aq->id);
	  pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
	  if (DEBUG_THREAD_SLEEP)
	    GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
		       aq->agent->device_id, aq->id);
	}
    }

  /* Tell finalize_async_thread we have stopped.  */
  aq->drain_queue_stop = 2;
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_broadcast (&aq->queue_cond_out);
  pthread_mutex_unlock (&aq->mutex);

  GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
  return NULL;
}
   2834 
   2835 /* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
   2836    is not usually the case.  This is just a debug tool.  */
   2837 
   2838 static void
   2839 drain_queue_synchronous (struct goacc_asyncqueue *aq)
   2840 {
   2841   pthread_mutex_lock (&aq->mutex);
   2842 
   2843   while (aq->queue_n > 0)
   2844     {
   2845       execute_queue_entry (aq, aq->queue_first);
   2846 
   2847       aq->queue_first = ((aq->queue_first + 1)
   2848 			 % ASYNC_QUEUE_SIZE);
   2849       aq->queue_n--;
   2850     }
   2851 
   2852   pthread_mutex_unlock (&aq->mutex);
   2853 }
   2854 
   2855 /* Block the current thread until an async queue is writable.  The aq->mutex
   2856    lock should be held on entry, and remains locked on exit.  */
   2857 
   2858 static void
   2859 wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
   2860 {
   2861   if (aq->queue_n == ASYNC_QUEUE_SIZE)
   2862     {
   2863       /* Queue is full.  Wait for it to not be full.  */
   2864       while (aq->queue_n == ASYNC_QUEUE_SIZE)
   2865 	pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
   2866     }
   2867 }
   2868 
   2869 /* Request an asynchronous kernel launch on the specified queue.  This
   2870    may block if the queue is full, but returns without waiting for the
   2871    kernel to run.  */
   2872 
   2873 static void
   2874 queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
   2875 		   void *vars, struct GOMP_kernel_launch_attributes *kla)
   2876 {
   2877   assert (aq->agent == kernel->agent);
   2878 
   2879   pthread_mutex_lock (&aq->mutex);
   2880 
   2881   wait_for_queue_nonfull (aq);
   2882 
   2883   int queue_last = ((aq->queue_first + aq->queue_n)
   2884 		    % ASYNC_QUEUE_SIZE);
   2885   if (DEBUG_QUEUES)
   2886     GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
   2887 	       aq->id, queue_last);
   2888 
   2889   aq->queue[queue_last].type = KERNEL_LAUNCH;
   2890   aq->queue[queue_last].u.launch.kernel = kernel;
   2891   aq->queue[queue_last].u.launch.vars = vars;
   2892   aq->queue[queue_last].u.launch.kla = *kla;
   2893 
   2894   aq->queue_n++;
   2895 
   2896   if (DEBUG_THREAD_SIGNAL)
   2897     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
   2898 	       aq->agent->device_id, aq->id);
   2899   pthread_cond_signal (&aq->queue_cond_in);
   2900 
   2901   pthread_mutex_unlock (&aq->mutex);
   2902 }
   2903 
   2904 /* Request an asynchronous callback on the specified queue.  The callback
   2905    function will be called, with the given opaque data, from the appropriate
   2906    async thread, when all previous items on that queue are complete.  */
   2907 
   2908 static void
   2909 queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
   2910 		     void *data)
   2911 {
   2912   pthread_mutex_lock (&aq->mutex);
   2913 
   2914   wait_for_queue_nonfull (aq);
   2915 
   2916   int queue_last = ((aq->queue_first + aq->queue_n)
   2917 		    % ASYNC_QUEUE_SIZE);
   2918   if (DEBUG_QUEUES)
   2919     GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
   2920 	       aq->id, queue_last);
   2921 
   2922   aq->queue[queue_last].type = CALLBACK;
   2923   aq->queue[queue_last].u.callback.fn = fn;
   2924   aq->queue[queue_last].u.callback.data = data;
   2925 
   2926   aq->queue_n++;
   2927 
   2928   if (DEBUG_THREAD_SIGNAL)
   2929     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
   2930 	       aq->agent->device_id, aq->id);
   2931   pthread_cond_signal (&aq->queue_cond_in);
   2932 
   2933   pthread_mutex_unlock (&aq->mutex);
   2934 }
   2935 
   2936 /* Request that a given async thread wait for another thread (unspecified) to
   2937    reach the given placeholder.  The wait will occur when all previous entries
   2938    on the queue are complete.  A placeholder is effectively a kind of signal
   2939    which simply sets a flag when encountered in a queue.  */
   2940 
   2941 static void
   2942 queue_push_asyncwait (struct goacc_asyncqueue *aq,
   2943 		      struct placeholder *placeholderp)
   2944 {
   2945   pthread_mutex_lock (&aq->mutex);
   2946 
   2947   wait_for_queue_nonfull (aq);
   2948 
   2949   int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
   2950   if (DEBUG_QUEUES)
   2951     GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
   2952 	       aq->id, queue_last);
   2953 
   2954   aq->queue[queue_last].type = ASYNC_WAIT;
   2955   aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
   2956 
   2957   aq->queue_n++;
   2958 
   2959   if (DEBUG_THREAD_SIGNAL)
   2960     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
   2961 	       aq->agent->device_id, aq->id);
   2962   pthread_cond_signal (&aq->queue_cond_in);
   2963 
   2964   pthread_mutex_unlock (&aq->mutex);
   2965 }
   2966 
   2967 /* Add a placeholder into an async queue.  When the async thread reaches the
   2968    placeholder it will set the "executed" flag to true and continue.
   2969    Another thread may be waiting on this thread reaching the placeholder.  */
   2970 
   2971 static struct placeholder *
   2972 queue_push_placeholder (struct goacc_asyncqueue *aq)
   2973 {
   2974   struct placeholder *placeholderp;
   2975 
   2976   pthread_mutex_lock (&aq->mutex);
   2977 
   2978   wait_for_queue_nonfull (aq);
   2979 
   2980   int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
   2981   if (DEBUG_QUEUES)
   2982     GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
   2983 	       aq->id, queue_last);
   2984 
   2985   aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
   2986   placeholderp = &aq->queue[queue_last].u.placeholder;
   2987 
   2988   if (pthread_mutex_init (&placeholderp->mutex, NULL))
   2989     {
   2990       pthread_mutex_unlock (&aq->mutex);
   2991       GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
   2992     }
   2993 
   2994   if (pthread_cond_init (&placeholderp->cond, NULL))
   2995     {
   2996       pthread_mutex_unlock (&aq->mutex);
   2997       GOMP_PLUGIN_error ("Failed to initialize serialization cond");
   2998     }
   2999 
   3000   placeholderp->executed = 0;
   3001 
   3002   aq->queue_n++;
   3003 
   3004   if (DEBUG_THREAD_SIGNAL)
   3005     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
   3006 	       aq->agent->device_id, aq->id);
   3007   pthread_cond_signal (&aq->queue_cond_in);
   3008 
   3009   pthread_mutex_unlock (&aq->mutex);
   3010 
   3011   return placeholderp;
   3012 }
   3013 
/* Signal an asynchronous thread to terminate, and wait for it to do so.
   Uses the aq->drain_queue_stop protocol: 0 = running, 1 = stop requested
   (set here), 2 = the drain thread has stopped (set by drain_queue).  */

static void
finalize_async_thread (struct goacc_asyncqueue *aq)
{
  pthread_mutex_lock (&aq->mutex);
  /* Already stopped; nothing to do.  */
  if (aq->drain_queue_stop == 2)
    {
      pthread_mutex_unlock (&aq->mutex);
      return;
    }

  /* Request the drain thread to stop.  */
  aq->drain_queue_stop = 1;

  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  /* Wait for drain_queue to acknowledge by setting the flag to 2; it
     broadcasts queue_cond_out when it does.  */
  while (aq->drain_queue_stop != 2)
    {
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
		   " to sleep\n", aq->agent->device_id, aq->id);
      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("Waiting, woke up thread %d:%d.  Rechecking\n",
		   aq->agent->device_id, aq->id);
    }

  GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
	     aq->id);
  pthread_mutex_unlock (&aq->mutex);

  /* Reap the thread so its resources are released.  */
  int err = pthread_join (aq->thread_drain_queue, NULL);
  if (err != 0)
    GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
		       aq->agent->device_id, aq->id, strerror (err));
  GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
}
   3054 
   3055 /* Set up an async queue for OpenMP.  There will be only one.  The
   3056    implementation simply uses an OpenACC async queue.
   3057    FIXME: is this thread-safe if two threads call this function?  */
   3058 
   3059 static void
   3060 maybe_init_omp_async (struct agent_info *agent)
   3061 {
   3062   if (!agent->omp_async_queue)
   3063     agent->omp_async_queue
   3064       = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
   3065 }
   3066 
   3067 /* A wrapper that works around an issue in the HSA runtime with host-to-device
   3068    copies from read-only pages.  */
   3069 
   3070 static void
   3071 hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
   3072 {
   3073   hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
   3074 
   3075   if (status == HSA_STATUS_SUCCESS)
   3076     return;
   3077 
   3078   /* It appears that the copy fails if the source data is in a read-only page.
   3079      We can't detect that easily, so try copying the data to a temporary buffer
   3080      and doing the copy again if we got an error above.  */
   3081 
   3082   GCN_WARNING ("Read-only data transfer bug workaround triggered for "
   3083 	       "[%p:+%d]\n", (void *) src, (int) len);
   3084 
   3085   void *src_copy = malloc (len);
   3086   memcpy (src_copy, src, len);
   3087   status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
   3088   free (src_copy);
   3089   if (status != HSA_STATUS_SUCCESS)
   3090     GOMP_PLUGIN_error ("memory copy failed");
   3091 }
   3092 
   3093 /* Copy data to or from a device.  This is intended for use as an async
   3094    callback event.  */
   3095 
   3096 static void
   3097 copy_data (void *data_)
   3098 {
   3099   struct copy_data *data = (struct copy_data *)data_;
   3100   GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
   3101 	     data->aq->agent->device_id, data->aq->id, data->len, data->src,
   3102 	     data->dst);
   3103   hsa_memory_copy_wrapper (data->dst, data->src, data->len);
   3104   free (data);
   3105 }
   3106 
   3107 /* Request an asynchronous data copy, to or from a device, on a given queue.
   3108    The event will be registered as a callback.  */
   3109 
   3110 static void
   3111 queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
   3112 		 size_t len)
   3113 {
   3114   if (DEBUG_QUEUES)
   3115     GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
   3116 	       aq->agent->device_id, aq->id, len, src, dst);
   3117   struct copy_data *data
   3118     = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
   3119   data->dst = dst;
   3120   data->src = src;
   3121   data->len = len;
   3122   data->aq = aq;
   3123   queue_push_callback (aq, copy_data, data);
   3124 }
   3125 
   3126 /* Return true if the given queue is currently empty.  */
   3127 
   3128 static int
   3129 queue_empty (struct goacc_asyncqueue *aq)
   3130 {
   3131   pthread_mutex_lock (&aq->mutex);
   3132   int res = aq->queue_n == 0 ? 1 : 0;
   3133   pthread_mutex_unlock (&aq->mutex);
   3134 
   3135   return res;
   3136 }
   3137 
   3138 /* Wait for a given queue to become empty.  This implements an OpenACC wait
   3139    directive.  */
   3140 
   3141 static void
   3142 wait_queue (struct goacc_asyncqueue *aq)
   3143 {
   3144   if (DRAIN_QUEUE_SYNCHRONOUS_P)
   3145     {
   3146       drain_queue_synchronous (aq);
   3147       return;
   3148     }
   3149 
   3150   pthread_mutex_lock (&aq->mutex);
   3151 
   3152   while (aq->queue_n > 0)
   3153     {
   3154       if (DEBUG_THREAD_SLEEP)
   3155 	GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
   3156 		   aq->agent->device_id, aq->id);
   3157       pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
   3158       if (DEBUG_THREAD_SLEEP)
   3159 	GCN_DEBUG ("thread %d:%d woke up.  Rechecking\n", aq->agent->device_id,
   3160 		   aq->id);
   3161     }
   3162 
   3163   pthread_mutex_unlock (&aq->mutex);
   3164   GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
   3165 }
   3166 
   3167 /* }}}  */
   3168 /* {{{ OpenACC support  */
   3169 
/* Execute an OpenACC kernel, synchronously or asynchronously.  KERNEL is
   launched with argument block DEVADDRS; DIMS[0..2] are the requested
   gang/worker/vector counts (0 = pick a default); AQ is the queue used when
   ASYNC.  TARG_MEM_DESC is currently unused here.  Fires OpenACC profiling
   launch events around the dispatch when a profiling callback is active.  */

static void
gcn_exec (struct kernel_info *kernel,
	  void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
	  struct goacc_asyncqueue *aq)
{
  if (!GOMP_OFFLOAD_can_run (kernel))
    GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");

  /* If we get here then this must be an OpenACC kernel.  */
  kernel->kind = KIND_OPENACC;

  /* Look up this kernel's description in the image's kernel table.
     NOTE(review): this compares name *pointers*, not string contents;
     presumably kernel->name aliases the image's own string — confirm.
     If no entry matches, hsa_kernel_desc stays NULL and is dereferenced
     below without a check — confirm that a match is guaranteed.  */
  struct hsa_kernel_description *hsa_kernel_desc = NULL;
  for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
    {
      struct hsa_kernel_description *d
	= &kernel->module->image_desc->kernel_infos[i];
      if (d->name == kernel->name)
	{
	  hsa_kernel_desc = d;
	  break;
	}
    }

  /* We may have statically-determined dimensions in
     hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
     invocation at runtime in dims[].  We allow static dimensions to take
     priority over dynamic dimensions when present (non-zero).  */
  if (hsa_kernel_desc->oacc_dims[0] > 0)
    dims[0] = hsa_kernel_desc->oacc_dims[0];
  if (hsa_kernel_desc->oacc_dims[1] > 0)
    dims[1] = hsa_kernel_desc->oacc_dims[1];
  if (hsa_kernel_desc->oacc_dims[2] > 0)
    dims[2] = hsa_kernel_desc->oacc_dims[2];

  /* Ideally, when a dimension isn't explicitly specified, we should
     tune it to run 40 (or 32?) threads per CU with no threads getting queued.
     In practice, we tune for peak performance on BabelStream, which
     for OpenACC is currently 32 threads per CU.  */
  if (dims[0] == 0 && dims[1] == 0)
    {
      /* If any of the OpenACC dimensions remain 0 then we get to pick a
	 number.  There isn't really a correct answer for this without a clue
	 about the problem size, so let's do a reasonable number of workers
	 and gangs.  */

      dims[0] = get_cu_count (kernel->agent) * 4; /* Gangs.  */
      dims[1] = 8; /* Workers.  */
    }
  else if (dims[0] == 0 && dims[1] > 0)
    {
      /* Auto-scale the number of gangs with the requested number of workers.  */
      dims[0] = get_cu_count (kernel->agent) * (32 / dims[1]);
    }
  else if (dims[0] > 0 && dims[1] == 0)
    {
      /* Auto-scale the number of workers with the requested number of gangs.  */
      dims[1] = get_cu_count (kernel->agent) * 32 / dims[0];
      if (dims[1] == 0)
	dims[1] = 1;
      if (dims[1] > 16)
	dims[1] = 16;
    }

  /* The incoming dimensions are expressed in terms of gangs, workers, and
     vectors.  The HSA dimensions are expressed in terms of "work-items",
     which means multiples of vector lanes.

     The "grid size" specifies the size of the problem space, and the
     "work-group size" specifies how much of that we want a single compute
     unit to chew on at once.

     The three dimensions do not really correspond to hardware, but the
     important thing is that the HSA runtime will launch as many
     work-groups as it takes to process the entire grid, and each
     work-group will contain as many wave-fronts as it takes to process
     the work-items in that group.

     Essentially, as long as we set the Y dimension to 64 (the number of
     vector lanes in hardware), and the Z group size to the maximum (16),
     then we will get the gangs (X) and workers (Z) launched as we expect.

     The reason for the apparent reversal of vector and worker dimension
     order is to do with the way the run-time distributes work-items across
     v1 and v2.  */
  struct GOMP_kernel_launch_attributes kla =
    {3,
     /* Grid size.  */
     {dims[0], 64, dims[1]},
     /* Work-group size.  */
     {1,       64, 16}
    };

  /* Emit the acc_ev_enqueue_launch_start profiling event if a profiling
     callback is registered for this thread.  */
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info enqueue_launch_event_info;
  acc_api_info *api_info = thr->api_info;
  bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_start;

      enqueue_launch_event_info.launch_event.event_type
	= prof_info->event_type;
      enqueue_launch_event_info.launch_event.valid_bytes
	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
      enqueue_launch_event_info.launch_event.parent_construct
	= acc_construct_parallel;
      enqueue_launch_event_info.launch_event.implicit = 1;
      enqueue_launch_event_info.launch_event.tool_info = NULL;
      enqueue_launch_event_info.launch_event.kernel_name
	= (char *) kernel->name;
      enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
      enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
      enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
	&enqueue_launch_event_info, api_info);
    }

  /* Dispatch: run on this thread, or enqueue for the async thread.  */
  if (!async)
    run_kernel (kernel, devaddrs, &kla, NULL, false);
  else
    queue_push_launch (aq, kernel, devaddrs, &kla);

  /* Emit the matching acc_ev_enqueue_launch_end profiling event.  */
  if (profiling_dispatch_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_end;
      enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
					    &enqueue_launch_event_info,
					    api_info);
    }
}
   3307 
   3308 /* }}}  */
   3309 /* {{{ Generic Plugin API  */
   3310 
/* Plugin entry point: return the accelerator's name, which is "gcn".  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  static const char name[] = "gcn";
  return name;
}
   3318 
   3319 /* Return the specific capabilities the HSA accelerator have.  */
   3320 
   3321 unsigned int
   3322 GOMP_OFFLOAD_get_caps (void)
   3323 {
   3324   /* FIXME: Enable shared memory for APU, but not discrete GPU.  */
   3325   return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
   3326 	    | GOMP_OFFLOAD_CAP_OPENACC_200;
   3327 }
   3328 
   3329 /* Identify as GCN accelerator.  */
   3330 
   3331 int
   3332 GOMP_OFFLOAD_get_type (void)
   3333 {
   3334   return OFFLOAD_TARGET_TYPE_GCN;
   3335 }
   3336 
   3337 /* Return the libgomp version number we're compatible with.  There is
   3338    no requirement for cross-version compatibility.  */
   3339 
   3340 unsigned
   3341 GOMP_OFFLOAD_version (void)
   3342 {
   3343   return GOMP_VERSION;
   3344 }
   3345 
   3346 /* Return the number of GCN devices on the system.  */
   3347 
   3348 int
   3349 GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
   3350 {
   3351   if (!init_hsa_context (true))
   3352     exit (EXIT_FAILURE);
   3353   /* Return -1 if no omp_requires_mask cannot be fulfilled but
   3354      devices were present.  */
   3355   if (hsa_context.agent_count > 0
   3356       && ((omp_requires_mask
   3357 	   & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
   3358 	       | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
   3359     return -1;
   3360   return hsa_context.agent_count;
   3361 }
   3362 
/* Initialize device (agent) number N so that it can be used for computation.
   Return TRUE on success.  */

bool
GOMP_OFFLOAD_init_device (int n)
{
  if (!init_hsa_context (false))
    return false;
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
      return false;
    }
  struct agent_info *agent = &hsa_context.agents[n];

  /* Re-initializing an already-initialized agent is a harmless no-op.  */
  if (agent->initialized)
    return true;

  agent->device_id = n;

  if (pthread_rwlock_init (&agent->module_rwlock, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
      return false;
    }
  if (pthread_mutex_init (&agent->prog_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
      return false;
    }
  if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
      return false;
    }
  if (pthread_mutex_init (&agent->ephemeral_memories_write_lock, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
      return false;
    }
  /* NOTE(review): the locks created above are not destroyed on the failure
     paths below, so a partially-failed initialization leaks them.  */
  agent->async_queues = NULL;
  agent->omp_async_queue = NULL;
  agent->ephemeral_memories_list = NULL;

  /* Query the largest queue the device supports; it is used below when
     creating the synchronous command queue.  */
  uint32_t queue_size;
  hsa_status_t status;
  status = hsa_fns.hsa_agent_get_info_fn (agent->id,
					  HSA_AGENT_INFO_QUEUE_MAX_SIZE,
					  &queue_size);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error requesting maximum queue size of the GCN agent",
		      status);

  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
					  &agent->name);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error querying the name of the agent", status);

  /* Map the agent's name to an ELF ISA code; unsupported devices are
     rejected here rather than failing later at image-load time.  */
  agent->device_isa = isa_code (agent->name);
  if (agent->device_isa == EF_AMDGPU_MACH_UNSUPPORTED)
    return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);

  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
					  &agent->vendor_name);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error querying the vendor name of the agent", status);

  status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
					HSA_QUEUE_TYPE_MULTI,
					hsa_queue_callback, NULL, UINT32_MAX,
					UINT32_MAX, &agent->sync_queue);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error creating command queue", status);

  /* Find a memory region suitable for kernel arguments.  The sentinel
     handle (uint64_t)-1 marks "not found"; HSA_STATUS_INFO_BREAK means the
     iteration stopped early and is therefore not treated as an error.  */
  agent->kernarg_region.handle = (uint64_t) -1;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
						 get_kernarg_memory_region,
						 &agent->kernarg_region);
  if (status != HSA_STATUS_SUCCESS
      && status != HSA_STATUS_INFO_BREAK)
    hsa_error ("Scanning memory regions failed", status);
  if (agent->kernarg_region.handle == (uint64_t) -1)
    {
      GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
			 "arguments");
      return false;
    }
  GCN_DEBUG ("Selected kernel arguments memory region:\n");
  dump_hsa_region (agent->kernarg_region, NULL);

  /* Likewise find a region for general device data.  */
  agent->data_region.handle = (uint64_t) -1;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
						 get_data_memory_region,
						 &agent->data_region);
  if (status != HSA_STATUS_SUCCESS
      && status != HSA_STATUS_INFO_BREAK)
    hsa_error ("Scanning memory regions failed", status);
  if (agent->data_region.handle == (uint64_t) -1)
    {
      GOMP_PLUGIN_error ("Could not find suitable memory region for device "
			 "data");
      return false;
    }
  GCN_DEBUG ("Selected device data memory region:\n");
  dump_hsa_region (agent->data_region, NULL);

  GCN_DEBUG ("GCN agent %d initialized\n", n);

  agent->initialized = true;
  return true;
}
   3474 
/* Load GCN object-code module described by struct gcn_image_desc in
   TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
   If there are any constructors then run them.  If not NULL, REV_FN_TABLE will
   contain the on-device addresses of the functions for reverse offload.  To be
   freed by the caller.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
			 struct addr_pair **target_table,
			 uint64_t **rev_fn_table,
			 uint64_t *host_ind_fn_table)
{
  if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
      return -1;
    }

  struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
  struct agent_info *agent;
  struct addr_pair *pair;
  struct module_info *module;
  struct kernel_info *kernel;
  int kernel_count = image_desc->kernel_count;
  /* Indirect functions only exist in images produced by a toolchain new
     enough to emit them; older versions get a count of zero.  */
  unsigned ind_func_count = GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version)
			      ? image_desc->ind_func_count : 0;
  unsigned var_count = image_desc->global_variable_count;
  /* Currently, "others" is a struct of ICVS.  */
  int other_count = 1;

  agent = get_agent_info (ord);
  if (!agent)
    return -1;

  if (pthread_rwlock_wrlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
      return -1;
    }
  /* NOTE(review): this error path -- and the init_basic_kernel_info
     failure below -- returns while still holding module_rwlock, so a
     subsequent load or unload on this agent would block forever.  */
  if (agent->prog_finalized
      && !destroy_hsa_program (agent))
    return -1;

  GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
  GCN_DEBUG ("Encountered %d indirect functions in an image\n", ind_func_count);
  GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
  GCN_DEBUG ("Expect %d other variables in an image\n", other_count);
  /* The "- 2" reserves no slots for the two internal kernels
     (_init_array/_fini_array), which are filtered out of the table in the
     loop below.  NOTE(review): this assumes both are always present in
     the image; if either were missing the writes below would overrun this
     buffer -- confirm the offload toolchain always emits both.  */
  pair = GOMP_PLUGIN_malloc ((kernel_count + var_count + other_count - 2)
			     * sizeof (struct addr_pair));
  *target_table = pair;
  /* The kernel_info array lives in the module's flexible tail.  */
  module = (struct module_info *)
    GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
				+ kernel_count * sizeof (struct kernel_info));
  module->image_desc = image_desc;
  module->kernel_count = kernel_count;
  module->heap = NULL;
  module->constructors_run_p = false;

  kernel = &module->kernels[0];

  /* Allocate memory for kernel dependencies.  */
  for (unsigned i = 0; i < kernel_count; i++)
    {
      struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
      if (!init_basic_kernel_info (kernel, d, agent, module))
	return -1;
      /* Constructor/destructor kernels are remembered on the module but
	 not exported to libgomp via the address table.  */
      if (strcmp (d->name, "_init_array") == 0)
	module->init_array_func = kernel;
      else if (strcmp (d->name, "_fini_array") == 0)
        module->fini_array_func = kernel;
      else
	{
	  pair->start = (uintptr_t) kernel;
	  pair->end = (uintptr_t) (kernel + 1);
	  pair++;
	}
      kernel++;
    }

  agent->module = module;
  if (pthread_rwlock_unlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
      return -1;
    }

  if (!create_and_finalize_hsa_program (agent))
    return -1;

  if (var_count > 0)
    {
      hsa_status_t status;
      hsa_executable_symbol_t var_symbol;
      /* The device-side .offload_var_table records the device address and
	 size of every global variable in the image.  */
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						     ".offload_var_table",
						     agent->id,
						     0, &var_symbol);

      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not find symbol for variable in the code object",
		   status);

      uint64_t var_table_addr;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
	 &var_table_addr);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable from its symbol", status);

      /* VLA copy of the device table; layout must match the device side.  */
      struct {
	uint64_t addr;
	uint64_t size;
      } var_table[var_count];
      GOMP_OFFLOAD_dev2host (agent->device_id, var_table,
			     (void*)var_table_addr, sizeof (var_table));

      for (unsigned i = 0; i < var_count; i++)
	{
	  pair->start = var_table[i].addr;
	  pair->end = var_table[i].addr + var_table[i].size;
	  GCN_DEBUG ("Found variable at %p with size %lu\n",
		     (void *)var_table[i].addr, var_table[i].size);
	  pair++;
	}
    }

  if (ind_func_count > 0)
    {
      hsa_status_t status;

      /* Read indirect function table from image.  */
      hsa_executable_symbol_t ind_funcs_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						     ".offload_ind_func_table",
						     agent->id,
						     0, &ind_funcs_symbol);

      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not find .offload_ind_func_table symbol in the "
		   "code object", status);

      uint64_t ind_funcs_table_addr;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(ind_funcs_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
	 &ind_funcs_table_addr);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable from its symbol", status);

      uint64_t ind_funcs_table[ind_func_count];
      GOMP_OFFLOAD_dev2host (agent->device_id, ind_funcs_table,
			     (void*) ind_funcs_table_addr,
			     sizeof (ind_funcs_table));

      /* Build host->target address map for indirect functions.  The map
	 is a flat array of (host addr, device addr) pairs followed by a
	 single zero terminator entry.  */
      uint64_t ind_fn_map[ind_func_count * 2 + 1];
      for (unsigned i = 0; i < ind_func_count; i++)
	{
	  ind_fn_map[i * 2] = host_ind_fn_table[i];
	  ind_fn_map[i * 2 + 1] = ind_funcs_table[i];
	  GCN_DEBUG ("Indirect function %d: %lx->%lx\n",
		     i, host_ind_fn_table[i], ind_funcs_table[i]);
	}
      ind_fn_map[ind_func_count * 2] = 0;

      /* Write the map onto the target.  */
      void *map_target_addr
	= GOMP_OFFLOAD_alloc (agent->device_id, sizeof (ind_fn_map));
      GCN_DEBUG ("Allocated indirect map at %p\n", map_target_addr);

      GOMP_OFFLOAD_host2dev (agent->device_id, map_target_addr,
			     (void*) ind_fn_map,
			     sizeof (ind_fn_map));

      /* Write address of the map onto the target.  */
      hsa_executable_symbol_t symbol;

      status
	= hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						XSTRING (GOMP_INDIRECT_ADDR_MAP),
						agent->id, 0, &symbol);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not find GOMP_INDIRECT_ADDR_MAP in code object",
		   status);

      uint64_t varptr;
      uint32_t varsize;

      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
	 &varptr);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable from its symbol", status);
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
	&varsize);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable size from its symbol",
		   status);

      GCN_DEBUG ("Found GOMP_INDIRECT_ADDR_MAP at %lx with size %d\n",
		 varptr, varsize);

      GOMP_OFFLOAD_host2dev (agent->device_id, (void *) varptr,
			     &map_target_addr,
			     sizeof (map_target_addr));
    }

  GCN_DEBUG ("Looking for variable %s\n", XSTRING (GOMP_ADDITIONAL_ICVS));

  /* The additional-ICVs block is optional; when absent, a zero-length
     table entry is recorded so libgomp can tell it was not found.  */
  hsa_status_t status;
  hsa_executable_symbol_t var_symbol;
  status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						 XSTRING (GOMP_ADDITIONAL_ICVS),
						 agent->id, 0, &var_symbol);
  if (status == HSA_STATUS_SUCCESS)
    {
      uint64_t varptr;
      uint32_t varsize;

      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
	 &varptr);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable from its symbol", status);
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
	 &varsize);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable size from its symbol",
		   status);

      pair->start = varptr;
      pair->end = varptr + varsize;
    }
  else
    {
      /* The variable was not in this image.  */
      GCN_DEBUG ("Variable not found in image: %s\n",
		 XSTRING (GOMP_ADDITIONAL_ICVS));
      pair->start = pair->end = 0;
    }

  /* Ensure that constructors are run first.  */
  struct GOMP_kernel_launch_attributes kla =
    { 3,
      /* Grid size.  */
      { 1, 64, 1 },
      /* Work-group size.  */
      { 1, 64, 1 }
    };

  if (module->init_array_func)
    {
      init_kernel (module->init_array_func);
      run_kernel (module->init_array_func, NULL, &kla, NULL, false);
    }
  module->constructors_run_p = true;

  /* Don't report kernels that libgomp need not know about.  */
  if (module->init_array_func)
    kernel_count--;
  if (module->fini_array_func)
    kernel_count--;

  if (rev_fn_table != NULL && kernel_count == 0)
    *rev_fn_table = NULL;
  else if (rev_fn_table != NULL)
    {
      hsa_status_t status;
      hsa_executable_symbol_t var_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						     ".offload_func_table",
						     agent->id, 0, &var_symbol);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not find symbol for variable in the code object",
		   status);
      uint64_t fn_table_addr;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
	 &fn_table_addr);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable from its symbol", status);
      /* Ownership passes to the caller (see function comment).  */
      *rev_fn_table = GOMP_PLUGIN_malloc (kernel_count * sizeof (uint64_t));
      GOMP_OFFLOAD_dev2host (agent->device_id, *rev_fn_table,
			     (void*) fn_table_addr,
			     kernel_count * sizeof (uint64_t));
    }

  /* Number of addr_pair entries written to *TARGET_TABLE.  */
  return kernel_count + var_count + other_count;
}
   3767 
   3768 /* Unload GCN object-code module described by struct gcn_image_desc in
   3769    TARGET_DATA from agent number N.  Return TRUE on success.  */
   3770 
   3771 bool
   3772 GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
   3773 {
   3774   if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
   3775     {
   3776       GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
   3777 			 " (expected %u, received %u)",
   3778 			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
   3779       return false;
   3780     }
   3781 
   3782   struct agent_info *agent;
   3783   agent = get_agent_info (n);
   3784   if (!agent)
   3785     return false;
   3786 
   3787   if (pthread_rwlock_wrlock (&agent->module_rwlock))
   3788     {
   3789       GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
   3790       return false;
   3791     }
   3792 
   3793   if (!agent->module || agent->module->image_desc != target_data)
   3794     {
   3795       GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
   3796 			 "loaded before");
   3797       return false;
   3798     }
   3799 
   3800   if (!destroy_module (agent->module, true))
   3801     return false;
   3802   free (agent->module);
   3803   agent->module = NULL;
   3804   if (!destroy_hsa_program (agent))
   3805     return false;
   3806   if (pthread_rwlock_unlock (&agent->module_rwlock))
   3807     {
   3808       GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
   3809       return false;
   3810     }
   3811   return true;
   3812 }
   3813 
/* Deinitialize all information and status associated with agent number N.  We
   do not attempt any synchronization, assuming the user and libgomp will not
   attempt deinitialization of a device that is in any way being used at the
   same time.  Return TRUE on success.  */

bool
GOMP_OFFLOAD_fini_device (int n)
{
  struct agent_info *agent = get_agent_info (n);
  if (!agent)
    return false;

  /* A device that was never initialized (or already finalized) needs no
     teardown.  */
  if (!agent->initialized)
    return true;

  /* Tear down the async queue, module, ephemeral memories and HSA program
     before destroying the command queue and, last, the locks.  */
  if (agent->omp_async_queue)
    {
      GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
      agent->omp_async_queue = NULL;
    }

  if (agent->module)
    {
      if (!destroy_module (agent->module, false))
	return false;
      free (agent->module);
      agent->module = NULL;
    }

  if (!destroy_ephemeral_memories (agent))
    return false;

  if (!destroy_hsa_program (agent))
    return false;

  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error destroying command queue", status);

  if (pthread_mutex_destroy (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
      return false;
    }
  if (pthread_rwlock_destroy (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
      return false;
    }

  if (pthread_mutex_destroy (&agent->async_queues_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
      return false;
    }
  if (pthread_mutex_destroy (&agent->ephemeral_memories_write_lock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
      return false;
    }
  /* Allow a later GOMP_OFFLOAD_init_device to set the agent up again.  */
  agent->initialized = false;
  return true;
}
   3877 
   3878 /* Return true if the HSA runtime can run function FN_PTR.  */
   3879 
   3880 bool
   3881 GOMP_OFFLOAD_can_run (void *fn_ptr)
   3882 {
   3883   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
   3884 
   3885   init_kernel (kernel);
   3886   if (kernel->initialization_failed)
   3887     GOMP_PLUGIN_fatal ("kernel initialization failed");
   3888 
   3889   return true;
   3890 }
   3891 
/* Allocate SIZE bytes of device memory on device N, delegating to the
   per-agent allocator.  */

void *
GOMP_OFFLOAD_alloc (int n, size_t size)
{
  return alloc_by_agent (get_agent_info (n), size);
}
   3900 
   3901 /* Free memory from device N.  */
   3902 
   3903 bool
   3904 GOMP_OFFLOAD_free (int device, void *ptr)
   3905 {
   3906   GCN_DEBUG ("Freeing memory on device %d\n", device);
   3907 
   3908   hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
   3909   if (status != HSA_STATUS_SUCCESS)
   3910     {
   3911       hsa_error ("Could not free device memory", status);
   3912       return false;
   3913     }
   3914 
   3915   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
   3916   bool profiling_dispatch_p
   3917     = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
   3918   if (profiling_dispatch_p)
   3919     {
   3920       acc_prof_info *prof_info = thr->prof_info;
   3921       acc_event_info data_event_info;
   3922       acc_api_info *api_info = thr->api_info;
   3923 
   3924       prof_info->event_type = acc_ev_free;
   3925 
   3926       data_event_info.data_event.event_type = prof_info->event_type;
   3927       data_event_info.data_event.valid_bytes
   3928 	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
   3929       data_event_info.data_event.parent_construct
   3930 	= acc_construct_parallel;
   3931       data_event_info.data_event.implicit = 1;
   3932       data_event_info.data_event.tool_info = NULL;
   3933       data_event_info.data_event.var_name = NULL;
   3934       data_event_info.data_event.bytes = 0;
   3935       data_event_info.data_event.host_ptr = NULL;
   3936       data_event_info.data_event.device_ptr = (void *) ptr;
   3937 
   3938       api_info->device_api = acc_device_api_other;
   3939 
   3940       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
   3941 					    api_info);
   3942     }
   3943 
   3944   return true;
   3945 }
   3946 
   3947 /* Copy data from DEVICE to host.  */
   3948 
   3949 bool
   3950 GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
   3951 {
   3952   GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
   3953 	     src, dst);
   3954   hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
   3955   if (status != HSA_STATUS_SUCCESS)
   3956     GOMP_PLUGIN_error ("memory copy failed");
   3957   return true;
   3958 }
   3959 
/* Copy N bytes from host address SRC to device DEVICE address DST.
   Error handling is delegated to hsa_memory_copy_wrapper; this function
   itself always reports success.  */

bool
GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n",
	     n, src, device, dst);

  hsa_memory_copy_wrapper (dst, src, n);

  return true;
}
   3970 
   3971 /* Copy data within DEVICE.  Do the copy asynchronously, if appropriate.  */
   3972 
   3973 bool
   3974 GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
   3975 {
   3976   struct gcn_thread *thread_data = gcn_thread ();
   3977 
   3978   if (thread_data && !async_synchronous_p (thread_data->async))
   3979     {
   3980       struct agent_info *agent = get_agent_info (device);
   3981       maybe_init_omp_async (agent);
   3982       queue_push_copy (agent->omp_async_queue, dst, src, n);
   3983       return true;
   3984     }
   3985 
   3986   GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
   3987 	     device, src, device, dst);
   3988   hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
   3989   if (status != HSA_STATUS_SUCCESS)
   3990     GOMP_PLUGIN_error ("memory copy failed");
   3991   return true;
   3992 }
   3993 
/* Here <quantity>_size refers to <quantity> multiplied by size -- i.e.
   measured in bytes.  So we have:

   dim1_size: number of bytes to copy on innermost dimension ("row")
   dim0_len: number of rows to copy
   dst: base pointer for destination of copy
   dst_offset1_size: innermost row offset (for dest), in bytes
   dst_offset0_len: offset, number of rows (for dest)
   dst_dim1_size: whole-array dest row length, in bytes (pitch)
   src: base pointer for source of copy
   src_offset1_size: innermost row offset (for source), in bytes
   src_offset0_len: offset, number of rows (for source)
   src_dim1_size: whole-array source row length, in bytes (pitch)

   Returns 1 on success, or -1 when this accelerated path cannot be used
   (missing HSA entry points, misalignment, unsupported direction, or a
   runtime failure) -- in which case the caller is expected to fall back
   to a slower generic copy.  */

int
GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
		       size_t dim0_len, void *dst, size_t dst_offset1_size,
		       size_t dst_offset0_len, size_t dst_dim1_size,
		       const void *src, size_t src_offset1_size,
		       size_t src_offset0_len, size_t src_dim1_size)
{
  /* The rectangular-copy extension is optional in the HSA runtime.  */
  if (!hsa_fns.hsa_amd_memory_lock_fn
      || !hsa_fns.hsa_amd_memory_unlock_fn
      || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
    return -1;

  /* GCN hardware requires 4-byte alignment for base addresses & pitches.  Bail
     out quietly if we have anything oddly-aligned rather than letting the
     driver raise an error.  */
  if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
    return -1;

  if ((dst_dim1_size & 3) != 0 || (src_dim1_size & 3) != 0)
    return -1;

  /* Only handle host to device or device to host transfers here.  */
  if ((dst_ord == -1 && src_ord == -1)
      || (dst_ord != -1 && src_ord != -1))
    return -1;

  /* Ordinal -1 denotes the host side.  */
  hsa_amd_copy_direction_t dir
    = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
  hsa_agent_t copy_agent;

  /* We need to pin (lock) host memory before we start the transfer.  Try to
     lock the minimum size necessary, i.e. using partial first/last rows of the
     whole array.  Something like this:

	 rows -->
	 ..............
     c | ..#######+++++ <- first row apart from {src,dst}_offset1_size
     o | ++#######+++++ <- whole row
     l | ++#######+++++ <-     "
     s v ++#######..... <- last row apart from trailing remainder
	 ..............

     We could split very large transfers into several rectangular copies, but
     that is unimplemented for now.  */

  size_t bounded_size_host, first_elem_offset_host;
  void *host_ptr;
  if (dir == hsaHostToDevice)
    {
      bounded_size_host = src_dim1_size * (dim0_len - 1) + dim1_size;
      first_elem_offset_host = src_offset0_len * src_dim1_size
			       + src_offset1_size;
      host_ptr = (void *) src;
      struct agent_info *agent = get_agent_info (dst_ord);
      copy_agent = agent->id;
    }
  else
    {
      bounded_size_host = dst_dim1_size * (dim0_len - 1) + dim1_size;
      first_elem_offset_host = dst_offset0_len * dst_dim1_size
			       + dst_offset1_size;
      host_ptr = dst;
      struct agent_info *agent = get_agent_info (src_ord);
      copy_agent = agent->id;
    }

  void *agent_ptr;

  /* Note: "host_ptr + first_elem_offset_host" is byte arithmetic on a
     void* -- a GCC extension (sizeof (void) == 1).  */
  hsa_status_t status
    = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
				      bounded_size_host, NULL, 0, &agent_ptr);
  /* We can't lock the host memory: don't give up though, we might still be
     able to use the slow path in our caller.  So, don't make this an
     error.  */
  if (status != HSA_STATUS_SUCCESS)
    return -1;

  hsa_pitched_ptr_t dstpp, srcpp;
  hsa_dim3_t dst_offsets, src_offsets, ranges;

  /* 1 = success; set to -1 on any failure below, but keep going so the
     pinned host memory is unlocked before returning.  */
  int retval = 1;

  hsa_signal_t completion_signal;
  status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
  if (status != HSA_STATUS_SUCCESS)
    {
      retval = -1;
      goto unlock;
    }

  /* The pinned pointer is re-based so that the offsets below address the
     same elements as they would in the unpinned array.  */
  if (dir == hsaHostToDevice)
    {
      srcpp.base = agent_ptr - first_elem_offset_host;
      dstpp.base = dst;
    }
  else
    {
      srcpp.base = (void *) src;
      dstpp.base = agent_ptr - first_elem_offset_host;
    }

  srcpp.pitch = src_dim1_size;
  srcpp.slice = 0;

  src_offsets.x = src_offset1_size;
  src_offsets.y = src_offset0_len;
  src_offsets.z = 0;

  dstpp.pitch = dst_dim1_size;
  dstpp.slice = 0;

  dst_offsets.x = dst_offset1_size;
  dst_offsets.y = dst_offset0_len;
  dst_offsets.z = 0;

  ranges.x = dim1_size;
  ranges.y = dim0_len;
  ranges.z = 1;

  status
    = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
						 &src_offsets, &ranges,
						 copy_agent, dir, 0, NULL,
						 completion_signal);
  /* If the rectangular copy fails, we might still be able to use the slow
     path.  We need to unlock the host memory though, so don't return
     immediately.  */
  if (status != HSA_STATUS_SUCCESS)
    retval = -1;
  else
    hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
					HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
					HSA_WAIT_STATE_ACTIVE);

  hsa_fns.hsa_signal_destroy_fn (completion_signal);

unlock:
  status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not unlock host memory", status);

  return retval;
}
   4152 
   4153 /* As above, <quantity>_size refers to <quantity> multiplied by size -- i.e.
   4154    measured in bytes.  So we have:
   4155 
   4156    dim2_size: number of bytes to copy on innermost dimension ("row")
   4157    dim1_len: number of rows per slice to copy
   4158    dim0_len: number of slices to copy
   4159    dst: base pointer for destination of copy
   4160    dst_offset2_size: innermost row offset (for dest), in bytes
   4161    dst_offset1_len: offset, number of rows (for dest)
   4162    dst_offset0_len: offset, number of slices (for dest)
   4163    dst_dim2_size: whole-array dest row length, in bytes (pitch)
   4164    dst_dim1_len: whole-array number of rows in slice (for dest)
   4165    src: base pointer for source of copy
   4166    src_offset2_size: innermost row offset (for source), in bytes
   4167    src_offset1_len: offset, number of rows (for source)
   4168    src_offset0_len: offset, number of slices (for source)
   4169    src_dim2_size: whole-array source row length, in bytes (pitch)
   4170    src_dim1_len: whole-array number of rows in slice (for source)
   4171 */
   4172 
int
GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
		       size_t dim1_len, size_t dim0_len, void *dst,
		       size_t dst_offset2_size, size_t dst_offset1_len,
		       size_t dst_offset0_len, size_t dst_dim2_size,
		       size_t dst_dim1_len, const void *src,
		       size_t src_offset2_size, size_t src_offset1_len,
		       size_t src_offset0_len, size_t src_dim2_size,
		       size_t src_dim1_len)
{
  /* Without the optional lock/unlock/rectangular-copy entry points we
     cannot take the fast path at all; -1 tells the caller to fall back
     to its generic (slow) copy path.  */
  if (!hsa_fns.hsa_amd_memory_lock_fn
      || !hsa_fns.hsa_amd_memory_unlock_fn
      || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
    return -1;

  /* GCN hardware requires 4-byte alignment for base addresses & pitches.  Bail
     out quietly if we have anything oddly-aligned rather than letting the
     driver raise an error.  */
  if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
    return -1;

  if ((dst_dim2_size & 3) != 0 || (src_dim2_size & 3) != 0)
    return -1;

  /* Only handle host to device or device to host transfers here.  */
  if ((dst_ord == -1 && src_ord == -1)
      || (dst_ord != -1 && src_ord != -1))
    return -1;

  /* Device ordinal -1 denotes the host side.  */
  hsa_amd_copy_direction_t dir
    = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
  hsa_agent_t copy_agent;

  /* We need to pin (lock) host memory before we start the transfer.  Try to
     lock the minimum size necessary, i.e. using partial first/last slices of
     the whole 3D array.  Something like this:

	 slice 0:          slice 1:	     slice 2:
	     __________        __________        __________
       ^    /+++++++++/    :  /+++++++++/    :	/         /
    column /+++##++++/|    | /+++##++++/|    | /+++##    /  # = subarray
     /	  /   ##++++/ |    |/+++##++++/ |    |/+++##++++/   + = area to pin
	 /_________/  :    /_________/  :    /_________/
	row --->

     We could split very large transfers into several rectangular copies, but
     that is unimplemented for now.  */

  size_t bounded_size_host, first_elem_offset_host;
  void *host_ptr;
  if (dir == hsaHostToDevice)
    {
      /* Host side is the source: BOUNDED_SIZE_HOST is the byte distance
	 from the first to one-past-the-last transferred element within
	 the (pitched) host array; FIRST_ELEM_OFFSET_HOST is the byte
	 offset of the first transferred element from the array base.  */
      size_t slice_bytes = src_dim2_size * src_dim1_len;
      bounded_size_host = slice_bytes * (dim0_len - 1)
			  + src_dim2_size * (dim1_len - 1)
			  + dim2_size;
      first_elem_offset_host = src_offset0_len * slice_bytes
			       + src_offset1_len * src_dim2_size
			       + src_offset2_size;
      host_ptr = (void *) src;
      struct agent_info *agent = get_agent_info (dst_ord);
      copy_agent = agent->id;
    }
  else
    {
      /* Same computation, but the host side is the destination.  */
      size_t slice_bytes = dst_dim2_size * dst_dim1_len;
      bounded_size_host = slice_bytes * (dim0_len - 1)
			  + dst_dim2_size * (dim1_len - 1)
			  + dim2_size;
      first_elem_offset_host = dst_offset0_len * slice_bytes
			       + dst_offset1_len * dst_dim2_size
			       + dst_offset2_size;
      host_ptr = dst;
      struct agent_info *agent = get_agent_info (src_ord);
      copy_agent = agent->id;
    }

  void *agent_ptr;

  /* Pin only the bounding span actually touched by the transfer.  */
  hsa_status_t status
    = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
				      bounded_size_host, NULL, 0, &agent_ptr);
  /* We can't lock the host memory: don't give up though, we might still be
     able to use the slow path in our caller (maybe even with iterated memcpy2d
     calls).  So, don't make this an error.  */
  if (status != HSA_STATUS_SUCCESS)
    return -1;

  hsa_pitched_ptr_t dstpp, srcpp;
  hsa_dim3_t dst_offsets, src_offsets, ranges;

  /* Optimistically assume success; any failure below downgrades this to
     -1 so the caller can retry via the slow path.  */
  int retval = 1;

  /* The signal starts at 1 and is decremented when the async copy
     completes; we wait for it to drop below 1 further down.  */
  hsa_signal_t completion_signal;
  status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
  if (status != HSA_STATUS_SUCCESS)
    {
      retval = -1;
      goto unlock;
    }

  /* AGENT_PTR addresses the first locked byte, i.e. the element at
     FIRST_ELEM_OFFSET_HOST; subtract that offset again so that the x/y/z
     offsets set below select the same element through the host-side base
     pointer and the agent-side base pointer alike.  */
  if (dir == hsaHostToDevice)
    {
      srcpp.base = agent_ptr - first_elem_offset_host;
      dstpp.base = dst;
    }
  else
    {
      srcpp.base = (void *) src;
      dstpp.base = agent_ptr - first_elem_offset_host;
    }

  /* Pitch is measured in bytes.  */
  srcpp.pitch = src_dim2_size;
  /* Slice is also measured in bytes (i.e. total per-slice).  */
  srcpp.slice = src_dim2_size * src_dim1_len;

  /* x is in bytes, y/z are in rows/slices, matching the parameters.  */
  src_offsets.x = src_offset2_size;
  src_offsets.y = src_offset1_len;
  src_offsets.z = src_offset0_len;

  /* As above.  */
  dstpp.pitch = dst_dim2_size;
  dstpp.slice = dst_dim2_size * dst_dim1_len;

  dst_offsets.x = dst_offset2_size;
  dst_offsets.y = dst_offset1_len;
  dst_offsets.z = dst_offset0_len;

  ranges.x = dim2_size;
  ranges.y = dim1_len;
  ranges.z = dim0_len;

  status
    = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
						 &src_offsets, &ranges,
						 copy_agent, dir, 0, NULL,
						 completion_signal);
  /* If the rectangular copy fails, we might still be able to use the slow
     path.  We need to unlock the host memory though, so don't return
     immediately.  */
  if (status != HSA_STATUS_SUCCESS)
    retval = -1;
  else
    {
      /* Block until the signal drops below 1, i.e. the copy completed.
	 A negative signal value reports failure.  */
      hsa_signal_value_t sv
	= hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
					      HSA_SIGNAL_CONDITION_LT, 1,
					      UINT64_MAX,
					      HSA_WAIT_STATE_ACTIVE);
      if (sv < 0)
	{
	  GCN_WARNING ("async copy rect failure");
	  retval = -1;
	}
    }

  hsa_fns.hsa_signal_destroy_fn (completion_signal);

unlock:
  /* Always unpin the host memory, whether the copy worked or not.  */
  status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not unlock host memory", status);

  return retval;
}
   4339 
   4340 /* }}}  */
   4341 /* {{{ OpenMP Plugin API  */
   4342 
   4343 /* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
   4344    in VARS as a parameter.  The kernel is identified by FN_PTR which must point
   4345    to a kernel_info structure, and must have previously been loaded to the
   4346    specified device.  */
   4347 
   4348 void
   4349 GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
   4350 {
   4351   struct agent_info *agent = get_agent_info (device);
   4352   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
   4353   struct GOMP_kernel_launch_attributes def;
   4354   struct GOMP_kernel_launch_attributes *kla;
   4355   assert (agent == kernel->agent);
   4356 
   4357   /* If we get here then the kernel must be OpenMP.  */
   4358   kernel->kind = KIND_OPENMP;
   4359 
   4360   if (!parse_target_attributes (args, &def, &kla, agent))
   4361     {
   4362       GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
   4363       return;
   4364     }
   4365   run_kernel (kernel, vars, kla, NULL, false);
   4366 }
   4367 
   4368 /* Run an asynchronous OpenMP kernel on DEVICE.  This is similar to
   4369    GOMP_OFFLOAD_run except that the launch is queued and there is a call to
   4370    GOMP_PLUGIN_target_task_completion when it has finished.  */
   4371 
   4372 void
   4373 GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
   4374 			void **args, void *async_data)
   4375 {
   4376   GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
   4377   struct agent_info *agent = get_agent_info (device);
   4378   struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
   4379   struct GOMP_kernel_launch_attributes def;
   4380   struct GOMP_kernel_launch_attributes *kla;
   4381   assert (agent == kernel->agent);
   4382 
   4383   /* If we get here then the kernel must be OpenMP.  */
   4384   kernel->kind = KIND_OPENMP;
   4385 
   4386   if (!parse_target_attributes (args, &def, &kla, agent))
   4387     {
   4388       GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
   4389       return;
   4390     }
   4391 
   4392   maybe_init_omp_async (agent);
   4393   queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
   4394   queue_push_callback (agent->omp_async_queue,
   4395 		       GOMP_PLUGIN_target_task_completion, async_data);
   4396 }
   4397 
   4398 /* }}} */
   4399 /* {{{ OpenACC Plugin API  */
   4400 
   4401 /* Run a synchronous OpenACC kernel.  The device number is inferred from the
   4402    already-loaded KERNEL.  */
   4403 
   4404 void
   4405 GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *),
   4406 			   size_t mapnum __attribute__((unused)),
   4407 			   void **hostaddrs __attribute__((unused)),
   4408 			   void **devaddrs, unsigned *dims,
   4409 			   void *targ_mem_desc)
   4410 {
   4411   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
   4412 
   4413   gcn_exec (kernel, devaddrs, dims, targ_mem_desc, false, NULL);
   4414 }
   4415 
   4416 /* Run an asynchronous OpenACC kernel on the specified queue.  */
   4417 
   4418 void
   4419 GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *),
   4420 				 size_t mapnum __attribute__((unused)),
   4421 				 void **hostaddrs __attribute__((unused)),
   4422 				 void **devaddrs,
   4423 				 unsigned *dims, void *targ_mem_desc,
   4424 				 struct goacc_asyncqueue *aq)
   4425 {
   4426   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
   4427 
   4428   gcn_exec (kernel, devaddrs, dims, targ_mem_desc, true, aq);
   4429 }
   4430 
   4431 /* Create a new asynchronous thread and queue for running future kernels.  */
   4432 
   4433 struct goacc_asyncqueue *
   4434 GOMP_OFFLOAD_openacc_async_construct (int device)
   4435 {
   4436   struct agent_info *agent = get_agent_info (device);
   4437 
   4438   pthread_mutex_lock (&agent->async_queues_mutex);
   4439 
   4440   struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
   4441   aq->agent = get_agent_info (device);
   4442   aq->prev = NULL;
   4443   aq->next = agent->async_queues;
   4444   if (aq->next)
   4445     {
   4446       aq->next->prev = aq;
   4447       aq->id = aq->next->id + 1;
   4448     }
   4449   else
   4450     aq->id = 1;
   4451   agent->async_queues = aq;
   4452 
   4453   aq->queue_first = 0;
   4454   aq->queue_n = 0;
   4455   aq->drain_queue_stop = 0;
   4456 
   4457   if (pthread_mutex_init (&aq->mutex, NULL))
   4458     {
   4459       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
   4460       return false;
   4461     }
   4462   if (pthread_cond_init (&aq->queue_cond_in, NULL))
   4463     {
   4464       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
   4465       return false;
   4466     }
   4467   if (pthread_cond_init (&aq->queue_cond_out, NULL))
   4468     {
   4469       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
   4470       return false;
   4471     }
   4472 
   4473   hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
   4474 						     ASYNC_QUEUE_SIZE,
   4475 						     HSA_QUEUE_TYPE_MULTI,
   4476 						     hsa_queue_callback, NULL,
   4477 						     UINT32_MAX, UINT32_MAX,
   4478 						     &aq->hsa_queue);
   4479   if (status != HSA_STATUS_SUCCESS)
   4480     hsa_fatal ("Error creating command queue", status);
   4481 
   4482   int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
   4483   if (err != 0)
   4484     GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
   4485 		       strerror (err));
   4486   GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
   4487 	     aq->id);
   4488 
   4489   pthread_mutex_unlock (&agent->async_queues_mutex);
   4490 
   4491   return aq;
   4492 }
   4493 
   4494 /* Destroy an existing asynchronous thread and queue.  Waits for any
   4495    currently-running task to complete, but cancels any queued tasks.  */
   4496 
   4497 bool
   4498 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
   4499 {
   4500   struct agent_info *agent = aq->agent;
   4501 
   4502   finalize_async_thread (aq);
   4503 
   4504   pthread_mutex_lock (&agent->async_queues_mutex);
   4505 
   4506   int err;
   4507   if ((err = pthread_mutex_destroy (&aq->mutex)))
   4508     {
   4509       GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
   4510       goto fail;
   4511     }
   4512   if (pthread_cond_destroy (&aq->queue_cond_in))
   4513     {
   4514       GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
   4515       goto fail;
   4516     }
   4517   if (pthread_cond_destroy (&aq->queue_cond_out))
   4518     {
   4519       GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
   4520       goto fail;
   4521     }
   4522   hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
   4523   if (status != HSA_STATUS_SUCCESS)
   4524     {
   4525       hsa_error ("Error destroying command queue", status);
   4526       goto fail;
   4527     }
   4528 
   4529   if (aq->prev)
   4530     aq->prev->next = aq->next;
   4531   if (aq->next)
   4532     aq->next->prev = aq->prev;
   4533   if (agent->async_queues == aq)
   4534     agent->async_queues = aq->next;
   4535 
   4536   GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);
   4537 
   4538   free (aq);
   4539   pthread_mutex_unlock (&agent->async_queues_mutex);
   4540   return true;
   4541 
   4542 fail:
   4543   pthread_mutex_unlock (&agent->async_queues_mutex);
   4544   return false;
   4545 }
   4546 
   4547 /* Return true if the specified async queue is currently empty.  */
   4548 
int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  /* Non-zero when no tasks remain queued on AQ.  */
  int is_empty = queue_empty (aq);
  return is_empty;
}
   4554 
   4555 /* Block until the specified queue has executed all its tasks and the
   4556    queue is empty.  */
   4557 
bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  /* Drain AQ completely; this blocks until every queued task has run.  */
  wait_queue (aq);
  return true;
}
   4564 
   4565 /* Add a serialization point across two async queues. Any new tasks added to
   4566    AQ2, after this call, will not run until all tasks on AQ1, at the time
   4567    of this call, have completed.  */
   4568 
bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
				      struct goacc_asyncqueue *aq2)
{
  /* A queue is trivially serialized with itself.  */
  if (aq1 == aq2)
    return true;

  /* Push a placeholder onto AQ1 that completes when all work queued on
     AQ1 so far has finished, then make AQ2 wait on that placeholder
     before running anything queued after this point.  */
  struct placeholder *marker = queue_push_placeholder (aq1);
  queue_push_asyncwait (aq2, marker);
  return true;
}
   4582 
   4583 /* Add an opaque callback to the given async queue.  */
   4584 
void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
					   void (*fn) (void *), void *data)
{
  /* The callback is just another queue entry: FN (DATA) runs once
     everything queued on AQ before it has completed.  */
  queue_push_callback (aq, fn, data);
}
   4591 
   4592 /* Queue up an asynchronous data copy from host to DEVICE.  */
   4593 
   4594 bool
   4595 GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
   4596 				     size_t n, struct goacc_asyncqueue *aq)
   4597 {
   4598   struct agent_info *agent = get_agent_info (device);
   4599   assert (agent == aq->agent);
   4600   queue_push_copy (aq, dst, src, n);
   4601   return true;
   4602 }
   4603 
   4604 /* Queue up an asynchronous data copy from DEVICE to host.  */
   4605 
   4606 bool
   4607 GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
   4608 				     size_t n, struct goacc_asyncqueue *aq)
   4609 {
   4610   struct agent_info *agent = get_agent_info (device);
   4611   assert (agent == aq->agent);
   4612   queue_push_copy (aq, dst, src, n);
   4613   return true;
   4614 }
   4615 
   4616 union goacc_property_value
   4617 GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
   4618 {
   4619   struct agent_info *agent = get_agent_info (device);
   4620 
   4621   union goacc_property_value propval = { .val = 0 };
   4622 
   4623   switch (prop)
   4624     {
   4625     case GOACC_PROPERTY_FREE_MEMORY:
   4626       /* Not supported. */
   4627       break;
   4628     case GOACC_PROPERTY_MEMORY:
   4629       {
   4630 	size_t size;
   4631 	hsa_region_t region = agent->data_region;
   4632 	hsa_status_t status =
   4633 	  hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
   4634 	if (status == HSA_STATUS_SUCCESS)
   4635 	  propval.val = size;
   4636 	break;
   4637       }
   4638     case GOACC_PROPERTY_NAME:
   4639       propval.ptr = agent->name;
   4640       break;
   4641     case GOACC_PROPERTY_VENDOR:
   4642       propval.ptr = agent->vendor_name;
   4643       break;
   4644     case GOACC_PROPERTY_DRIVER:
   4645       propval.ptr = hsa_context.driver_version_s;
   4646       break;
   4647     }
   4648 
   4649   return propval;
   4650 }
   4651 
   4652 /* Set up plugin-specific thread-local-data (host-side).  */
   4653 
   4654 void *
   4655 GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
   4656 {
   4657   struct gcn_thread *thread_data
   4658     = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));
   4659 
   4660   thread_data->async = GOMP_ASYNC_SYNC;
   4661 
   4662   return (void *) thread_data;
   4663 }
   4664 
   4665 /* Clean up plugin-specific thread-local-data.  */
   4666 
void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  /* DATA is the struct gcn_thread allocated by
     GOMP_OFFLOAD_openacc_create_thread_data; releasing it is all the
     cleanup required (free (NULL) is a harmless no-op).  */
  free (data);
}
   4672 
   4673 /* }}} */
   4674