Home | History | Annotate | Line # | Download | only in plugin
plugin-gcn.c revision 1.1
      1 /* Plugin for AMD GCN execution.
      2 
      3    Copyright (C) 2013-2020 Free Software Foundation, Inc.
      4 
      5    Contributed by Mentor Embedded
      6 
      7    This file is part of the GNU Offloading and Multi Processing Library
      8    (libgomp).
      9 
     10    Libgomp is free software; you can redistribute it and/or modify it
     11    under the terms of the GNU General Public License as published by
     12    the Free Software Foundation; either version 3, or (at your option)
     13    any later version.
     14 
     15    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
     16    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
     17    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
     18    more details.
     19 
     20    Under Section 7 of GPL version 3, you are granted additional
     21    permissions described in the GCC Runtime Library Exception, version
     22    3.1, as published by the Free Software Foundation.
     23 
     24    You should have received a copy of the GNU General Public License and
     25    a copy of the GCC Runtime Library Exception along with this program;
     26    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     27    <http://www.gnu.org/licenses/>.  */
     28 
     29 /* {{{ Includes and defines  */
     30 
     31 #include "config.h"
     32 #include <stdio.h>
     33 #include <stdlib.h>
     34 #include <string.h>
     35 #include <pthread.h>
     36 #include <inttypes.h>
     37 #include <stdbool.h>
     38 #include <limits.h>
     39 #include <hsa.h>
     40 #include <dlfcn.h>
     41 #include <signal.h>
     42 #include "libgomp-plugin.h"
     43 #include "gomp-constants.h"
     44 #include <elf.h>
     45 #include "oacc-plugin.h"
     46 #include "oacc-int.h"
     47 #include <assert.h>
     48 
     49 /* Additional definitions not in HSA 1.1.
     50    FIXME: this needs to be updated in hsa.h for upstream, but the only source
     51           right now is the ROCr source which may cause license issues.  */
     52 #define HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT 0xA002
     53 
/* These probably won't be in elf.h for a while.

   AMDGPU ELF relocation types; the comments give the relocation
   computation (S = symbol, A = addend, P = place, G/GOT = GOT entry,
   B = image base).  */
#define R_AMDGPU_NONE		0
#define R_AMDGPU_ABS32_LO	1	/* (S + A) & 0xFFFFFFFF  */
#define R_AMDGPU_ABS32_HI	2	/* (S + A) >> 32  */
#define R_AMDGPU_ABS64		3	/* S + A  */
#define R_AMDGPU_REL32		4	/* S + A - P  */
#define R_AMDGPU_REL64		5	/* S + A - P  */
#define R_AMDGPU_ABS32		6	/* S + A  */
#define R_AMDGPU_GOTPCREL	7	/* G + GOT + A - P  */
#define R_AMDGPU_GOTPCREL32_LO	8	/* (G + GOT + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_GOTPCREL32_HI	9	/* (G + GOT + A - P) >> 32  */
#define R_AMDGPU_REL32_LO	10	/* (S + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_REL32_HI	11	/* (S + A - P) >> 32  */
/* Relocation type 12 is reserved by the ABI.  Deliberately not a macro:
   the previous "#define reserved 12" would have silently rewritten any
   later use of the ordinary identifier "reserved".  */
#define R_AMDGPU_RELATIVE64	13	/* B + A  */
     69 
/* GCN specific definitions for asynchronous queues.  */

/* Number of entries in each goacc_asyncqueue ring buffer.  */
#define ASYNC_QUEUE_SIZE 64
/* Debug aid: presumably forces queued operations to be drained
   synchronously when true — confirm at the drain-thread use sites.  */
#define DRAIN_QUEUE_SYNCHRONOUS_P false
/* Extra tracing switches for the queue machinery; 0 disables.  */
#define DEBUG_QUEUES 0
#define DEBUG_THREAD_SLEEP 0
#define DEBUG_THREAD_SIGNAL 0

/* Defaults.  */
#define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100MB.  */
     80 
     81 /* Secure getenv() which returns NULL if running as SUID/SGID.  */
     82 #ifndef HAVE_SECURE_GETENV
     83 #ifdef HAVE___SECURE_GETENV
     84 #define secure_getenv __secure_getenv
     85 #elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
     86   && defined(HAVE_GETGID) && defined(HAVE_GETEGID)
     87 
     88 #include <unistd.h>
     89 
     90 /* Implementation of secure_getenv() for targets where it is not provided but
     91    we have at least means to test real and effective IDs. */
     92 
static char *
secure_getenv (const char *name)
{
  /* Refuse to read the environment when running with elevated
     privileges, i.e. when real and effective user/group IDs differ.  */
  bool ids_match = getuid () == geteuid () && getgid () == getegid ();
  return ids_match ? getenv (name) : NULL;
}
    101 
    102 #else
    103 #define secure_getenv getenv
    104 #endif
    105 #endif
    106 
    107 /* }}}  */
    108 /* {{{ Types  */
    109 
    110 /* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.  */
    111 
struct gcn_thread
{
  /* The thread number from the async clause, or GOMP_ASYNC_SYNC.
     NOTE(review): presumably used to select the matching
     goacc_asyncqueue for this thread — confirm at the use sites.  */
  int async;
};
    117 
    118 /* As an HSA runtime is dlopened, following structure defines function
    119    pointers utilized by the HSA plug-in.  */
    120 
struct hsa_runtime_fn_info
{
  /* HSA runtime.  Each member mirrors the HSA API function of the same
     name (minus the "_fn" suffix); the pointers are filled in after the
     runtime library is dlopened, so no hard link-time dependency on HSA
     is needed.  */

  /* Error reporting and system/agent/region queries.  */
  hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
					const char **status_string);
  hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
					 hsa_agent_info_t attribute,
					 void *value);
  hsa_status_t (*hsa_isa_get_info_fn)(hsa_isa_t isa,
				      hsa_isa_info_t attribute,
				      uint32_t index,
				      void *value);
  hsa_status_t (*hsa_init_fn) (void);
  hsa_status_t (*hsa_iterate_agents_fn)
    (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
  hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
					  hsa_region_info_t attribute,
					  void *value);
  /* Queue and executable management.  */
  hsa_status_t (*hsa_queue_create_fn)
    (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
     void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
     void *data, uint32_t private_segment_size,
     uint32_t group_segment_size, hsa_queue_t **queue);
  hsa_status_t (*hsa_agent_iterate_regions_fn)
    (hsa_agent_t agent,
     hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
  hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
  hsa_status_t (*hsa_executable_create_fn)
    (hsa_profile_t profile, hsa_executable_state_t executable_state,
     const char *options, hsa_executable_t *executable);
  hsa_status_t (*hsa_executable_global_variable_define_fn)
    (hsa_executable_t executable, const char *variable_name, void *address);
  hsa_status_t (*hsa_executable_load_code_object_fn)
    (hsa_executable_t executable, hsa_agent_t agent,
     hsa_code_object_t code_object, const char *options);
  hsa_status_t (*hsa_executable_freeze_fn)(hsa_executable_t executable,
					   const char *options);
  /* Signals and memory.  */
  hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
					uint32_t num_consumers,
					const hsa_agent_t *consumers,
					hsa_signal_t *signal);
  hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
					  void **ptr);
  hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
					      hsa_access_permission_t access);
  hsa_status_t (*hsa_memory_copy_fn)(void *dst, const void *src, size_t size);
  hsa_status_t (*hsa_memory_free_fn) (void *ptr);
  hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
  /* Symbol lookup.  */
  hsa_status_t (*hsa_executable_get_symbol_fn)
    (hsa_executable_t executable, const char *module_name,
     const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
     hsa_executable_symbol_t *symbol);
  hsa_status_t (*hsa_executable_symbol_get_info_fn)
    (hsa_executable_symbol_t executable_symbol,
     hsa_executable_symbol_info_t attribute, void *value);
  hsa_status_t (*hsa_executable_iterate_symbols_fn)
    (hsa_executable_t executable,
     hsa_status_t (*callback)(hsa_executable_t executable,
			      hsa_executable_symbol_t symbol, void *data),
     void *data);
  /* Queue indices and signal operations (with memory-order suffixes as
     defined by the HSA API).  */
  uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
						    uint64_t value);
  uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
  void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
    (hsa_signal_t signal, hsa_signal_condition_t condition,
     hsa_signal_value_t compare_value, uint64_t timeout_hint,
     hsa_wait_state_t wait_state_hint);
  hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);

  hsa_status_t (*hsa_code_object_deserialize_fn)
    (void *serialized_code_object, size_t serialized_code_object_size,
     const char *options, hsa_code_object_t *code_object);
};
    201 
    202 /* Structure describing the run-time and grid properties of an HSA kernel
    203    lauch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */
    204 
struct GOMP_kernel_launch_attributes
{
  /* Number of dimensions the workload has.  Maximum number is 3.  */
  uint32_t ndim;
  /* Size of the grid in the three respective dimensions.
     NOTE(review): presumably only the first NDIM entries are
     meaningful — confirm against GOMP_OFFLOAD_run.  */
  uint32_t gdims[3];
  /* Size of work-groups in the respective dimensions.  */
  uint32_t wdims[3];
};
    214 
    215 /* Collection of information needed for a dispatch of a kernel from a
    216    kernel.  */
    217 
struct kernel_dispatch
{
  /* The agent (device) the kernel is dispatched to.  */
  struct agent_info *agent;
  /* Pointer to a command queue associated with a kernel dispatch agent.  */
  void *queue;
  /* Pointer to a memory space used for kernel arguments passing.  */
  void *kernarg_address;
  /* Kernel object.  */
  uint64_t object;
  /* Synchronization signal used for dispatch synchronization.  */
  uint64_t signal;
  /* Private segment size.  */
  uint32_t private_segment_size;
  /* Group segment size.  */
  uint32_t group_segment_size;
};
    234 
    235 /* Structure of the kernargs segment, supporting console output.
    236 
    237    This needs to match the definitions in Newlib, and the expectations
    238    in libgomp target code.  */
    239 
struct kernargs {
  /* Leave space for the real kernel arguments.
     OpenACC and OpenMP only use one pointer.  */
  int64_t dummy1;
  int64_t dummy2;

  /* A pointer to struct output, below, for console output data.  */
  int64_t out_ptr;

  /* A pointer to struct heap, below.  */
  int64_t heap_ptr;

  /* A pointer to an ephemeral memory arena.
    Only needed for OpenMP.  */
  int64_t arena_ptr;

  /* Output data.  */
  struct output {
    /* Value the kernel reports back to the host.  */
    int return_value;
    /* NOTE(review): NEXT_OUTPUT/CONSUMED presumably form the write/read
       indices of the QUEUE ring buffer below; the exact protocol is
       defined by Newlib and the libgomp target code, not visible here.  */
    unsigned int next_output;
    /* One console-output record; TYPE selects which union member is
       valid (type codes defined by Newlib).  */
    struct printf_data {
      int written;
      char msg[128];
      int type;
      union {
	int64_t ivalue;
	double dvalue;
	char text[128];
      };
    } queue[1024];
    unsigned int consumed;
  } output_data;
};
    273 
    274 /* A queue entry for a future asynchronous launch.  */
    275 
struct kernel_launch
{
  /* The kernel to launch when this entry is drained.  */
  struct kernel_info *kernel;
  /* Argument block to pass to the kernel.  */
  void *vars;
  /* Grid/work-group geometry for the launch.  */
  struct GOMP_kernel_launch_attributes kla;
};
    282 
    283 /* A queue entry for a future callback.  */
    284 
struct callback
{
  /* Host function to invoke, with DATA as its only argument.  */
  void (*fn)(void *);
  void *data;
};
    290 
    291 /* A data struct for the copy_data callback.  */
    292 
struct copy_data
{
  /* Destination and source of the copy, and its length in bytes.  */
  void *dst;
  const void *src;
  size_t len;
  /* When true, SRC is freed after the copy completes.  */
  bool free_src;
  /* The queue on which the copy was scheduled.  */
  struct goacc_asyncqueue *aq;
};
    301 
    302 /* A queue entry for a placeholder.  These correspond to a wait event.  */
    303 
struct placeholder
{
  /* Set non-zero once the placeholder has run; COND is signalled under
     MUTEX so a waiter can block until then.  */
  int executed;
  pthread_cond_t cond;
  pthread_mutex_t mutex;
};
    310 
    311 /* A queue entry for a wait directive.  */
    312 
struct asyncwait_info
{
  /* The placeholder (in another queue) this wait entry blocks on.  */
  struct placeholder *placeholderp;
};
    317 
    318 /* Encode the type of an entry in an async queue.  */
    319 
enum entry_type
{
  KERNEL_LAUNCH,	/* Union member "launch".  */
  CALLBACK,		/* Union member "callback".  */
  ASYNC_WAIT,		/* Union member "asyncwait".  */
  ASYNC_PLACEHOLDER	/* Union member "placeholder".  */
};
    327 
    328 /* An entry in an async queue.  */
    329 
struct queue_entry
{
  /* Discriminator: selects which member of U is valid (see the
     per-enumerator comments on enum entry_type).  */
  enum entry_type type;
  union {
    struct kernel_launch launch;
    struct callback callback;
    struct asyncwait_info asyncwait;
    struct placeholder placeholder;
  } u;
};
    340 
    341 /* An async queue header.
    342 
    343    OpenMP may create one of these.
    344    OpenACC may create many.  */
    345 
struct goacc_asyncqueue
{
  /* The agent this queue belongs to, and its underlying HSA queue.  */
  struct agent_info *agent;
  hsa_queue_t *hsa_queue;

  /* Thread that drains the ring buffer below; MUTEX protects the ring,
     and the condition variables presumably signal "entry added" /
     "entry consumed" respectively — confirm at the drain loop.  */
  pthread_t thread_drain_queue;
  pthread_mutex_t mutex;
  pthread_cond_t queue_cond_in;
  pthread_cond_t queue_cond_out;
  /* Ring buffer of pending operations: QUEUE_FIRST indexes the oldest
     entry, QUEUE_N is the number of occupied slots.  */
  struct queue_entry queue[ASYNC_QUEUE_SIZE];
  int queue_first;
  int queue_n;
  /* Flag asking the drain thread to exit.  */
  int drain_queue_stop;

  /* Identifier, plus links in the agent's doubly-linked list of queues.  */
  int id;
  struct goacc_asyncqueue *prev;
  struct goacc_asyncqueue *next;
};
    364 
    365 /* Mkoffload uses this structure to describe a kernel.
    366 
    367    OpenMP kernel dimensions are passed at runtime.
    368    OpenACC kernel dimensions are passed at compile time, here.  */
    369 
struct hsa_kernel_description
{
  const char *name;
  int oacc_dims[3];  /* Only present for GCN kernels.  */
  /* Register counts reported by the compiler.  NOTE(review): "vpgr" is
     presumably a typo for VGPR (vector GPR); the field name is part of
     the mkoffload interface and so is kept as-is.  */
  int sgpr_count;
  int vpgr_count;
};
    377 
    378 /* Mkoffload uses this structure to describe an offload variable.  */
    379 
struct global_var_info
{
  /* Symbol name and host-side address of the offload variable.  */
  const char *name;
  void *address;
};
    385 
    386 /* Mkoffload uses this structure to describe all the kernels in a
    387    loadable module.  These are passed the libgomp via static constructors.  */
    388 
struct gcn_image_desc
{
  /* The ELF code object itself: raw bytes and their size.  */
  struct gcn_image {
    size_t size;
    void *image;
  } *gcn_image;
  /* Kernels contained in the image, as described by mkoffload.  */
  const unsigned kernel_count;
  struct hsa_kernel_description *kernel_infos;
  /* Offload variables contained in the image.  */
  const unsigned global_variable_count;
  struct global_var_info *global_variables;
};
    400 
    401 /* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
    402    support.
    403    See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table */
    404 
/* Machine codes from the ELF e_flags field, matching LLVM's values.  */

typedef enum {
  EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
  EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
  EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
} EF_AMDGPU_MACH;

/* Mask extracting the machine-ID field from e_flags.  Declared with the
   conventional "static const" specifier order (was "const static").  */
static const int EF_AMDGPU_MACH_MASK = 0x000000ff;
typedef EF_AMDGPU_MACH gcn_isa;
    413 
    414 /* Description of an HSA GPU agent (device) and the program associated with
    415    it.  */
    416 
struct agent_info
{
  /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
  hsa_agent_t id;
  /* The user-visible device number.  */
  int device_id;
  /* Whether the agent has been initialized.  The fields below are usable only
     if it has been.  */
  bool initialized;

  /* The instruction set architecture of the device. */
  gcn_isa device_isa;
  /* Name of the agent. */
  char name[64];
  /* Name of the vendor of the agent. */
  char vendor_name[64];
  /* Command queues of the agent.  SYNC_QUEUE serves synchronous work;
     ASYNC_QUEUES is the list of OpenACC queues and OMP_ASYNC_QUEUE the
     single OpenMP one, presumably guarded by ASYNC_QUEUES_MUTEX —
     confirm at the use sites.  */
  hsa_queue_t *sync_queue;
  struct goacc_asyncqueue *async_queues, *omp_async_queue;
  pthread_mutex_t async_queues_mutex;

  /* The HSA memory region from which to allocate kernel arguments.  */
  hsa_region_t kernarg_region;

  /* The HSA memory region from which to allocate device data.  */
  hsa_region_t data_region;

  /* Allocated team arenas; the lock name suggests it serializes
     modifications of the list.  */
  struct team_arena_list *team_arena_list;
  pthread_mutex_t team_arena_write_lock;

  /* Read-write lock that protects kernels which are running or about to be run
     from interference with loading and unloading of images.  Needs to be
     locked for reading while a kernel is being run, and for writing if the
     list of modules is manipulated (and thus the HSA program invalidated).  */
  pthread_rwlock_t module_rwlock;

  /* The module associated with this kernel.  */
  struct module_info *module;

  /* Mutex enforcing that only one thread will finalize the HSA program.  A
     thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t prog_mutex;
  /* Flag whether the HSA program that consists of all the modules has been
     finalized.  */
  bool prog_finalized;
  /* HSA executable - the finalized program that is used to locate kernels.  */
  hsa_executable_t executable;
};
    467 
    468 /* Information required to identify, finalize and run any given kernel.  */
    469 
/* Programming model a kernel was compiled for.  */

enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};

struct kernel_info
{
  /* Name of the kernel, required to locate it within the GCN object-code
     module.  */
  const char *name;
  /* The specific agent the kernel has been or will be finalized for and run
     on.  */
  struct agent_info *agent;
  /* The specific module where the kernel takes place.  */
  struct module_info *module;
  /* Information provided by mkoffload associated with the kernel.  */
  struct hsa_kernel_description *description;
  /* Mutex enforcing that at most once thread ever initializes a kernel for
     use.  A thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t init_mutex;
  /* Flag indicating whether the kernel has been initialized and all fields
     below it contain valid data.  */
  bool initialized;
  /* Flag indicating that the kernel has a problem that blocks an execution.  */
  bool initialization_failed;
  /* The object to be put into the dispatch queue.  */
  uint64_t object;
  /* Required size of kernel arguments.  */
  uint32_t kernarg_segment_size;
  /* Required size of group segment.  */
  uint32_t group_segment_size;
  /* Required size of private segment.  */
  uint32_t private_segment_size;
  /* Set up for OpenMP or OpenACC?  */
  enum offload_kind kind;
};
    504 
    505 /* Information about a particular GCN module, its image and kernels.  */
    506 
struct module_info
{
  /* The description with which the program has registered the image.  */
  struct gcn_image_desc *image_desc;
  /* GCN heap allocation.  */
  struct heap *heap;
  /* Physical boundaries of the loaded module.  */
  Elf64_Addr phys_address_start;
  Elf64_Addr phys_address_end;

  /* Whether the module's constructors have been run, and the kernels
     that run the init_array/fini_array sequences (NULL-able, judging by
     the names — confirm at the call sites).  */
  bool constructors_run_p;
  struct kernel_info *init_array_func, *fini_array_func;

  /* Number of kernels in this module.  */
  int kernel_count;
  /* An array of kernel_info structures describing each kernel in this
     module.  (C99 flexible array member; the struct is allocated with
     room for KERNEL_COUNT entries.)  */
  struct kernel_info kernels[];
};
    526 
    527 /* A linked list of memory arenas allocated on the device.
    528    These are only used by OpenMP, as a means to optimize per-team malloc.  */
    529 
struct team_arena_list
{
  /* Next node; the list head lives in agent_info::team_arena_list.  */
  struct team_arena_list *next;

  /* The number of teams determines the size of the allocation.  */
  int num_teams;
  /* The device address of the arena itself.  */
  void *arena;
  /* A flag to prevent two asynchronous kernels trying to use the same arena.
     The mutex is locked until the kernel exits.  */
  pthread_mutex_t in_use;
};
    542 
    543 /* Information about the whole HSA environment and all of its agents.  */
    544 
struct hsa_context_info
{
  /* Whether the structure has been initialized.  */
  bool initialized;
  /* Number of usable GPU HSA agents in the system.  */
  int agent_count;
  /* Array of agent_info structures describing the individual HSA agents.  */
  struct agent_info *agents;
  /* Driver version string.  Fixed-size buffer; whoever fills it in must
     ensure NUL termination — not visible in this chunk.  */
  char driver_version_s[30];
};
    556 
    557 /* Format of the on-device heap.
    558 
    559    This must match the definition in Newlib and gcn-run.  */
    560 
struct heap {
  /* Size in bytes of DATA.  */
  int64_t size;
  /* Zero-length array (GNU extension): the heap storage follows the
     header.  Kept as "[0]" rather than a C99 flexible array member so
     the layout stays identical to the definition in Newlib/gcn-run.  */
  char data[0];
};
    565 
    566 /* }}}  */
    567 /* {{{ Global variables  */
    568 
/* Information about the whole HSA environment and all of its agents.  */

static struct hsa_context_info hsa_context;

/* HSA runtime functions that are initialized in init_hsa_context.  */

static struct hsa_runtime_fn_info hsa_fns;

/* Heap space, allocated target-side, provided for use of newlib malloc.
   Each module should have it's own heap allocated.
   Beware that heap usage increases with OpenMP teams.  See also arenas.  */

static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;

/* Flag to decide whether print to stderr information about what is going on.
   Set in init_debug depending on environment variables.  */

static bool debug;

/* Flag to decide if the runtime should suppress a possible fallback to host
   execution.  */

static bool suppress_host_fallback;

/* Flag to locate HSA runtime shared library that is dlopened
   by this plug-in.  */

static const char *hsa_runtime_lib;

/* Flag to decide if the runtime should support also CPU devices (can be
   a simulator).  */

static bool support_cpu_devices;

/* Runtime dimension overrides.  Zero indicates default.
   NOTE(review): presumably set from environment variables during
   initialization — the initialization code is not in this chunk.  */

static int override_x_dim = 0;
static int override_z_dim = 0;
    607 
    608 /* }}}  */
    609 /* {{{ Debug & Diagnostic  */
    610 
    611 /* Print a message to stderr if GCN_DEBUG value is set to true.  */
    612 
    613 #define DEBUG_PRINT(...) \
    614   do \
    615   { \
    616     if (debug) \
    617       { \
    618 	fprintf (stderr, __VA_ARGS__); \
    619       } \
    620   } \
    621   while (false);
    622 
    623 /* Flush stderr if GCN_DEBUG value is set to true.  */
    624 
    625 #define DEBUG_FLUSH()				\
    626   do {						\
    627     if (debug)					\
    628       fflush (stderr);				\
    629   } while (false)
    630 
    631 /* Print a logging message with PREFIX to stderr if GCN_DEBUG value
    632    is set to true.  */
    633 
    634 #define DEBUG_LOG(prefix, ...)			\
    635   do						\
    636     {						\
    637       DEBUG_PRINT (prefix);			\
    638       DEBUG_PRINT (__VA_ARGS__);			\
    639       DEBUG_FLUSH ();				\
    640     } while (false)
    641 
    642 /* Print a debugging message to stderr.  */
    643 
    644 #define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)
    645 
    646 /* Print a warning message to stderr.  */
    647 
    648 #define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
    649 
    650 /* Print HSA warning STR with an HSA STATUS code.  */
    651 
    652 static void
    653 hsa_warn (const char *str, hsa_status_t status)
    654 {
    655   if (!debug)
    656     return;
    657 
    658   const char *hsa_error_msg = "[unknown]";
    659   hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
    660 
    661   fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
    662 	   hsa_error_msg);
    663 }
    664 
    665 /* Report a fatal error STR together with the HSA error corresponding to STATUS
    666    and terminate execution of the current process.  */
    667 
    668 static void
    669 hsa_fatal (const char *str, hsa_status_t status)
    670 {
    671   const char *hsa_error_msg = "[unknown]";
    672   hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
    673   GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
    674 		     hsa_error_msg);
    675 }
    676 
    677 /* Like hsa_fatal, except only report error message, and return FALSE
    678    for propagating error processing to outside of plugin.  */
    679 
    680 static bool
    681 hsa_error (const char *str, hsa_status_t status)
    682 {
    683   const char *hsa_error_msg = "[unknown]";
    684   hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
    685   GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
    686 		     hsa_error_msg);
    687   return false;
    688 }
    689 
    690 /* Dump information about the available hardware.  */
    691 
    692 static void
    693 dump_hsa_system_info (void)
    694 {
    695   hsa_status_t status;
    696 
    697   hsa_endianness_t endianness;
    698   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
    699 					   &endianness);
    700   if (status == HSA_STATUS_SUCCESS)
    701     switch (endianness)
    702       {
    703       case HSA_ENDIANNESS_LITTLE:
    704 	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
    705 	break;
    706       case HSA_ENDIANNESS_BIG:
    707 	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
    708 	break;
    709       default:
    710 	GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
    711       }
    712   else
    713     GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");
    714 
    715   uint8_t extensions[128];
    716   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
    717 					   &extensions);
    718   if (status == HSA_STATUS_SUCCESS)
    719     {
    720       if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
    721 	GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
    722     }
    723   else
    724     GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
    725 }
    726 
    727 /* Dump information about the available hardware.  */
    728 
    729 static void
    730 dump_machine_model (hsa_machine_model_t machine_model, const char *s)
    731 {
    732   switch (machine_model)
    733     {
    734     case HSA_MACHINE_MODEL_SMALL:
    735       GCN_DEBUG ("%s: SMALL\n", s);
    736       break;
    737     case HSA_MACHINE_MODEL_LARGE:
    738       GCN_DEBUG ("%s: LARGE\n", s);
    739       break;
    740     default:
    741       GCN_WARNING ("%s: UNKNOWN\n", s);
    742       break;
    743     }
    744 }
    745 
    746 /* Dump information about the available hardware.  */
    747 
    748 static void
    749 dump_profile (hsa_profile_t profile, const char *s)
    750 {
    751   switch (profile)
    752     {
    753     case HSA_PROFILE_FULL:
    754       GCN_DEBUG ("%s: FULL\n", s);
    755       break;
    756     case HSA_PROFILE_BASE:
    757       GCN_DEBUG ("%s: BASE\n", s);
    758       break;
    759     default:
    760       GCN_WARNING ("%s: UNKNOWN\n", s);
    761       break;
    762     }
    763 }
    764 
    765 /* Dump information about a device memory region.  */
    766 
    767 static hsa_status_t
    768 dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
    769 {
    770   hsa_status_t status;
    771 
    772   hsa_region_segment_t segment;
    773   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
    774 					   &segment);
    775   if (status == HSA_STATUS_SUCCESS)
    776     {
    777       if (segment == HSA_REGION_SEGMENT_GLOBAL)
    778 	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
    779       else if (segment == HSA_REGION_SEGMENT_READONLY)
    780 	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
    781       else if (segment == HSA_REGION_SEGMENT_PRIVATE)
    782 	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
    783       else if (segment == HSA_REGION_SEGMENT_GROUP)
    784 	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
    785       else
    786 	GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
    787     }
    788   else
    789     GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");
    790 
    791   if (segment == HSA_REGION_SEGMENT_GLOBAL)
    792     {
    793       uint32_t flags;
    794       status
    795 	= hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
    796 					  &flags);
    797       if (status == HSA_STATUS_SUCCESS)
    798 	{
    799 	  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
    800 	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
    801 	  if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
    802 	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
    803 	  if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
    804 	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
    805 	}
    806       else
    807 	GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
    808     }
    809 
    810   size_t size;
    811   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
    812   if (status == HSA_STATUS_SUCCESS)
    813     GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
    814   else
    815     GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");
    816 
    817   status
    818     = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
    819 				      &size);
    820   if (status == HSA_STATUS_SUCCESS)
    821     GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
    822   else
    823     GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
    824 
    825   bool alloc_allowed;
    826   status
    827     = hsa_fns.hsa_region_get_info_fn (region,
    828 				      HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
    829 				      &alloc_allowed);
    830   if (status == HSA_STATUS_SUCCESS)
    831     GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
    832   else
    833     GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
    834 
    835   if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
    836     return HSA_STATUS_SUCCESS;
    837 
    838   status
    839     = hsa_fns.hsa_region_get_info_fn (region,
    840 				      HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
    841 				      &size);
    842   if (status == HSA_STATUS_SUCCESS)
    843     GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
    844   else
    845     GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
    846 
    847   size_t align;
    848   status
    849     = hsa_fns.hsa_region_get_info_fn (region,
    850 				      HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
    851 				      &align);
    852   if (status == HSA_STATUS_SUCCESS)
    853     GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
    854   else
    855     GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
    856 
    857   return HSA_STATUS_SUCCESS;
    858 }
    859 
    860 /* Dump information about all the device memory regions.  */
    861 
    862 static void
    863 dump_hsa_regions (hsa_agent_t agent)
    864 {
    865   hsa_status_t status;
    866   status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
    867 						 dump_hsa_region,
    868 						 NULL);
    869   if (status != HSA_STATUS_SUCCESS)
    870     hsa_error ("Dumping hsa regions failed", status);
    871 }
    872 
    873 /* Dump information about the available devices.  */
    874 
    875 static hsa_status_t
    876 dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
    877 {
    878   hsa_status_t status;
    879 
    880   char buf[64];
    881   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
    882 					  &buf);
    883   if (status == HSA_STATUS_SUCCESS)
    884     GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
    885   else
    886     GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");
    887 
    888   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
    889 					  &buf);
    890   if (status == HSA_STATUS_SUCCESS)
    891     GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
    892   else
    893     GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");
    894 
    895   hsa_machine_model_t machine_model;
    896   status
    897     = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
    898 				     &machine_model);
    899   if (status == HSA_STATUS_SUCCESS)
    900     dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
    901   else
    902     GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");
    903 
    904   hsa_profile_t profile;
    905   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
    906 					  &profile);
    907   if (status == HSA_STATUS_SUCCESS)
    908     dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
    909   else
    910     GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");
    911 
    912   hsa_device_type_t device_type;
    913   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
    914 					  &device_type);
    915   if (status == HSA_STATUS_SUCCESS)
    916     {
    917       switch (device_type)
    918 	{
    919 	case HSA_DEVICE_TYPE_CPU:
    920 	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
    921 	  break;
    922 	case HSA_DEVICE_TYPE_GPU:
    923 	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
    924 	  break;
    925 	case HSA_DEVICE_TYPE_DSP:
    926 	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
    927 	  break;
    928 	default:
    929 	  GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
    930 	  break;
    931 	}
    932     }
    933   else
    934     GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");
    935 
    936   uint32_t cu_count;
    937   status = hsa_fns.hsa_agent_get_info_fn
    938     (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
    939   if (status == HSA_STATUS_SUCCESS)
    940     GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
    941   else
    942     GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");
    943 
    944   uint32_t size;
    945   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
    946 					  &size);
    947   if (status == HSA_STATUS_SUCCESS)
    948     GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
    949   else
    950     GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");
    951 
    952   uint32_t max_dim;
    953   status = hsa_fns.hsa_agent_get_info_fn (agent,
    954 					  HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
    955 					  &max_dim);
    956   if (status == HSA_STATUS_SUCCESS)
    957     GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
    958   else
    959     GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");
    960 
    961   uint32_t max_size;
    962   status = hsa_fns.hsa_agent_get_info_fn (agent,
    963 					  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
    964 					  &max_size);
    965   if (status == HSA_STATUS_SUCCESS)
    966     GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
    967   else
    968     GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");
    969 
    970   uint32_t grid_max_dim;
    971   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
    972 					  &grid_max_dim);
    973   if (status == HSA_STATUS_SUCCESS)
    974     GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
    975   else
    976     GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");
    977 
    978   uint32_t grid_max_size;
    979   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
    980 					  &grid_max_size);
    981   if (status == HSA_STATUS_SUCCESS)
    982     GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
    983   else
    984     GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");
    985 
    986   dump_hsa_regions (agent);
    987 
    988   return HSA_STATUS_SUCCESS;
    989 }
    990 
    991 /* Forward reference.  */
    992 
    993 static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);
    994 
    995 /* Helper function for dump_executable_symbols.  */
    996 
    997 static hsa_status_t
    998 dump_executable_symbol (hsa_executable_t executable,
    999 			hsa_executable_symbol_t symbol,
   1000 			void *data __attribute__((unused)))
   1001 {
   1002   char *name = get_executable_symbol_name (symbol);
   1003 
   1004   if (name)
   1005     {
   1006       GCN_DEBUG ("executable symbol: %s\n", name);
   1007       free (name);
   1008     }
   1009 
   1010   return HSA_STATUS_SUCCESS;
   1011 }
   1012 
   1013 /* Dump all global symbol in an executable.  */
   1014 
   1015 static void
   1016 dump_executable_symbols (hsa_executable_t executable)
   1017 {
   1018   hsa_status_t status;
   1019   status
   1020     = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
   1021 						 dump_executable_symbol,
   1022 						 NULL);
   1023   if (status != HSA_STATUS_SUCCESS)
   1024     hsa_fatal ("Could not dump HSA executable symbols", status);
   1025 }
   1026 
   1027 /* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */
   1028 
   1029 static void
   1030 print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
   1031 {
   1032   struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;
   1033 
   1034   fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
   1035   fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
   1036   fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
   1037   fprintf (stderr, "%*sheap address: %p\n", indent, "",
   1038 	   (void*)kernargs->heap_ptr);
   1039   fprintf (stderr, "%*sarena address: %p\n", indent, "",
   1040 	   (void*)kernargs->arena_ptr);
   1041   fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
   1042   fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
   1043 	   dispatch->private_segment_size);
   1044   fprintf (stderr, "%*sgroup_segment_size: %u\n", indent, "",
   1045 	   dispatch->group_segment_size);
   1046   fprintf (stderr, "\n");
   1047 }
   1048 
   1049 /* }}}  */
   1050 /* {{{ Utility functions  */
   1051 
   1052 /* Cast the thread local storage to gcn_thread.  */
   1053 
static inline struct gcn_thread *
gcn_thread (void)
{
  /* The plugin API hands back our per-thread data as an opaque pointer;
     convert it to the type this plugin stores there.  */
  void *thread_data = GOMP_PLUGIN_acc_thread ();
  return (struct gcn_thread *) thread_data;
}
   1059 
   1060 /* Initialize debug and suppress_host_fallback according to the environment.  */
   1061 
   1062 static void
   1063 init_environment_variables (void)
   1064 {
   1065   if (secure_getenv ("GCN_DEBUG"))
   1066     debug = true;
   1067   else
   1068     debug = false;
   1069 
   1070   if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
   1071     suppress_host_fallback = true;
   1072   else
   1073     suppress_host_fallback = false;
   1074 
   1075   hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
   1076   if (hsa_runtime_lib == NULL)
   1077     hsa_runtime_lib = "libhsa-runtime64.so";
   1078 
   1079   support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
   1080 
   1081   const char *x = secure_getenv ("GCN_NUM_TEAMS");
   1082   if (!x)
   1083     x = secure_getenv ("GCN_NUM_GANGS");
   1084   if (x)
   1085     override_x_dim = atoi (x);
   1086 
   1087   const char *z = secure_getenv ("GCN_NUM_THREADS");
   1088   if (!z)
   1089     z = secure_getenv ("GCN_NUM_WORKERS");
   1090   if (z)
   1091     override_z_dim = atoi (z);
   1092 
   1093   const char *heap = secure_getenv ("GCN_HEAP_SIZE");
   1094   if (heap)
   1095     {
   1096       size_t tmp = atol (heap);
   1097       if (tmp)
   1098 	gcn_kernel_heap_size = tmp;
   1099     }
   1100 }
   1101 
   1102 /* Return malloc'd string with name of SYMBOL.  */
   1103 
   1104 static char *
   1105 get_executable_symbol_name (hsa_executable_symbol_t symbol)
   1106 {
   1107   hsa_status_t status;
   1108   char *res;
   1109   uint32_t len;
   1110   const hsa_executable_symbol_info_t info_name_length
   1111     = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;
   1112 
   1113   status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
   1114 						      &len);
   1115   if (status != HSA_STATUS_SUCCESS)
   1116     {
   1117       hsa_error ("Could not get length of symbol name", status);
   1118       return NULL;
   1119     }
   1120 
   1121   res = GOMP_PLUGIN_malloc (len + 1);
   1122 
   1123   const hsa_executable_symbol_info_t info_name
   1124     = HSA_EXECUTABLE_SYMBOL_INFO_NAME;
   1125 
   1126   status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);
   1127 
   1128   if (status != HSA_STATUS_SUCCESS)
   1129     {
   1130       hsa_error ("Could not get symbol name", status);
   1131       free (res);
   1132       return NULL;
   1133     }
   1134 
   1135   res[len] = '\0';
   1136 
   1137   return res;
   1138 }
   1139 
   1140 /* Helper function for find_executable_symbol.  */
   1141 
   1142 static hsa_status_t
   1143 find_executable_symbol_1 (hsa_executable_t executable,
   1144 			  hsa_executable_symbol_t symbol,
   1145 			  void *data)
   1146 {
   1147   hsa_executable_symbol_t *res = (hsa_executable_symbol_t *)data;
   1148   *res = symbol;
   1149   return HSA_STATUS_INFO_BREAK;
   1150 }
   1151 
   1152 /* Find a global symbol in EXECUTABLE, save to *SYMBOL and return true.  If not
   1153    found, return false.  */
   1154 
   1155 static bool
   1156 find_executable_symbol (hsa_executable_t executable,
   1157 			hsa_executable_symbol_t *symbol)
   1158 {
   1159   hsa_status_t status;
   1160 
   1161   status
   1162     = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
   1163 						 find_executable_symbol_1,
   1164 						 symbol);
   1165   if (status != HSA_STATUS_INFO_BREAK)
   1166     {
   1167       hsa_error ("Could not find executable symbol", status);
   1168       return false;
   1169     }
   1170 
   1171   return true;
   1172 }
   1173 
   1174 /* Get the number of GPU Compute Units.  */
   1175 
   1176 static int
   1177 get_cu_count (struct agent_info *agent)
   1178 {
   1179   uint32_t cu_count;
   1180   hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
   1181     (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
   1182   if (status == HSA_STATUS_SUCCESS)
   1183     return cu_count;
   1184   else
   1185     return 64;  /* The usual number for older devices.  */
   1186 }
   1187 
   1188 /* Calculate the maximum grid size for OMP threads / OACC workers.
   1189    This depends on the kernel's resource usage levels.  */
   1190 
static int
limit_worker_threads (int threads)
{
  /* FIXME Do something more intelligent here.
     GCN can always run 4 threads within a Compute Unit, but
     more than that depends on register usage.  */
  const int max_worker_threads = 16;
  return threads > max_worker_threads ? max_worker_threads : threads;
}
   1201 
   1202 /* Parse the target attributes INPUT provided by the compiler and return true
   1203    if we should run anything all.  If INPUT is NULL, fill DEF with default
   1204    values, then store INPUT or DEF into *RESULT.
   1205 
   1206    This is used for OpenMP only.  */
   1207 
static bool
parse_target_attributes (void **input,
			 struct GOMP_kernel_launch_attributes *def,
			 struct GOMP_kernel_launch_attributes **result,
			 struct agent_info *agent)
{
  if (!input)
    GOMP_PLUGIN_fatal ("No target arguments provided");

  /* Walk the NULL-terminated argument vector.  Each entry is either a
     value packed into the id word itself, or an id word followed by a
     separate value word (GOMP_TARGET_ARG_SUBSEQUENT_PARAM).  */
  bool grid_attrs_found = false;
  bool gcn_dims_found = false;
  int gcn_teams = 0;
  int gcn_threads = 0;
  while (*input)
    {
      intptr_t id = (intptr_t) *input++, val;

      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *input++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;

      /* Clamp to int range; the launch attributes are 32-bit.  */
      val = (val > INT_MAX) ? INT_MAX : val;

      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
	  && ((id & GOMP_TARGET_ARG_ID_MASK)
	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
	{
	  /* An explicit launch-attributes struct follows in *input;
	     it takes precedence, so stop scanning here.  */
	  grid_attrs_found = true;
	  break;
	}
      else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
	       == GOMP_TARGET_ARG_DEVICE_ALL)
	{
	  /* Device-independent teams/thread-limit requests; remember
	     them so defaults can be built below.  */
	  gcn_dims_found = true;
	  switch (id & GOMP_TARGET_ARG_ID_MASK)
	    {
	    case GOMP_TARGET_ARG_NUM_TEAMS:
	      gcn_teams = val;
	      break;
	    case GOMP_TARGET_ARG_THREAD_LIMIT:
	      gcn_threads = limit_worker_threads (val);
	      break;
	    default:
	      ;
	    }
	}
    }

  if (gcn_dims_found)
    {
      /* Build default 3-D launch dimensions from the teams/threads
	 requests (or device properties where unspecified).  */
      if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
	  && gcn_threads == 0 && override_z_dim == 0)
	{
	  gcn_threads = 4;
	  GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
		       "threads to 4 per team.\n");
	  GCN_WARNING (" - If this is not a Vega 10 device, please use "
		       "GCN_NUM_THREADS=16\n");
	}

      def->ndim = 3;
      /* Fiji has 64 CUs, but Vega20 has 60.  */
      def->gdims[0] = (gcn_teams > 0) ? gcn_teams : get_cu_count (agent);
      /* Each thread is 64 work items wide.  */
      def->gdims[1] = 64;
      /* A work group can have 16 wavefronts.  */
      def->gdims[2] = (gcn_threads > 0) ? gcn_threads : 16;
      def->wdims[0] = 1; /* Single team per work-group.  */
      def->wdims[1] = 64;
      def->wdims[2] = 16;
      *result = def;
      return true;
    }
  else if (!grid_attrs_found)
    {
      /* No attributes at all: fall back to a minimal 1x1x1 launch.  */
      def->ndim = 1;
      def->gdims[0] = 1;
      def->gdims[1] = 1;
      def->gdims[2] = 1;
      def->wdims[0] = 1;
      def->wdims[1] = 1;
      def->wdims[2] = 1;
      *result = def;
      GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
      return true;
    }

  /* Explicit launch attributes were provided; validate them.  */
  struct GOMP_kernel_launch_attributes *kla;
  kla = (struct GOMP_kernel_launch_attributes *) *input;
  *result = kla;
  if (kla->ndim == 0 || kla->ndim > 3)
    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);

  GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
  unsigned i;
  for (i = 0; i < kla->ndim; i++)
    {
      GCN_DEBUG ("  Dimension %u: grid size %u and group size %u\n", i,
		 kla->gdims[i], kla->wdims[i]);
      /* A zero-sized grid dimension means there is nothing to run.  */
      if (kla->gdims[i] == 0)
	return false;
    }
  return true;
}
   1313 
   1314 /* Return the group size given the requested GROUP size, GRID size and number
   1315    of grid dimensions NDIM.  */
   1316 
static uint32_t
get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
{
  if (group == 0)
    {
      /* TODO: Provide a default via environment or device characteristics.  */
      switch (ndim)
	{
	case 1:
	  group = 64;
	  break;
	case 2:
	  group = 8;
	  break;
	default:
	  group = 4;
	  break;
	}
    }

  /* The group size can never exceed the grid size.  */
  return group > grid ? grid : group;
}
   1335 
   1336 /* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */
   1337 
static void
packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
{
  /* Promote REST to uint32_t before shifting: a uint16_t promotes to
     (signed) int, and left-shifting a value >= 0x8000 by 16 would
     overflow int, which is undefined behaviour.  */
  __atomic_store_n (packet, header | ((uint32_t) rest << 16),
		    __ATOMIC_RELEASE);
}
   1343 
   1344 /* A never-called callback for the HSA command queues.  These signal events
   1345    that we don't use, so we trigger an error.
   1346 
   1347    This "queue" is not to be confused with the async queues, below.  */
   1348 
   1349 static void
   1350 hsa_queue_callback (hsa_status_t status,
   1351 		hsa_queue_t *queue __attribute__ ((unused)),
   1352 		void *data __attribute__ ((unused)))
   1353 {
   1354   hsa_fatal ("Asynchronous queue error", status);
   1355 }
   1356 
   1357 /* }}}  */
   1358 /* {{{ HSA initialization  */
   1359 
   1360 /* Populate hsa_fns with the function addresses from libhsa-runtime64.so.  */
   1361 
static bool
init_hsa_runtime_functions (void)
{
/* Resolve one HSA entry point into hsa_fns; fail the whole
   initialization if the symbol is missing from the library.  */
#define DLSYM_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function); \
  if (hsa_fns.function##_fn == NULL) \
    return false;
  /* The handle is deliberately never dlclose'd: the function pointers
     must stay valid for the lifetime of the plugin.  */
  void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
  if (handle == NULL)
    return false;

  DLSYM_FN (hsa_status_string)
  DLSYM_FN (hsa_system_get_info)
  DLSYM_FN (hsa_agent_get_info)
  DLSYM_FN (hsa_init)
  DLSYM_FN (hsa_iterate_agents)
  DLSYM_FN (hsa_region_get_info)
  DLSYM_FN (hsa_queue_create)
  DLSYM_FN (hsa_agent_iterate_regions)
  DLSYM_FN (hsa_executable_destroy)
  DLSYM_FN (hsa_executable_create)
  DLSYM_FN (hsa_executable_global_variable_define)
  DLSYM_FN (hsa_executable_load_code_object)
  DLSYM_FN (hsa_executable_freeze)
  DLSYM_FN (hsa_signal_create)
  DLSYM_FN (hsa_memory_allocate)
  DLSYM_FN (hsa_memory_assign_agent)
  DLSYM_FN (hsa_memory_copy)
  DLSYM_FN (hsa_memory_free)
  DLSYM_FN (hsa_signal_destroy)
  DLSYM_FN (hsa_executable_get_symbol)
  DLSYM_FN (hsa_executable_symbol_get_info)
  DLSYM_FN (hsa_executable_iterate_symbols)
  DLSYM_FN (hsa_queue_add_write_index_release)
  DLSYM_FN (hsa_queue_load_read_index_acquire)
  DLSYM_FN (hsa_signal_wait_acquire)
  DLSYM_FN (hsa_signal_store_relaxed)
  DLSYM_FN (hsa_signal_store_release)
  DLSYM_FN (hsa_signal_load_acquire)
  DLSYM_FN (hsa_queue_destroy)
  DLSYM_FN (hsa_code_object_deserialize)
  return true;
#undef DLSYM_FN
}
   1406 
   1407 /* Return true if the agent is a GPU and can accept of concurrent submissions
   1408    from different threads.  */
   1409 
   1410 static bool
   1411 suitable_hsa_agent_p (hsa_agent_t agent)
   1412 {
   1413   hsa_device_type_t device_type;
   1414   hsa_status_t status
   1415     = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
   1416 				     &device_type);
   1417   if (status != HSA_STATUS_SUCCESS)
   1418     return false;
   1419 
   1420   switch (device_type)
   1421     {
   1422     case HSA_DEVICE_TYPE_GPU:
   1423       break;
   1424     case HSA_DEVICE_TYPE_CPU:
   1425       if (!support_cpu_devices)
   1426 	return false;
   1427       break;
   1428     default:
   1429       return false;
   1430     }
   1431 
   1432   uint32_t features = 0;
   1433   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
   1434 					  &features);
   1435   if (status != HSA_STATUS_SUCCESS
   1436       || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
   1437     return false;
   1438   hsa_queue_type_t queue_type;
   1439   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
   1440 					  &queue_type);
   1441   if (status != HSA_STATUS_SUCCESS
   1442       || (queue_type != HSA_QUEUE_TYPE_MULTI))
   1443     return false;
   1444 
   1445   return true;
   1446 }
   1447 
   1448 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
   1449    agent_count in hsa_context.  */
   1450 
   1451 static hsa_status_t
   1452 count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
   1453 {
   1454   if (suitable_hsa_agent_p (agent))
   1455     hsa_context.agent_count++;
   1456   return HSA_STATUS_SUCCESS;
   1457 }
   1458 
   1459 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
   1460    id to the describing structure in the hsa context.  The index of the
   1461    structure is pointed to by DATA, increment it afterwards.  */
   1462 
   1463 static hsa_status_t
   1464 assign_agent_ids (hsa_agent_t agent, void *data)
   1465 {
   1466   if (suitable_hsa_agent_p (agent))
   1467     {
   1468       int *agent_index = (int *) data;
   1469       hsa_context.agents[*agent_index].id = agent;
   1470       ++*agent_index;
   1471     }
   1472   return HSA_STATUS_SUCCESS;
   1473 }
   1474 
   1475 /* Initialize hsa_context if it has not already been done.
   1476    Return TRUE on success.  */
   1477 
   1478 static bool
   1479 init_hsa_context (void)
   1480 {
   1481   hsa_status_t status;
   1482   int agent_index = 0;
   1483 
   1484   if (hsa_context.initialized)
   1485     return true;
   1486   init_environment_variables ();
   1487   if (!init_hsa_runtime_functions ())
   1488     {
   1489       GCN_WARNING ("Run-time could not be dynamically opened\n");
   1490       if (suppress_host_fallback)
   1491 	GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
   1492       return false;
   1493     }
   1494   status = hsa_fns.hsa_init_fn ();
   1495   if (status != HSA_STATUS_SUCCESS)
   1496     return hsa_error ("Run-time could not be initialized", status);
   1497   GCN_DEBUG ("HSA run-time initialized for GCN\n");
   1498 
   1499   if (debug)
   1500     dump_hsa_system_info ();
   1501 
   1502   status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
   1503   if (status != HSA_STATUS_SUCCESS)
   1504     return hsa_error ("GCN GPU devices could not be enumerated", status);
   1505   GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);
   1506 
   1507   hsa_context.agents
   1508     = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
   1509 				  * sizeof (struct agent_info));
   1510   status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
   1511   if (status != HSA_STATUS_SUCCESS)
   1512     return hsa_error ("Scanning compute agents failed", status);
   1513   if (agent_index != hsa_context.agent_count)
   1514     {
   1515       GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
   1516       return false;
   1517     }
   1518 
   1519   if (debug)
   1520     {
   1521       status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
   1522       if (status != HSA_STATUS_SUCCESS)
   1523 	GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
   1524     }
   1525 
   1526   uint16_t minor, major;
   1527   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
   1528 					   &minor);
   1529   if (status != HSA_STATUS_SUCCESS)
   1530     GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
   1531   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
   1532 					   &major);
   1533   if (status != HSA_STATUS_SUCCESS)
   1534     GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");
   1535 
   1536   size_t len = sizeof hsa_context.driver_version_s;
   1537   int printed = snprintf (hsa_context.driver_version_s, len,
   1538 			  "HSA Runtime %hu.%hu", (unsigned short int)major,
   1539 			  (unsigned short int)minor);
   1540   if (printed >= len)
   1541     GCN_WARNING ("HSA runtime version string was truncated."
   1542 		 "Version %hu.%hu is too long.", (unsigned short int)major,
   1543 		 (unsigned short int)minor);
   1544 
   1545   hsa_context.initialized = true;
   1546   return true;
   1547 }
   1548 
   1549 /* Verify that hsa_context has already been initialized and return the
   1550    agent_info structure describing device number N.  Return NULL on error.  */
   1551 
   1552 static struct agent_info *
   1553 get_agent_info (int n)
   1554 {
   1555   if (!hsa_context.initialized)
   1556     {
   1557       GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
   1558       return NULL;
   1559     }
   1560   if (n >= hsa_context.agent_count)
   1561     {
   1562       GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
   1563       return NULL;
   1564     }
   1565   if (!hsa_context.agents[n].initialized)
   1566     {
   1567       GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
   1568       return NULL;
   1569     }
   1570   return &hsa_context.agents[n];
   1571 }
   1572 
   1573 /* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.
   1574 
   1575    Selects (breaks at) a suitable region of type KIND.  */
   1576 
   1577 static hsa_status_t
   1578 get_memory_region (hsa_region_t region, hsa_region_t *retval,
   1579 		   hsa_region_global_flag_t kind)
   1580 {
   1581   hsa_status_t status;
   1582   hsa_region_segment_t segment;
   1583 
   1584   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
   1585 					   &segment);
   1586   if (status != HSA_STATUS_SUCCESS)
   1587     return status;
   1588   if (segment != HSA_REGION_SEGMENT_GLOBAL)
   1589     return HSA_STATUS_SUCCESS;
   1590 
   1591   uint32_t flags;
   1592   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
   1593 					   &flags);
   1594   if (status != HSA_STATUS_SUCCESS)
   1595     return status;
   1596   if (flags & kind)
   1597     {
   1598       *retval = region;
   1599       return HSA_STATUS_INFO_BREAK;
   1600     }
   1601   return HSA_STATUS_SUCCESS;
   1602 }
   1603 
   1604 /* Callback of hsa_agent_iterate_regions.
   1605 
   1606    Selects a kernargs memory region.  */
   1607 
   1608 static hsa_status_t
   1609 get_kernarg_memory_region (hsa_region_t region, void *data)
   1610 {
   1611   return get_memory_region (region, (hsa_region_t *)data,
   1612 			    HSA_REGION_GLOBAL_FLAG_KERNARG);
   1613 }
   1614 
   1615 /* Callback of hsa_agent_iterate_regions.
   1616 
   1617    Selects a coarse-grained memory region suitable for the heap and
   1618    offload data.  */
   1619 
   1620 static hsa_status_t
   1621 get_data_memory_region (hsa_region_t region, void *data)
   1622 {
   1623   return get_memory_region (region, (hsa_region_t *)data,
   1624 			    HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
   1625 }
   1626 
/* Return the EF_AMDGPU_MACH_* ISA field encoded in the e_flags word of
   the ELF header IMAGE.  */

static int
elf_gcn_isa_field (Elf64_Ehdr *image)
{
  return image->e_flags & EF_AMDGPU_MACH_MASK;
}
   1632 
/* ISA names as reported by the HSA runtime; each is exactly
   gcn_isa_name_len characters long (relied upon by isa_code below).  */
const static char *gcn_gfx803_s = "gfx803";
const static char *gcn_gfx900_s = "gfx900";
const static char *gcn_gfx906_s = "gfx906";
const static int gcn_isa_name_len = 6;
   1637 
   1638 /* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
   1639    support the ISA. */
   1640 
   1641 static const char*
   1642 isa_hsa_name (int isa) {
   1643   switch(isa)
   1644     {
   1645     case EF_AMDGPU_MACH_AMDGCN_GFX803:
   1646       return gcn_gfx803_s;
   1647     case EF_AMDGPU_MACH_AMDGCN_GFX900:
   1648       return gcn_gfx900_s;
   1649     case EF_AMDGPU_MACH_AMDGCN_GFX906:
   1650       return gcn_gfx906_s;
   1651     }
   1652   return NULL;
   1653 }
   1654 
   1655 /* Returns the user-facing name that GCC uses to identify the architecture (e.g.
   1656    with -march) or NULL if we do not support the ISA.
   1657    Keep in sync with /gcc/config/gcn/gcn.{c,opt}.  */
   1658 
   1659 static const char*
   1660 isa_gcc_name (int isa) {
   1661   switch(isa)
   1662     {
   1663     case EF_AMDGPU_MACH_AMDGCN_GFX803:
   1664       return "fiji";
   1665     default:
   1666       return isa_hsa_name (isa);
   1667     }
   1668 }
   1669 
   1670 /* Returns the code which is used in the GCN object code to identify the ISA with
   1671    the given name (as used by the HSA runtime).  */
   1672 
   1673 static gcn_isa
   1674 isa_code(const char *isa) {
   1675   if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
   1676     return EF_AMDGPU_MACH_AMDGCN_GFX803;
   1677 
   1678   if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
   1679     return EF_AMDGPU_MACH_AMDGCN_GFX900;
   1680 
   1681   if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
   1682     return EF_AMDGPU_MACH_AMDGCN_GFX906;
   1683 
   1684   return -1;
   1685 }
   1686 
   1687 /* }}}  */
   1688 /* {{{ Run  */
   1689 
   1690 /* Create or reuse a team arena.
   1691 
   1692    Team arenas are used by OpenMP to avoid calling malloc multiple times
   1693    while setting up each team.  This is purely a performance optimization.
   1694 
   1695    Allocating an arena also costs performance, albeit on the host side, so
   1696    this function will reuse an existing arena if a large enough one is idle.
   1697    The arena is released, but not deallocated, when the kernel exits.  */
   1698 
   1699 static void *
   1700 get_team_arena (struct agent_info *agent, int num_teams)
   1701 {
   1702   struct team_arena_list **next_ptr = &agent->team_arena_list;
   1703   struct team_arena_list *item;
   1704 
   1705   for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
   1706     {
   1707       if (item->num_teams < num_teams)
   1708 	continue;
   1709 
   1710       if (pthread_mutex_trylock (&item->in_use))
   1711 	continue;
   1712 
   1713       return item->arena;
   1714     }
   1715 
   1716   GCN_DEBUG ("Creating a new arena for %d teams\n", num_teams);
   1717 
   1718   if (pthread_mutex_lock (&agent->team_arena_write_lock))
   1719     {
   1720       GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
   1721       return false;
   1722     }
   1723   item = malloc (sizeof (*item));
   1724   item->num_teams = num_teams;
   1725   item->next = NULL;
   1726   *next_ptr = item;
   1727 
   1728   if (pthread_mutex_init (&item->in_use, NULL))
   1729     {
   1730       GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
   1731       return false;
   1732     }
   1733   if (pthread_mutex_lock (&item->in_use))
   1734     {
   1735       GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
   1736       return false;
   1737     }
   1738   if (pthread_mutex_unlock (&agent->team_arena_write_lock))
   1739     {
   1740       GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
   1741       return false;
   1742     }
   1743 
   1744   const int TEAM_ARENA_SIZE = 64*1024;  /* Must match libgomp.h.  */
   1745   hsa_status_t status;
   1746   status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
   1747 					   TEAM_ARENA_SIZE*num_teams,
   1748 					   &item->arena);
   1749   if (status != HSA_STATUS_SUCCESS)
   1750     hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
   1751   status = hsa_fns.hsa_memory_assign_agent_fn (item->arena, agent->id,
   1752 					       HSA_ACCESS_PERMISSION_RW);
   1753   if (status != HSA_STATUS_SUCCESS)
   1754     hsa_fatal ("Could not assign arena memory to device", status);
   1755 
   1756   return item->arena;
   1757 }
   1758 
   1759 /* Mark a team arena available for reuse.  */
   1760 
   1761 static void
   1762 release_team_arena (struct agent_info* agent, void *arena)
   1763 {
   1764   struct team_arena_list *item;
   1765 
   1766   for (item = agent->team_arena_list; item; item = item->next)
   1767     {
   1768       if (item->arena == arena)
   1769 	{
   1770 	  if (pthread_mutex_unlock (&item->in_use))
   1771 	    GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
   1772 	  return;
   1773 	}
   1774     }
   1775   GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
   1776 }
   1777 
   1778 /* Clean up all the allocated team arenas.  */
   1779 
   1780 static bool
   1781 destroy_team_arenas (struct agent_info *agent)
   1782 {
   1783   struct team_arena_list *item, *next;
   1784 
   1785   for (item = agent->team_arena_list; item; item = next)
   1786     {
   1787       next = item->next;
   1788       hsa_fns.hsa_memory_free_fn (item->arena);
   1789       if (pthread_mutex_destroy (&item->in_use))
   1790 	{
   1791 	  GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
   1792 	  return false;
   1793 	}
   1794       free (item);
   1795     }
   1796   agent->team_arena_list = NULL;
   1797 
   1798   return true;
   1799 }
   1800 
   1801 /* Allocate memory on a specified device.  */
   1802 
static void *
alloc_by_agent (struct agent_info *agent, size_t size)
{
  GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);

  /* Zero-size allocations are invalid, so in order to return a valid pointer
     we need to pass a valid size.  One source of zero-size allocations is
     kernargs for kernels that have no inputs or outputs (the kernel may
     only use console output, for example).  */
  if (size == 0)
    size = 4;

  void *ptr;
  hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
							size, &ptr);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not allocate device memory", status);
      return NULL;
    }

  /* Grant the device agent read/write access to the new allocation.  */
  status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
					       HSA_ACCESS_PERMISSION_RW);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not assign data memory to device", status);
      return NULL;
    }

  /* If an OpenACC profiling callback is active on this thread, report the
     allocation as an acc_ev_alloc event.  */
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_dispatch_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      acc_prof_info *prof_info = thr->prof_info;
      acc_event_info data_event_info;
      acc_api_info *api_info = thr->api_info;

      prof_info->event_type = acc_ev_alloc;

      data_event_info.data_event.event_type = prof_info->event_type;
      data_event_info.data_event.valid_bytes
	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
      data_event_info.data_event.parent_construct
	= acc_construct_parallel;
      data_event_info.data_event.implicit = 1;
      data_event_info.data_event.tool_info = NULL;
      data_event_info.data_event.var_name = NULL;
      data_event_info.data_event.bytes = size;
      data_event_info.data_event.host_ptr = NULL;
      data_event_info.data_event.device_ptr = (void *) ptr;

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
					    api_info);
    }

  return ptr;
}
   1863 
   1864 /* Create kernel dispatch data structure for given KERNEL, along with
   1865    the necessary device signals and memory allocations.  */
   1866 
static struct kernel_dispatch *
create_kernel_dispatch (struct kernel_info *kernel, int num_teams)
{
  struct agent_info *agent = kernel->agent;
  struct kernel_dispatch *shadow
    = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));

  shadow->agent = kernel->agent;
  shadow->object = kernel->object;

  /* The sync signal is initialized to 1; run_kernel waits for it to drop
     below 1, which signals kernel completion.  */
  hsa_signal_t sync_signal;
  hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Error creating the GCN sync signal", status);

  shadow->signal = sync_signal.handle;
  shadow->private_segment_size = kernel->private_segment_size;
  shadow->group_segment_size = kernel->group_segment_size;

  /* We expect kernels to request a single pointer, explicitly, and the
     rest of struct kernargs, implicitly.  If they request anything else
     then something is wrong.  */
  if (kernel->kernarg_segment_size > 8)
    {
      /* NOTE(review): GOMP_PLUGIN_fatal presumably does not return, so
	 this return statement looks unreachable (defensive only).  */
      GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
      return NULL;
    }

  /* Allocate the full struct kernargs even though the kernel only asked
     for one pointer; the extra fields carry the device I/O channels.  */
  status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
					   sizeof (struct kernargs),
					   &shadow->kernarg_address);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
  struct kernargs *kernargs = shadow->kernarg_address;

  /* Zero-initialize the output_data (minimum needed).  */
  kernargs->out_ptr = (int64_t)&kernargs->output_data;
  kernargs->output_data.next_output = 0;
  for (unsigned i = 0;
       i < (sizeof (kernargs->output_data.queue)
	    / sizeof (kernargs->output_data.queue[0]));
       i++)
    kernargs->output_data.queue[i].written = 0;
  kernargs->output_data.consumed = 0;

  /* Pass in the heap location.  */
  kernargs->heap_ptr = (int64_t)kernel->module->heap;

  /* Create an arena.  */
  if (kernel->kind == KIND_OPENMP)
    kernargs->arena_ptr = (int64_t)get_team_arena (agent, num_teams);
  else
    kernargs->arena_ptr = 0;

  /* Ensure we can recognize unset return values.  */
  kernargs->output_data.return_value = 0xcafe0000;

  return shadow;
}
   1926 
   1927 /* Output any data written to console output from the kernel.  It is expected
   1928    that this function is polled during kernel execution.
   1929 
   1930    We print all entries from the last item printed to the next entry without
   1931    a "written" flag.  If the "final" flag is set then it'll continue right to
   1932    the end.
   1933 
   1934    The print buffer is circular, but the from and to locations don't wrap when
   1935    the buffer does, so the output limit is UINT_MAX.  The target blocks on
   1936    output when the buffer is full.  */
   1937 
static void
console_output (struct kernel_info *kernel, struct kernargs *kernargs,
		bool final)
{
  /* Number of slots in the circular output queue.  */
  unsigned int limit = (sizeof (kernargs->output_data.queue)
			/ sizeof (kernargs->output_data.queue[0]));

  /* "consumed" is the host-side read cursor (published with release
     semantics below); "next_output" is the device-side write cursor.
     Neither cursor wraps, so the difference is the pending entry count.  */
  unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
				       __ATOMIC_ACQUIRE);
  unsigned int to = kernargs->output_data.next_output;

  if (from > to)
    {
      /* Overflow.  */
      if (final)
	printf ("GCN print buffer overflowed.\n");
      return;
    }

  unsigned int i;
  for (i = from; i < to; i++)
    {
      struct printf_data *data = &kernargs->output_data.queue[i%limit];

      /* An entry without its "written" flag is still being filled in on
	 the device side; stop here unless this is the final flush.  */
      if (!data->written && !final)
	break;

      /* The entry type selects how the payload fields are formatted.  */
      switch (data->type)
	{
	case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
	case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
	case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
	case 3: printf ("%.128s%.128s", data->msg, data->text); break;
	default: printf ("GCN print buffer error!\n"); break;
	}
      data->written = 0;
      /* Publish the advanced read cursor so the device may reuse the
	 slot (it blocks on output when the buffer is full).  */
      __atomic_store_n (&kernargs->output_data.consumed, i+1,
			__ATOMIC_RELEASE);
    }
  fflush (stdout);
}
   1979 
   1980 /* Release data structure created for a kernel dispatch in SHADOW argument,
   1981    and clean up the signal and memory allocations.  */
   1982 
   1983 static void
   1984 release_kernel_dispatch (struct kernel_dispatch *shadow)
   1985 {
   1986   GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
   1987 
   1988   struct kernargs *kernargs = shadow->kernarg_address;
   1989   void *arena = (void *)kernargs->arena_ptr;
   1990   if (arena)
   1991     release_team_arena (shadow->agent, arena);
   1992 
   1993   hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
   1994 
   1995   hsa_signal_t s;
   1996   s.handle = shadow->signal;
   1997   hsa_fns.hsa_signal_destroy_fn (s);
   1998 
   1999   free (shadow);
   2000 }
   2001 
   2002 /* Extract the properties from a kernel binary.  */
   2003 
static void
init_kernel_properties (struct kernel_info *kernel)
{
  hsa_status_t status;
  struct agent_info *agent = kernel->agent;
  hsa_executable_symbol_t kernel_symbol;
  /* Look the kernel up by name in the agent's (frozen) executable.  */
  status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						 kernel->name, agent->id,
						 0, &kernel_symbol);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_warn ("Could not find symbol for kernel in the code object", status);
      fprintf (stderr, "not found name: '%s'\n", kernel->name);
      dump_executable_symbols (agent->executable);
      goto failure;
    }
  GCN_DEBUG ("Located kernel %s\n", kernel->name);
  /* Extract the dispatchable kernel object handle and the three segment
     sizes recorded for this kernel; any failure here is fatal because a
     dispatch packet cannot be built without them.  */
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not extract a kernel object from its symbol", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
     &kernel->kernarg_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel argument size", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
     &kernel->group_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel group segment size", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
     &kernel->private_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel private segment size",
	       status);

  /* The kernel type is not known until something tries to launch it.  */
  kernel->kind = KIND_UNKNOWN;

  GCN_DEBUG ("Kernel structure for %s fully initialized with "
	     "following segment sizes: \n", kernel->name);
  GCN_DEBUG ("  group_segment_size: %u\n",
	     (unsigned) kernel->group_segment_size);
  GCN_DEBUG ("  private_segment_size: %u\n",
	     (unsigned) kernel->private_segment_size);
  GCN_DEBUG ("  kernarg_segment_size: %u\n",
	     (unsigned) kernel->kernarg_segment_size);
  return;

failure:
  /* Record the failure; init_kernel checks this flag and will not mark
     the kernel as initialized.  */
  kernel->initialization_failed = true;
}
   2058 
   2059 /* Do all the work that is necessary before running KERNEL for the first time.
   2060    The function assumes the program has been created, finalized and frozen by
   2061    create_and_finalize_hsa_program.  */
   2062 
   2063 static void
   2064 init_kernel (struct kernel_info *kernel)
   2065 {
   2066   if (pthread_mutex_lock (&kernel->init_mutex))
   2067     GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
   2068   if (kernel->initialized)
   2069     {
   2070       if (pthread_mutex_unlock (&kernel->init_mutex))
   2071 	GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
   2072 			   "mutex");
   2073 
   2074       return;
   2075     }
   2076 
   2077   init_kernel_properties (kernel);
   2078 
   2079   if (!kernel->initialization_failed)
   2080     {
   2081       GCN_DEBUG ("\n");
   2082 
   2083       kernel->initialized = true;
   2084     }
   2085   if (pthread_mutex_unlock (&kernel->init_mutex))
   2086     GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
   2087 		       "mutex");
   2088 }
   2089 
   2090 /* Run KERNEL on its agent, pass VARS to it as arguments and take
   2091    launch attributes from KLA.
   2092 
   2093    MODULE_LOCKED indicates that the caller already holds the lock and
   2094    run_kernel need not lock it again.
   2095    If AQ is NULL then agent->sync_queue will be used.  */
   2096 
static void
run_kernel (struct kernel_info *kernel, void *vars,
	    struct GOMP_kernel_launch_attributes *kla,
	    struct goacc_asyncqueue *aq, bool module_locked)
{
  GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
	     kernel->description->vpgr_count);

  /* Reduce the number of threads/workers if there are insufficient
     VGPRs available to run the kernels together.  */
  if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
    {
      /* Round the VGPR count up to the allocation granule of 4.  */
      int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
      int max_threads = (256 / granulated_vgprs) * 4;
      if (kla->gdims[2] > max_threads)
	{
	  GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
		       " per team/gang - reducing to %d threads/workers.\n",
		       kla->gdims[2], max_threads);
	  kla->gdims[2] = max_threads;
	}
    }

  /* Dump the requested launch dimensions when debugging.  */
  GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
	     (aq ? aq->id : 0));
  GCN_DEBUG ("GCN launch attribs: gdims:[");
  int i;
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->gdims[i]);
    }
  DEBUG_PRINT ("], normalized gdims:[");
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
    }
  DEBUG_PRINT ("], wdims:[");
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->wdims[i]);
    }
  DEBUG_PRINT ("]\n");
  DEBUG_FLUSH ();

  struct agent_info *agent = kernel->agent;
  if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
    GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");

  if (!agent->initialized)
    GOMP_PLUGIN_fatal ("Agent must be initialized");

  if (!kernel->initialized)
    GOMP_PLUGIN_fatal ("Called kernel must be initialized");

  /* Without an async queue, dispatch on the agent's synchronous queue.  */
  hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);

  /* Reserve a slot in the queue's ring buffer for the AQL packet.  */
  uint64_t index
    = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
  GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);

  /* Wait until the queue is not full before writing the packet.   */
  while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
	 >= command_q->size)
    ;

  /* Do not allow the dimensions to be overridden when running
     constructors or destructors.  */
  int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
  int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;

  hsa_kernel_dispatch_packet_t *packet;
  packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
	   + index % command_q->size;

  /* Clear everything but the packet's 4-byte header/setup word, which is
     written last (with release semantics) to make the packet valid.  */
  memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
  packet->grid_size_x = override_x ? : kla->gdims[0];
  packet->workgroup_size_x = get_group_size (kla->ndim,
					     packet->grid_size_x,
					     kla->wdims[0]);

  if (kla->ndim >= 2)
    {
      packet->grid_size_y = kla->gdims[1];
      packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
						 kla->wdims[1]);
    }
  else
    {
      packet->grid_size_y = 1;
      packet->workgroup_size_y = 1;
    }

  if (kla->ndim == 3)
    {
      packet->grid_size_z = limit_worker_threads (override_z
						  ? : kla->gdims[2]);
      packet->workgroup_size_z = get_group_size (kla->ndim,
						 packet->grid_size_z,
						 kla->wdims[2]);
    }
  else
    {
      packet->grid_size_z = 1;
      packet->workgroup_size_z = 1;
    }

  GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
	     " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
	     packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
	     packet->grid_size_x / packet->workgroup_size_x,
	     packet->grid_size_y / packet->workgroup_size_y,
	     packet->grid_size_z / packet->workgroup_size_z,
	     packet->workgroup_size_x, packet->workgroup_size_y,
	     packet->workgroup_size_z);

  struct kernel_dispatch *shadow
    = create_kernel_dispatch (kernel, packet->grid_size_x);
  shadow->queue = command_q;

  if (debug)
    {
      fprintf (stderr, "\nKernel has following dependencies:\n");
      print_kernel_dispatch (shadow, 2);
    }

  packet->private_segment_size = kernel->private_segment_size;
  packet->group_segment_size = kernel->group_segment_size;
  packet->kernel_object = kernel->object;
  packet->kernarg_address = shadow->kernarg_address;
  hsa_signal_t s;
  s.handle = shadow->signal;
  packet->completion_signal = s;
  hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
  /* The kernel's single explicit argument is the VARS pointer.  */
  memcpy (shadow->kernarg_address, &vars, sizeof (vars));

  GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");

  uint16_t header;
  header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;

  GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
	     agent->device_id);

  /* Atomically write the header/setup word, making the packet valid,
     then ring the doorbell to notify the packet processor.  */
  packet_store_release ((uint32_t *) packet, header,
			(uint16_t) kla->ndim
			<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);

  hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
				       index);

  GCN_DEBUG ("Kernel dispatched, waiting for completion\n");

  /* Root signal waits with 1ms timeout.  */
  while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
					     1000 * 1000,
					     HSA_WAIT_STATE_BLOCKED) != 0)
    {
      /* Drain any console output the kernel has produced so far; the
	 kernel blocks on output when its buffer fills up.  */
      console_output (kernel, shadow->kernarg_address, false);
    }
  console_output (kernel, shadow->kernarg_address, true);

  struct kernargs *kernargs = shadow->kernarg_address;
  unsigned int return_value = (unsigned int)kernargs->output_data.return_value;

  release_kernel_dispatch (shadow);

  if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
    GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");

  /* Decode the return value: 0xcafe in the top 16 bits is the sentinel
     set by create_kernel_dispatch (exit was never called); 0xffff means
     the kernel called exit, with a signal number in bits 8-15 and the
     exit code in the low byte.  Anything else indicates corruption.  */
  unsigned int upper = (return_value & ~0xffff) >> 16;
  if (upper == 0xcafe)
    ; // exit not called, normal termination.
  else if (upper == 0xffff)
    ; // exit called.
  else
    {
      GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
			 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
			 return_value);
      abort ();
    }

  if (upper == 0xffff)
    {
      unsigned int signal = (return_value >> 8) & 0xff;

      if (signal == SIGABRT)
	{
	  GCN_WARNING ("GCN Kernel aborted\n");
	  abort ();
	}
      else if (signal != 0)
	{
	  GCN_WARNING ("GCN Kernel received unknown signal\n");
	  abort ();
	}

      /* Propagate the device-side exit() to the host process.  */
      GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
      exit (return_value & 0xff);
    }
}
   2306 
   2307 /* }}}  */
   2308 /* {{{ Load/Unload  */
   2309 
   2310 /* Initialize KERNEL from D and other parameters.  Return true on success. */
   2311 
   2312 static bool
   2313 init_basic_kernel_info (struct kernel_info *kernel,
   2314 			struct hsa_kernel_description *d,
   2315 			struct agent_info *agent,
   2316 			struct module_info *module)
   2317 {
   2318   kernel->agent = agent;
   2319   kernel->module = module;
   2320   kernel->name = d->name;
   2321   kernel->description = d;
   2322   if (pthread_mutex_init (&kernel->init_mutex, NULL))
   2323     {
   2324       GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
   2325       return false;
   2326     }
   2327   return true;
   2328 }
   2329 
   2330 /* Find the load_offset for MODULE, save to *LOAD_OFFSET, and return true.  If
   2331    not found, return false.  */
   2332 
   2333 static bool
   2334 find_load_offset (Elf64_Addr *load_offset, struct agent_info *agent,
   2335 		  struct module_info *module, Elf64_Ehdr *image,
   2336 		  Elf64_Shdr *sections)
   2337 {
   2338   bool res = false;
   2339 
   2340   hsa_status_t status;
   2341 
   2342   hsa_executable_symbol_t symbol;
   2343   if (!find_executable_symbol (agent->executable, &symbol))
   2344     return false;
   2345 
   2346   status = hsa_fns.hsa_executable_symbol_get_info_fn
   2347     (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, load_offset);
   2348   if (status != HSA_STATUS_SUCCESS)
   2349     {
   2350       hsa_error ("Could not extract symbol address", status);
   2351       return false;
   2352     }
   2353 
   2354   char *symbol_name = get_executable_symbol_name (symbol);
   2355   if (symbol_name == NULL)
   2356     return false;
   2357 
   2358   /* Find the kernel function in ELF, and calculate actual load offset.  */
   2359   for (int i = 0; i < image->e_shnum; i++)
   2360     if (sections[i].sh_type == SHT_SYMTAB)
   2361       {
   2362 	Elf64_Shdr *strtab = &sections[sections[i].sh_link];
   2363 	char *strings = (char *)image + strtab->sh_offset;
   2364 
   2365 	for (size_t offset = 0;
   2366 	     offset < sections[i].sh_size;
   2367 	     offset += sections[i].sh_entsize)
   2368 	  {
   2369 	    Elf64_Sym *sym = (Elf64_Sym*)((char*)image
   2370 					  + sections[i].sh_offset
   2371 					  + offset);
   2372 	    if (strcmp (symbol_name, strings + sym->st_name) == 0)
   2373 	      {
   2374 		*load_offset -= sym->st_value;
   2375 		res = true;
   2376 		break;
   2377 	      }
   2378 	  }
   2379       }
   2380 
   2381   free (symbol_name);
   2382   return res;
   2383 }
   2384 
   2385 /* Check that the GCN ISA of the given image matches the ISA of the agent. */
   2386 
   2387 static bool
   2388 isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
   2389 {
   2390   int isa_field = elf_gcn_isa_field (image);
   2391   const char* isa_s = isa_hsa_name (isa_field);
   2392   if (!isa_s)
   2393     {
   2394       hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
   2395       return false;
   2396     }
   2397 
   2398   if (isa_field != agent->device_isa)
   2399     {
   2400       char msg[120];
   2401       const char *agent_isa_s = isa_hsa_name (agent->device_isa);
   2402       const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
   2403       assert (agent_isa_s);
   2404       assert (agent_isa_gcc_s);
   2405 
   2406       snprintf (msg, sizeof msg,
   2407 		"GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
   2408 		"Try to recompile with '-foffload=-march=%s'.\n",
   2409 		isa_s, agent_isa_s, agent_isa_gcc_s);
   2410 
   2411       hsa_error (msg, HSA_STATUS_ERROR);
   2412       return false;
   2413     }
   2414 
   2415   return true;
   2416 }
   2417 
   2418 /* Create and finalize the program consisting of all loaded modules.  */
   2419 
   2420 static bool
   2421 create_and_finalize_hsa_program (struct agent_info *agent)
   2422 {
   2423   hsa_status_t status;
   2424   int reloc_count = 0;
   2425   bool res = true;
   2426   if (pthread_mutex_lock (&agent->prog_mutex))
   2427     {
   2428       GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
   2429       return false;
   2430     }
   2431   if (agent->prog_finalized)
   2432     goto final;
   2433 
   2434   status
   2435     = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
   2436 					HSA_EXECUTABLE_STATE_UNFROZEN,
   2437 					"", &agent->executable);
   2438   if (status != HSA_STATUS_SUCCESS)
   2439     {
   2440       hsa_error ("Could not create GCN executable", status);
   2441       goto fail;
   2442     }
   2443 
   2444   /* Load any GCN modules.  */
   2445   struct module_info *module = agent->module;
   2446   if (module)
   2447     {
   2448       Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
   2449 
   2450       if (!isa_matches_agent (agent, image))
   2451 	goto fail;
   2452 
   2453       /* Hide relocations from the HSA runtime loader.
   2454 	 Keep a copy of the unmodified section headers to use later.  */
   2455       Elf64_Shdr *image_sections = (Elf64_Shdr *)((char *)image
   2456 						  + image->e_shoff);
   2457       for (int i = image->e_shnum - 1; i >= 0; i--)
   2458 	{
   2459 	  if (image_sections[i].sh_type == SHT_RELA
   2460 	      || image_sections[i].sh_type == SHT_REL)
   2461 	    /* Change section type to something harmless.  */
   2462 	    image_sections[i].sh_type |= 0x80;
   2463 	}
   2464 
   2465       hsa_code_object_t co = { 0 };
   2466       status = hsa_fns.hsa_code_object_deserialize_fn
   2467 	(module->image_desc->gcn_image->image,
   2468 	 module->image_desc->gcn_image->size,
   2469 	 NULL, &co);
   2470       if (status != HSA_STATUS_SUCCESS)
   2471 	{
   2472 	  hsa_error ("Could not deserialize GCN code object", status);
   2473 	  goto fail;
   2474 	}
   2475 
   2476       status = hsa_fns.hsa_executable_load_code_object_fn
   2477 	(agent->executable, agent->id, co, "");
   2478       if (status != HSA_STATUS_SUCCESS)
   2479 	{
   2480 	  hsa_error ("Could not load GCN code object", status);
   2481 	  goto fail;
   2482 	}
   2483 
   2484       if (!module->heap)
   2485 	{
   2486 	  status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
   2487 						   gcn_kernel_heap_size,
   2488 						   (void**)&module->heap);
   2489 	  if (status != HSA_STATUS_SUCCESS)
   2490 	    {
   2491 	      hsa_error ("Could not allocate memory for GCN heap", status);
   2492 	      goto fail;
   2493 	    }
   2494 
   2495 	  status = hsa_fns.hsa_memory_assign_agent_fn
   2496 			(module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
   2497 	  if (status != HSA_STATUS_SUCCESS)
   2498 	    {
   2499 	      hsa_error ("Could not assign GCN heap memory to device", status);
   2500 	      goto fail;
   2501 	    }
   2502 
   2503 	  hsa_fns.hsa_memory_copy_fn (&module->heap->size,
   2504 				      &gcn_kernel_heap_size,
   2505 				      sizeof (gcn_kernel_heap_size));
   2506 	}
   2507 
   2508     }
   2509 
   2510   if (debug)
   2511     dump_executable_symbols (agent->executable);
   2512 
   2513   status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
   2514   if (status != HSA_STATUS_SUCCESS)
   2515     {
   2516       hsa_error ("Could not freeze the GCN executable", status);
   2517       goto fail;
   2518     }
   2519 
   2520   if (agent->module)
   2521     {
   2522       struct module_info *module = agent->module;
   2523       Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
   2524       Elf64_Shdr *sections = (Elf64_Shdr *)((char *)image + image->e_shoff);
   2525 
   2526       Elf64_Addr load_offset;
   2527       if (!find_load_offset (&load_offset, agent, module, image, sections))
   2528 	goto fail;
   2529 
   2530       /* Record the physical load address range.
   2531 	 We need this for data copies later.  */
   2532       Elf64_Phdr *segments = (Elf64_Phdr *)((char*)image + image->e_phoff);
   2533       Elf64_Addr low = ~0, high = 0;
   2534       for (int i = 0; i < image->e_phnum; i++)
   2535 	if (segments[i].p_memsz > 0)
   2536 	  {
   2537 	    if (segments[i].p_paddr < low)
   2538 	      low = segments[i].p_paddr;
   2539 	    if (segments[i].p_paddr > high)
   2540 	      high = segments[i].p_paddr + segments[i].p_memsz - 1;
   2541 	  }
   2542       module->phys_address_start = low + load_offset;
   2543       module->phys_address_end = high + load_offset;
   2544 
   2545       // Find dynamic symbol table
   2546       Elf64_Shdr *dynsym = NULL;
   2547       for (int i = 0; i < image->e_shnum; i++)
   2548 	if (sections[i].sh_type == SHT_DYNSYM)
   2549 	  {
   2550 	    dynsym = &sections[i];
   2551 	    break;
   2552 	  }
   2553 
   2554       /* Fix up relocations.  */
   2555       for (int i = 0; i < image->e_shnum; i++)
   2556 	{
   2557 	  if (sections[i].sh_type == (SHT_RELA | 0x80))
   2558 	    for (size_t offset = 0;
   2559 		 offset < sections[i].sh_size;
   2560 		 offset += sections[i].sh_entsize)
   2561 	      {
   2562 		Elf64_Rela *reloc = (Elf64_Rela*)((char*)image
   2563 						  + sections[i].sh_offset
   2564 						  + offset);
   2565 		Elf64_Sym *sym =
   2566 		  (dynsym
   2567 		   ? (Elf64_Sym*)((char*)image
   2568 				  + dynsym->sh_offset
   2569 				  + (dynsym->sh_entsize
   2570 				     * ELF64_R_SYM (reloc->r_info)))
   2571 		   : NULL);
   2572 
   2573 		int64_t S = (sym ? sym->st_value : 0);
   2574 		int64_t P = reloc->r_offset + load_offset;
   2575 		int64_t A = reloc->r_addend;
   2576 		int64_t B = load_offset;
   2577 		int64_t V, size;
   2578 		switch (ELF64_R_TYPE (reloc->r_info))
   2579 		  {
   2580 		  case R_AMDGPU_ABS32_LO:
   2581 		    V = (S + A) & 0xFFFFFFFF;
   2582 		    size = 4;
   2583 		    break;
   2584 		  case R_AMDGPU_ABS32_HI:
   2585 		    V = (S + A) >> 32;
   2586 		    size = 4;
   2587 		    break;
   2588 		  case R_AMDGPU_ABS64:
   2589 		    V = S + A;
   2590 		    size = 8;
   2591 		    break;
   2592 		  case R_AMDGPU_REL32:
   2593 		    V = S + A - P;
   2594 		    size = 4;
   2595 		    break;
   2596 		  case R_AMDGPU_REL64:
   2597 		    /* FIXME
	       LLD seems to emit REL64 where the assembler has
   2599 		       ABS64.  This is clearly wrong because it's not what the
   2600 		       compiler is expecting.  Let's assume, for now, that
   2601 		       it's a bug.  In any case, GCN kernels are always self
   2602 		       contained and therefore relative relocations will have
   2603 		       been resolved already, so this should be a safe
   2604 		       workaround.  */
   2605 		    V = S + A/* - P*/;
   2606 		    size = 8;
   2607 		    break;
   2608 		  case R_AMDGPU_ABS32:
   2609 		    V = S + A;
   2610 		    size = 4;
   2611 		    break;
   2612 		    /* TODO R_AMDGPU_GOTPCREL */
   2613 		    /* TODO R_AMDGPU_GOTPCREL32_LO */
   2614 		    /* TODO R_AMDGPU_GOTPCREL32_HI */
   2615 		  case R_AMDGPU_REL32_LO:
   2616 		    V = (S + A - P) & 0xFFFFFFFF;
   2617 		    size = 4;
   2618 		    break;
   2619 		  case R_AMDGPU_REL32_HI:
   2620 		    V = (S + A - P) >> 32;
   2621 		    size = 4;
   2622 		    break;
   2623 		  case R_AMDGPU_RELATIVE64:
   2624 		    V = B + A;
   2625 		    size = 8;
   2626 		    break;
   2627 		  default:
   2628 		    fprintf (stderr, "Error: unsupported relocation type.\n");
   2629 		    exit (1);
   2630 		  }
   2631 		status = hsa_fns.hsa_memory_copy_fn ((void*)P, &V, size);
   2632 		if (status != HSA_STATUS_SUCCESS)
   2633 		  {
   2634 		    hsa_error ("Failed to fix up relocation", status);
   2635 		    goto fail;
   2636 		  }
   2637 		reloc_count++;
   2638 	      }
   2639 	}
   2640     }
   2641 
   2642   GCN_DEBUG ("Loaded GCN kernels to device %d (%d relocations)\n",
   2643 	     agent->device_id, reloc_count);
   2644 
   2645 final:
   2646   agent->prog_finalized = true;
   2647 
   2648   if (pthread_mutex_unlock (&agent->prog_mutex))
   2649     {
   2650       GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
   2651       res = false;
   2652     }
   2653 
   2654   return res;
   2655 
   2656 fail:
   2657   res = false;
   2658   goto final;
   2659 }
   2660 
   2661 /* Free the HSA program in agent and everything associated with it and set
   2662    agent->prog_finalized and the initialized flags of all kernels to false.
   2663    Return TRUE on success.  */
   2664 
static bool
destroy_hsa_program (struct agent_info *agent)
{
  /* Nothing to do if no program has been finalized for this agent.  */
  if (!agent->prog_finalized)
    return true;

  hsa_status_t status;

  GCN_DEBUG ("Destroying the current GCN program.\n");

  status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Could not destroy GCN executable", status);

  if (agent->module)
    {
      int i;
      /* The kernels' cached state refers to the destroyed executable, so
	 force each kernel to be re-initialized before its next launch.  */
      for (i = 0; i < agent->module->kernel_count; i++)
	agent->module->kernels[i].initialized = false;

      /* Release the device heap allocated for this module, if any.  */
      if (agent->module->heap)
	{
	  hsa_fns.hsa_memory_free_fn (agent->module->heap);
	  agent->module->heap = NULL;
	}
    }
  agent->prog_finalized = false;
  return true;
}
   2694 
   2695 /* Deinitialize all information associated with MODULE and kernels within
   2696    it.  Return TRUE on success.  */
   2697 
   2698 static bool
   2699 destroy_module (struct module_info *module, bool locked)
   2700 {
   2701   /* Run destructors before destroying module.  */
   2702   struct GOMP_kernel_launch_attributes kla =
   2703     { 3,
   2704       /* Grid size.  */
   2705       { 1, 64, 1 },
   2706       /* Work-group size.  */
   2707       { 1, 64, 1 }
   2708     };
   2709 
   2710   if (module->fini_array_func)
   2711     {
   2712       init_kernel (module->fini_array_func);
   2713       run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
   2714     }
   2715   module->constructors_run_p = false;
   2716 
   2717   int i;
   2718   for (i = 0; i < module->kernel_count; i++)
   2719     if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
   2720       {
   2721 	GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
   2722 			   "mutex");
   2723 	return false;
   2724       }
   2725 
   2726   return true;
   2727 }
   2728 
   2729 /* }}}  */
   2730 /* {{{ Async  */
   2731 
/* Execute a single queue entry (kernel launch, callback, async wait, or
   placeholder) on the calling thread.  */
   2733 
static void
execute_queue_entry (struct goacc_asyncqueue *aq, int index)
{
  /* Run the entry at INDEX on the calling thread; normally that is the
     queue's drain thread, or the user thread in synchronous debug mode.  */
  struct queue_entry *entry = &aq->queue[index];

  switch (entry->type)
    {
    case KERNEL_LAUNCH:
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
		   aq->agent->device_id, aq->id, index);
      run_kernel (entry->u.launch.kernel,
		  entry->u.launch.vars,
		  &entry->u.launch.kla, aq, false);
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
		   aq->agent->device_id, aq->id, index);
      break;

    case CALLBACK:
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
		   aq->agent->device_id, aq->id, index);
      entry->u.callback.fn (entry->u.callback.data);
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
		   aq->agent->device_id, aq->id, index);
      break;

    case ASYNC_WAIT:
      {
	/* FIXME: is it safe to access a placeholder that may already have
	   been executed?  */
        struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;

	if (DEBUG_QUEUES)
          GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
		     aq->agent->device_id, aq->id, index);

	/* Block until some other queue's drain thread reaches the
	   placeholder and sets its "executed" flag.  */
	pthread_mutex_lock (&placeholderp->mutex);

	while (!placeholderp->executed)
          pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);

	pthread_mutex_unlock (&placeholderp->mutex);

	/* The placeholder has served its purpose; release its
	   synchronization objects.  */
	if (pthread_cond_destroy (&placeholderp->cond))
	  GOMP_PLUGIN_error ("Failed to destroy serialization cond");

	if (pthread_mutex_destroy (&placeholderp->mutex))
	  GOMP_PLUGIN_error ("Failed to destroy serialization mutex");

	if (DEBUG_QUEUES)
          GCN_DEBUG ("Async thread %d:%d: Executing async wait "
		     "entry (%d) done\n", aq->agent->device_id, aq->id, index);
      }
      break;

    case ASYNC_PLACEHOLDER:
      /* Mark the placeholder reached and wake any thread waiting on it
	 in the ASYNC_WAIT case above.  */
      pthread_mutex_lock (&entry->u.placeholder.mutex);
      entry->u.placeholder.executed = 1;
      pthread_cond_signal (&entry->u.placeholder.cond);
      pthread_mutex_unlock (&entry->u.placeholder.mutex);
      break;

    default:
      GOMP_PLUGIN_fatal ("Unknown queue element");
    }
}
   2803 
   2804 /* This function is run as a thread to service an async queue in the
   2805    background.  It runs continuously until the stop flag is set.  */
   2806 
static void *
drain_queue (void *thread_arg)
{
  struct goacc_asyncqueue *aq = thread_arg;

  /* In the synchronous debug mode the queue is drained by wait_queue
     instead, so this service thread exits immediately.  */
  if (DRAIN_QUEUE_SYNCHRONOUS_P)
    {
      aq->drain_queue_stop = 2;
      return NULL;
    }

  pthread_mutex_lock (&aq->mutex);

  while (true)
    {
      /* drain_queue_stop is set to 1 by finalize_async_thread to request
	 termination.  */
      if (aq->drain_queue_stop)
	break;

      if (aq->queue_n > 0)
	{
	  /* Drop the lock while the entry runs: entries can take a long
	     time and producers must still be able to push new entries.  */
	  pthread_mutex_unlock (&aq->mutex);
	  execute_queue_entry (aq, aq->queue_first);

	  pthread_mutex_lock (&aq->mutex);
	  aq->queue_first = ((aq->queue_first + 1)
			     % ASYNC_QUEUE_SIZE);
	  aq->queue_n--;

	  if (DEBUG_THREAD_SIGNAL)
	    GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
		       aq->agent->device_id, aq->id);
	  /* Wake any thread waiting for the queue to shrink or empty.  */
	  pthread_cond_broadcast (&aq->queue_cond_out);
	  pthread_mutex_unlock (&aq->mutex);

	  if (DEBUG_QUEUES)
	    GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
		       aq->id);
	  pthread_mutex_lock (&aq->mutex);
	}
      else
	{
	  /* Queue is empty: sleep until a producer signals queue_cond_in.  */
	  if (DEBUG_THREAD_SLEEP)
	    GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
		       aq->agent->device_id, aq->id);
	  pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
	  if (DEBUG_THREAD_SLEEP)
	    GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
		       aq->agent->device_id, aq->id);
	}
    }

  /* Acknowledge termination; finalize_async_thread waits for this.  */
  aq->drain_queue_stop = 2;
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_broadcast (&aq->queue_cond_out);
  pthread_mutex_unlock (&aq->mutex);

  GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
  return NULL;
}
   2868 
   2869 /* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
   2870    is not usually the case.  This is just a debug tool.  */
   2871 
   2872 static void
   2873 drain_queue_synchronous (struct goacc_asyncqueue *aq)
   2874 {
   2875   pthread_mutex_lock (&aq->mutex);
   2876 
   2877   while (aq->queue_n > 0)
   2878     {
   2879       execute_queue_entry (aq, aq->queue_first);
   2880 
   2881       aq->queue_first = ((aq->queue_first + 1)
   2882 			 % ASYNC_QUEUE_SIZE);
   2883       aq->queue_n--;
   2884     }
   2885 
   2886   pthread_mutex_unlock (&aq->mutex);
   2887 }
   2888 
   2889 /* Block the current thread until an async queue is writable.  The aq->mutex
   2890    lock should be held on entry, and remains locked on exit.  */
   2891 
   2892 static void
   2893 wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
   2894 {
   2895   if (aq->queue_n == ASYNC_QUEUE_SIZE)
   2896     {
   2897       /* Queue is full.  Wait for it to not be full.  */
   2898       while (aq->queue_n == ASYNC_QUEUE_SIZE)
   2899 	pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
   2900     }
   2901 }
   2902 
   2903 /* Request an asynchronous kernel launch on the specified queue.  This
   2904    may block if the queue is full, but returns without waiting for the
   2905    kernel to run.  */
   2906 
static void
queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
		   void *vars, struct GOMP_kernel_launch_attributes *kla)
{
  /* Kernels may only be pushed onto a queue belonging to their own agent.  */
  assert (aq->agent == kernel->agent);

  pthread_mutex_lock (&aq->mutex);

  /* May block; aq->mutex remains held across the wait.  */
  wait_for_queue_nonfull (aq);

  /* Next free slot in the circular buffer.  */
  int queue_last = ((aq->queue_first + aq->queue_n)
		    % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
	       aq->id, queue_last);

  aq->queue[queue_last].type = KERNEL_LAUNCH;
  aq->queue[queue_last].u.launch.kernel = kernel;
  aq->queue[queue_last].u.launch.vars = vars;
  /* KLA is copied by value; the caller's copy may be stack-allocated.  */
  aq->queue[queue_last].u.launch.kla = *kla;

  aq->queue_n++;

  /* Wake the drain thread in case it is sleeping on an empty queue.  */
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);
}
   2937 
   2938 /* Request an asynchronous callback on the specified queue.  The callback
   2939    function will be called, with the given opaque data, from the appropriate
   2940    async thread, when all previous items on that queue are complete.  */
   2941 
static void
queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
		     void *data)
{
  pthread_mutex_lock (&aq->mutex);

  /* May block; aq->mutex remains held across the wait.  */
  wait_for_queue_nonfull (aq);

  /* Next free slot in the circular buffer.  */
  int queue_last = ((aq->queue_first + aq->queue_n)
		    % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
	       aq->id, queue_last);

  aq->queue[queue_last].type = CALLBACK;
  aq->queue[queue_last].u.callback.fn = fn;
  aq->queue[queue_last].u.callback.data = data;

  aq->queue_n++;

  /* Wake the drain thread in case it is sleeping on an empty queue.  */
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);
}
   2969 
   2970 /* Request that a given async thread wait for another thread (unspecified) to
   2971    reach the given placeholder.  The wait will occur when all previous entries
   2972    on the queue are complete.  A placeholder is effectively a kind of signal
   2973    which simply sets a flag when encountered in a queue.  */
   2974 
static void
queue_push_asyncwait (struct goacc_asyncqueue *aq,
		      struct placeholder *placeholderp)
{
  pthread_mutex_lock (&aq->mutex);

  /* May block; aq->mutex remains held across the wait.  */
  wait_for_queue_nonfull (aq);

  /* Next free slot in the circular buffer.  */
  int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
	       aq->id, queue_last);

  /* PLACEHOLDERP lives inside another queue's entry; the drain thread
     will block on it in execute_queue_entry's ASYNC_WAIT case.  */
  aq->queue[queue_last].type = ASYNC_WAIT;
  aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;

  aq->queue_n++;

  /* Wake the drain thread in case it is sleeping on an empty queue.  */
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);
}
   3000 
   3001 /* Add a placeholder into an async queue.  When the async thread reaches the
   3002    placeholder it will set the "executed" flag to true and continue.
   3003    Another thread may be waiting on this thread reaching the placeholder.  */
   3004 
   3005 static struct placeholder *
   3006 queue_push_placeholder (struct goacc_asyncqueue *aq)
   3007 {
   3008   struct placeholder *placeholderp;
   3009 
   3010   pthread_mutex_lock (&aq->mutex);
   3011 
   3012   wait_for_queue_nonfull (aq);
   3013 
   3014   int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
   3015   if (DEBUG_QUEUES)
   3016     GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
   3017 	       aq->id, queue_last);
   3018 
   3019   aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
   3020   placeholderp = &aq->queue[queue_last].u.placeholder;
   3021 
   3022   if (pthread_mutex_init (&placeholderp->mutex, NULL))
   3023     {
   3024       pthread_mutex_unlock (&aq->mutex);
   3025       GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
   3026     }
   3027 
   3028   if (pthread_cond_init (&placeholderp->cond, NULL))
   3029     {
   3030       pthread_mutex_unlock (&aq->mutex);
   3031       GOMP_PLUGIN_error ("Failed to initialize serialization cond");
   3032     }
   3033 
   3034   placeholderp->executed = 0;
   3035 
   3036   aq->queue_n++;
   3037 
   3038   if (DEBUG_THREAD_SIGNAL)
   3039     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
   3040 	       aq->agent->device_id, aq->id);
   3041   pthread_cond_signal (&aq->queue_cond_in);
   3042 
   3043   pthread_mutex_unlock (&aq->mutex);
   3044 
   3045   return placeholderp;
   3046 }
   3047 
   3048 /* Signal an asynchronous thread to terminate, and wait for it to do so.  */
   3049 
static void
finalize_async_thread (struct goacc_asyncqueue *aq)
{
  pthread_mutex_lock (&aq->mutex);
  /* drain_queue_stop == 2 means the drain thread already terminated.  */
  if (aq->drain_queue_stop == 2)
    {
      pthread_mutex_unlock (&aq->mutex);
      return;
    }

  /* Request termination ...  */
  aq->drain_queue_stop = 1;

  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  /* ... and wait for the drain thread to acknowledge by setting
     drain_queue_stop to 2 and broadcasting queue_cond_out.  */
  while (aq->drain_queue_stop != 2)
    {
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
		   " to sleep\n", aq->agent->device_id, aq->id);
      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("Waiting, woke up thread %d:%d.  Rechecking\n",
		   aq->agent->device_id, aq->id);
    }

  GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
	     aq->id);
  pthread_mutex_unlock (&aq->mutex);

  /* Reap the thread so its resources are released.  */
  int err = pthread_join (aq->thread_drain_queue, NULL);
  if (err != 0)
    GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
		       aq->agent->device_id, aq->id, strerror (err));
  GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
}
   3088 
   3089 /* Set up an async queue for OpenMP.  There will be only one.  The
   3090    implementation simply uses an OpenACC async queue.
   3091    FIXME: is this thread-safe if two threads call this function?  */
   3092 
static void
maybe_init_omp_async (struct agent_info *agent)
{
  /* Lazily create the agent's single OpenMP async queue on first use.
     NOTE(review): check-then-set is not obviously safe if two threads race
     here (see the FIXME above) — confirm callers serialize this.  */
  if (!agent->omp_async_queue)
    agent->omp_async_queue
      = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
}
   3100 
   3101 /* A wrapper that works around an issue in the HSA runtime with host-to-device
   3102    copies from read-only pages.  */
   3103 
   3104 static void
   3105 hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
   3106 {
   3107   hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
   3108 
   3109   if (status == HSA_STATUS_SUCCESS)
   3110     return;
   3111 
   3112   /* It appears that the copy fails if the source data is in a read-only page.
   3113      We can't detect that easily, so try copying the data to a temporary buffer
   3114      and doing the copy again if we got an error above.  */
   3115 
   3116   GCN_WARNING ("Read-only data transfer bug workaround triggered for "
   3117 	       "[%p:+%d]\n", (void *) src, (int) len);
   3118 
   3119   void *src_copy = malloc (len);
   3120   memcpy (src_copy, src, len);
   3121   status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
   3122   free (src_copy);
   3123   if (status != HSA_STATUS_SUCCESS)
   3124     GOMP_PLUGIN_error ("memory copy failed");
   3125 }
   3126 
   3127 /* Copy data to or from a device.  This is intended for use as an async
   3128    callback event.  */
   3129 
   3130 static void
   3131 copy_data (void *data_)
   3132 {
   3133   struct copy_data *data = (struct copy_data *)data_;
   3134   GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
   3135 	     data->aq->agent->device_id, data->aq->id, data->len, data->src,
   3136 	     data->dst);
   3137   hsa_memory_copy_wrapper (data->dst, data->src, data->len);
   3138   if (data->free_src)
   3139     free ((void *) data->src);
   3140   free (data);
   3141 }
   3142 
   3143 /* Free device data.  This is intended for use as an async callback event.  */
   3144 
static void
gomp_offload_free (void *ptr)
{
  /* void(void*) wrapper around GOMP_OFFLOAD_free so it can be queued as an
     async callback.  Passes device ordinal 0 — presumably the ordinal is
     unused by GOMP_OFFLOAD_free; TODO confirm.  */
  GCN_DEBUG ("Async thread ?:?: Freeing %p\n", ptr);
  GOMP_OFFLOAD_free (0, ptr);
}
   3151 
   3152 /* Request an asynchronous data copy, to or from a device, on a given queue.
   3153    The event will be registered as a callback.  If FREE_SRC is true
   3154    then the source data will be freed following the copy.  */
   3155 
   3156 static void
   3157 queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
   3158 		 size_t len, bool free_src)
   3159 {
   3160   if (DEBUG_QUEUES)
   3161     GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
   3162 	       aq->agent->device_id, aq->id, len, src, dst);
   3163   struct copy_data *data
   3164     = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
   3165   data->dst = dst;
   3166   data->src = src;
   3167   data->len = len;
   3168   data->free_src = free_src;
   3169   data->aq = aq;
   3170   queue_push_callback (aq, copy_data, data);
   3171 }
   3172 
   3173 /* Return true if the given queue is currently empty.  */
   3174 
   3175 static int
   3176 queue_empty (struct goacc_asyncqueue *aq)
   3177 {
   3178   pthread_mutex_lock (&aq->mutex);
   3179   int res = aq->queue_n == 0 ? 1 : 0;
   3180   pthread_mutex_unlock (&aq->mutex);
   3181 
   3182   return res;
   3183 }
   3184 
   3185 /* Wait for a given queue to become empty.  This implements an OpenACC wait
   3186    directive.  */
   3187 
static void
wait_queue (struct goacc_asyncqueue *aq)
{
  /* Debug mode: execute the queued entries on this thread instead of
     waiting for the drain thread.  */
  if (DRAIN_QUEUE_SYNCHRONOUS_P)
    {
      drain_queue_synchronous (aq);
      return;
    }

  pthread_mutex_lock (&aq->mutex);

  /* The drain thread broadcasts queue_cond_out each time it retires an
     entry; sleep until every entry has been consumed.  */
  while (aq->queue_n > 0)
    {
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
		   aq->agent->device_id, aq->id);
      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("thread %d:%d woke up.  Rechecking\n", aq->agent->device_id,
		   aq->id);
    }

  pthread_mutex_unlock (&aq->mutex);
  GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
}
   3213 
   3214 /* }}}  */
   3215 /* {{{ OpenACC support  */
   3216 
   3217 /* Execute an OpenACC kernel, synchronously or asynchronously.  */
   3218 
   3219 static void
   3220 gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
   3221 	  void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
   3222 	  struct goacc_asyncqueue *aq)
   3223 {
   3224   if (!GOMP_OFFLOAD_can_run (kernel))
   3225     GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
   3226 
   3227   /* If we get here then this must be an OpenACC kernel.  */
   3228   kernel->kind = KIND_OPENACC;
   3229 
   3230   /* devaddrs must be double-indirect on the target.  */
   3231   void **ind_da = alloc_by_agent (kernel->agent, sizeof (void*) * mapnum);
   3232   for (size_t i = 0; i < mapnum; i++)
   3233     hsa_fns.hsa_memory_copy_fn (&ind_da[i],
   3234 				devaddrs[i] ? &devaddrs[i] : &hostaddrs[i],
   3235 				sizeof (void *));
   3236 
   3237   struct hsa_kernel_description *hsa_kernel_desc = NULL;
   3238   for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
   3239     {
   3240       struct hsa_kernel_description *d
   3241 	= &kernel->module->image_desc->kernel_infos[i];
   3242       if (d->name == kernel->name)
   3243 	{
   3244 	  hsa_kernel_desc = d;
   3245 	  break;
   3246 	}
   3247     }
   3248 
   3249   /* We may have statically-determined dimensions in
   3250      hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
   3251      invocation at runtime in dims[].  We allow static dimensions to take
   3252      priority over dynamic dimensions when present (non-zero).  */
   3253   if (hsa_kernel_desc->oacc_dims[0] > 0)
   3254     dims[0] = hsa_kernel_desc->oacc_dims[0];
   3255   if (hsa_kernel_desc->oacc_dims[1] > 0)
   3256     dims[1] = hsa_kernel_desc->oacc_dims[1];
   3257   if (hsa_kernel_desc->oacc_dims[2] > 0)
   3258     dims[2] = hsa_kernel_desc->oacc_dims[2];
   3259 
   3260   /* If any of the OpenACC dimensions remain 0 then we get to pick a number.
   3261      There isn't really a correct answer for this without a clue about the
   3262      problem size, so let's do a reasonable number of single-worker gangs.
   3263      64 gangs matches a typical Fiji device.  */
   3264 
   3265   /* NOTE: Until support for middle-end worker partitioning is merged, use 1
   3266      for the default number of workers.  */
   3267   if (dims[0] == 0) dims[0] = get_cu_count (kernel->agent); /* Gangs.  */
   3268   if (dims[1] == 0) dims[1] = 1;  /* Workers.  */
   3269 
   3270   /* The incoming dimensions are expressed in terms of gangs, workers, and
   3271      vectors.  The HSA dimensions are expressed in terms of "work-items",
   3272      which means multiples of vector lanes.
   3273 
   3274      The "grid size" specifies the size of the problem space, and the
   3275      "work-group size" specifies how much of that we want a single compute
   3276      unit to chew on at once.
   3277 
   3278      The three dimensions do not really correspond to hardware, but the
   3279      important thing is that the HSA runtime will launch as many
   3280      work-groups as it takes to process the entire grid, and each
   3281      work-group will contain as many wave-fronts as it takes to process
   3282      the work-items in that group.
   3283 
   3284      Essentially, as long as we set the Y dimension to 64 (the number of
   3285      vector lanes in hardware), and the Z group size to the maximum (16),
   3286      then we will get the gangs (X) and workers (Z) launched as we expect.
   3287 
   3288      The reason for the apparent reversal of vector and worker dimension
   3289      order is to do with the way the run-time distributes work-items across
   3290      v1 and v2.  */
   3291   struct GOMP_kernel_launch_attributes kla =
   3292     {3,
   3293      /* Grid size.  */
   3294      {dims[0], 64, dims[1]},
   3295      /* Work-group size.  */
   3296      {1,       64, 16}
   3297     };
   3298 
   3299   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
   3300   acc_prof_info *prof_info = thr->prof_info;
   3301   acc_event_info enqueue_launch_event_info;
   3302   acc_api_info *api_info = thr->api_info;
   3303   bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
   3304   if (profiling_dispatch_p)
   3305     {
   3306       prof_info->event_type = acc_ev_enqueue_launch_start;
   3307 
   3308       enqueue_launch_event_info.launch_event.event_type
   3309 	= prof_info->event_type;
   3310       enqueue_launch_event_info.launch_event.valid_bytes
   3311 	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
   3312       enqueue_launch_event_info.launch_event.parent_construct
   3313 	= acc_construct_parallel;
   3314       enqueue_launch_event_info.launch_event.implicit = 1;
   3315       enqueue_launch_event_info.launch_event.tool_info = NULL;
   3316       enqueue_launch_event_info.launch_event.kernel_name
   3317 	= (char *) kernel->name;
   3318       enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
   3319       enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
   3320       enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
   3321 
   3322       api_info->device_api = acc_device_api_other;
   3323 
   3324       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
   3325 	&enqueue_launch_event_info, api_info);
   3326     }
   3327 
   3328   if (!async)
   3329     {
   3330       run_kernel (kernel, ind_da, &kla, NULL, false);
   3331       gomp_offload_free (ind_da);
   3332     }
   3333   else
   3334     {
   3335       queue_push_launch (aq, kernel, ind_da, &kla);
   3336       if (DEBUG_QUEUES)
   3337 	GCN_DEBUG ("queue_push_callback %d:%d gomp_offload_free, %p\n",
   3338 		   aq->agent->device_id, aq->id, ind_da);
   3339       queue_push_callback (aq, gomp_offload_free, ind_da);
   3340     }
   3341 
   3342   if (profiling_dispatch_p)
   3343     {
   3344       prof_info->event_type = acc_ev_enqueue_launch_end;
   3345       enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
   3346       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
   3347 					    &enqueue_launch_event_info,
   3348 					    api_info);
   3349     }
   3350 }
   3351 
   3352 /* }}}  */
   3353 /* {{{ Generic Plugin API  */
   3354 
   3355 /* Return the name of the accelerator, which is "gcn".  */
   3356 
const char *
GOMP_OFFLOAD_get_name (void)
{
  /* The plugin/device name by which libgomp identifies this target.  */
  static const char plugin_name[] = "gcn";
  return plugin_name;
}
   3362 
   3363 /* Return the specific capabilities the HSA accelerator have.  */
   3364 
unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  /* Advertise both OpenMP 4.0 and OpenACC 2.0 offloading support.  */
  /* FIXME: Enable shared memory for APU, but not discrete GPU.  */
  return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
	    | GOMP_OFFLOAD_CAP_OPENACC_200;
}
   3372 
   3373 /* Identify as GCN accelerator.  */
   3374 
int
GOMP_OFFLOAD_get_type (void)
{
  /* Target-type constant shared with the compiler driver and libgomp.  */
  return OFFLOAD_TARGET_TYPE_GCN;
}
   3380 
   3381 /* Return the libgomp version number we're compatible with.  There is
   3382    no requirement for cross-version compatibility.  */
   3383 
unsigned
GOMP_OFFLOAD_version (void)
{
  /* The plugin must have been built against the same libgomp version that
     loads it; libgomp rejects a mismatch.  */
  return GOMP_VERSION;
}
   3389 
   3390 /* Return the number of GCN devices on the system.  */
   3391 
int
GOMP_OFFLOAD_get_num_devices (void)
{
  /* Report zero devices (rather than an error) when the HSA runtime
     cannot be initialized, e.g. because it is not installed.  */
  if (!init_hsa_context ())
    return 0;
  return hsa_context.agent_count;
}
   3399 
/* Initialize device (agent) number N so that it can be used for computation.
   Return TRUE on success.  */

bool
GOMP_OFFLOAD_init_device (int n)
{
  if (!init_hsa_context ())
    return false;
  /* NOTE(review): N is not checked for being negative; callers are
     presumably trusted to pass a valid device number — confirm.  */
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
      return false;
    }
  struct agent_info *agent = &hsa_context.agents[n];

  /* Initialization is idempotent: a second call is a cheap no-op.  */
  if (agent->initialized)
    return true;

  agent->device_id = n;

  /* Set up the agent's locks before anything that might use them.  */
  if (pthread_rwlock_init (&agent->module_rwlock, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
      return false;
    }
  if (pthread_mutex_init (&agent->prog_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
      return false;
    }
  if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
      return false;
    }
  if (pthread_mutex_init (&agent->team_arena_write_lock, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
      return false;
    }
  agent->async_queues = NULL;
  agent->omp_async_queue = NULL;
  agent->team_arena_list = NULL;

  /* Query the largest dispatch queue the agent supports; used below when
     creating the synchronous command queue.  */
  uint32_t queue_size;
  hsa_status_t status;
  status = hsa_fns.hsa_agent_get_info_fn (agent->id,
					  HSA_AGENT_INFO_QUEUE_MAX_SIZE,
					  &queue_size);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error requesting maximum queue size of the GCN agent",
		      status);

  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
					  &agent->name);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error querying the name of the agent", status);

  /* Map the agent name (e.g. "gfx900") to the plugin's ISA code; a
     negative result means an architecture this plugin does not know.  */
  agent->device_isa = isa_code (agent->name);
  if (agent->device_isa < 0)
    return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);

  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
					  &agent->vendor_name);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error querying the vendor name of the agent", status);

  /* Create the queue used for synchronous kernel launches and copies.  */
  status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
					HSA_QUEUE_TYPE_MULTI,
					hsa_queue_callback, NULL, UINT32_MAX,
					UINT32_MAX, &agent->sync_queue);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error creating command queue", status);

  /* Find a memory region suitable for kernel arguments.  The iterate
     callback stores a match and returns HSA_STATUS_INFO_BREAK, so that
     status is not an error here; the sentinel handle of -1 detects the
     "nothing found" case.  */
  agent->kernarg_region.handle = (uint64_t) -1;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
						 get_kernarg_memory_region,
						 &agent->kernarg_region);
  if (status != HSA_STATUS_SUCCESS
      && status != HSA_STATUS_INFO_BREAK)
    hsa_error ("Scanning memory regions failed", status);
  if (agent->kernarg_region.handle == (uint64_t) -1)
    {
      GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
			 "arguments");
      return false;
    }
  GCN_DEBUG ("Selected kernel arguments memory region:\n");
  dump_hsa_region (agent->kernarg_region, NULL);

  /* Likewise for the region that will hold general device data.  */
  agent->data_region.handle = (uint64_t) -1;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
						 get_data_memory_region,
						 &agent->data_region);
  if (status != HSA_STATUS_SUCCESS
      && status != HSA_STATUS_INFO_BREAK)
    hsa_error ("Scanning memory regions failed", status);
  if (agent->data_region.handle == (uint64_t) -1)
    {
      GOMP_PLUGIN_error ("Could not find suitable memory region for device "
			 "data");
      return false;
    }
  GCN_DEBUG ("Selected device data memory region:\n");
  dump_hsa_region (agent->data_region, NULL);

  GCN_DEBUG ("GCN agent %d initialized\n", n);

  agent->initialized = true;
  return true;
}
   3511 
/* Load GCN object-code module described by struct gcn_image_desc in
   TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
   If there are any constructors then run them.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
			 struct addr_pair **target_table)
{
  /* Refuse images produced by a mismatched mkoffload.  */
  if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
      return -1;
    }

  struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
  struct agent_info *agent;
  struct addr_pair *pair;
  struct module_info *module;
  struct kernel_info *kernel;
  int kernel_count = image_desc->kernel_count;
  unsigned var_count = image_desc->global_variable_count;

  agent = get_agent_info (ord);
  if (!agent)
    return -1;

  if (pthread_rwlock_wrlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
      return -1;
    }
  /* Re-loading replaces any previously finalized HSA program.
     NOTE(review): this error return (and the one in the loop below) leaves
     module_rwlock write-locked — verify whether that is intentional.  */
  if (agent->prog_finalized
      && !destroy_hsa_program (agent))
    return -1;

  GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
  GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
  /* The "- 2" accounts for the _init_array and _fini_array pseudo-kernels,
     which are not reported back to libgomp.  NOTE(review): this assumes
     every image contains both pseudo-kernels; an image lacking either would
     make this allocation too small — confirm against mkoffload's output.  */
  pair = GOMP_PLUGIN_malloc ((kernel_count + var_count - 2)
			     * sizeof (struct addr_pair));
  *target_table = pair;
  /* The kernel_info array is allocated as a flexible tail of the module.  */
  module = (struct module_info *)
    GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
				+ kernel_count * sizeof (struct kernel_info));
  module->image_desc = image_desc;
  module->kernel_count = kernel_count;
  module->heap = NULL;
  module->constructors_run_p = false;

  kernel = &module->kernels[0];

  /* Allocate memory for kernel dependencies.  */
  for (unsigned i = 0; i < kernel_count; i++)
    {
      struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
      if (!init_basic_kernel_info (kernel, d, agent, module))
	return -1;
      /* The constructor/destructor pseudo-kernels are remembered separately
	 instead of being entered into libgomp's address table.  */
      if (strcmp (d->name, "_init_array") == 0)
	module->init_array_func = kernel;
      else if (strcmp (d->name, "_fini_array") == 0)
        module->fini_array_func = kernel;
      else
	{
	  /* libgomp identifies each kernel by the address range of its
	     kernel_info record.  */
	  pair->start = (uintptr_t) kernel;
	  pair->end = (uintptr_t) (kernel + 1);
	  pair++;
	}
      kernel++;
    }

  agent->module = module;
  if (pthread_rwlock_unlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
      return -1;
    }

  if (!create_and_finalize_hsa_program (agent))
    return -1;

  /* Resolve every global variable to its device address and size, and
     append a pair for libgomp's host<->device mapping table.  */
  for (unsigned i = 0; i < var_count; i++)
    {
      struct global_var_info *v = &image_desc->global_variables[i];
      GCN_DEBUG ("Looking for variable %s\n", v->name);

      hsa_status_t status;
      hsa_executable_symbol_t var_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						     v->name, agent->id,
						     0, &var_symbol);

      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not find symbol for variable in the code object",
		   status);

      uint64_t var_addr;
      uint32_t var_size;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &var_addr);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable from its symbol", status);
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &var_size);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable size from its symbol", status);

      pair->start = var_addr;
      pair->end = var_addr + var_size;
      GCN_DEBUG ("Found variable %s at %p with size %u\n", v->name,
		 (void *)var_addr, var_size);
      pair++;
    }

  /* Ensure that constructors are run first.  */
  struct GOMP_kernel_launch_attributes kla =
    { 3,
      /* Grid size.  */
      { 1, 64, 1 },
      /* Work-group size.  */
      { 1, 64, 1 }
    };

  if (module->init_array_func)
    {
      init_kernel (module->init_array_func);
      run_kernel (module->init_array_func, NULL, &kla, NULL, false);
    }
  module->constructors_run_p = true;

  /* Don't report kernels that libgomp need not know about.  */
  if (module->init_array_func)
    kernel_count--;
  if (module->fini_array_func)
    kernel_count--;

  /* Number of addr_pair entries written to *TARGET_TABLE.  */
  return kernel_count + var_count;
}
   3650 
   3651 /* Unload GCN object-code module described by struct gcn_image_desc in
   3652    TARGET_DATA from agent number N.  Return TRUE on success.  */
   3653 
   3654 bool
   3655 GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
   3656 {
   3657   if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
   3658     {
   3659       GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
   3660 			 " (expected %u, received %u)",
   3661 			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
   3662       return false;
   3663     }
   3664 
   3665   struct agent_info *agent;
   3666   agent = get_agent_info (n);
   3667   if (!agent)
   3668     return false;
   3669 
   3670   if (pthread_rwlock_wrlock (&agent->module_rwlock))
   3671     {
   3672       GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
   3673       return false;
   3674     }
   3675 
   3676   if (!agent->module || agent->module->image_desc != target_data)
   3677     {
   3678       GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
   3679 			 "loaded before");
   3680       return false;
   3681     }
   3682 
   3683   if (!destroy_module (agent->module, true))
   3684     return false;
   3685   free (agent->module);
   3686   agent->module = NULL;
   3687   if (!destroy_hsa_program (agent))
   3688     return false;
   3689   if (pthread_rwlock_unlock (&agent->module_rwlock))
   3690     {
   3691       GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
   3692       return false;
   3693     }
   3694   return true;
   3695 }
   3696 
/* Deinitialize all information and status associated with agent number N.  We
   do not attempt any synchronization, assuming the user and libgomp will not
   attempt deinitialization of a device that is in any way being used at the
   same time.  Return TRUE on success.  */

bool
GOMP_OFFLOAD_fini_device (int n)
{
  struct agent_info *agent = get_agent_info (n);
  if (!agent)
    return false;

  /* Finalizing an agent that was never initialized is a no-op.  */
  if (!agent->initialized)
    return true;

  /* Tear down the implicit OpenMP async queue, if one was created.  */
  if (agent->omp_async_queue)
    {
      GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
      agent->omp_async_queue = NULL;
    }

  /* Release the loaded module, if any.  NOTE(review): the FALSE second
     argument differs from the TRUE passed in unload_image — confirm its
     meaning against destroy_module's definition.  */
  if (agent->module)
    {
      if (!destroy_module (agent->module, false))
	return false;
      free (agent->module);
      agent->module = NULL;
    }

  if (!destroy_team_arenas (agent))
    return false;

  if (!destroy_hsa_program (agent))
    return false;

  /* Destroy the synchronous command queue created in init_device.  */
  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error destroying command queue", status);

  /* Destroy the locks created in GOMP_OFFLOAD_init_device.  */
  if (pthread_mutex_destroy (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
      return false;
    }
  if (pthread_rwlock_destroy (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
      return false;
    }

  if (pthread_mutex_destroy (&agent->async_queues_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
      return false;
    }
  if (pthread_mutex_destroy (&agent->team_arena_write_lock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
      return false;
    }
  agent->initialized = false;
  return true;
}
   3760 
   3761 /* Return true if the HSA runtime can run function FN_PTR.  */
   3762 
   3763 bool
   3764 GOMP_OFFLOAD_can_run (void *fn_ptr)
   3765 {
   3766   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
   3767 
   3768   init_kernel (kernel);
   3769   if (kernel->initialization_failed)
   3770     goto failure;
   3771 
   3772   return true;
   3773 
   3774 failure:
   3775   if (suppress_host_fallback)
   3776     GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
   3777   GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
   3778   return false;
   3779 }
   3780 
/* Allocate memory on device N.  */

void *
GOMP_OFFLOAD_alloc (int n, size_t size)
{
  /* All the real work happens in alloc_by_agent.  */
  return alloc_by_agent (get_agent_info (n), size);
}
   3789 
/* Free memory from device N.  */

bool
GOMP_OFFLOAD_free (int device, void *ptr)
{
  GCN_DEBUG ("Freeing memory on device %d\n", device);

  hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not free device memory", status);
      return false;
    }

  /* If an OpenACC profiling callback is active on this thread, emit an
     acc_ev_free event describing the freed device pointer.  */
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_dispatch_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      acc_prof_info *prof_info = thr->prof_info;
      acc_event_info data_event_info;
      acc_api_info *api_info = thr->api_info;

      prof_info->event_type = acc_ev_free;

      data_event_info.data_event.event_type = prof_info->event_type;
      data_event_info.data_event.valid_bytes
	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
      data_event_info.data_event.parent_construct
	= acc_construct_parallel;
      data_event_info.data_event.implicit = 1;
      data_event_info.data_event.tool_info = NULL;
      data_event_info.data_event.var_name = NULL;
      /* The size of the freed block is not tracked here, hence 0 bytes and
	 no host pointer.  */
      data_event_info.data_event.bytes = 0;
      data_event_info.data_event.host_ptr = NULL;
      data_event_info.data_event.device_ptr = (void *) ptr;

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
					    api_info);
    }

  return true;
}
   3835 
   3836 /* Copy data from DEVICE to host.  */
   3837 
   3838 bool
   3839 GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
   3840 {
   3841   GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
   3842 	     src, dst);
   3843   hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
   3844   if (status != HSA_STATUS_SUCCESS)
   3845     GOMP_PLUGIN_error ("memory copy failed");
   3846   return true;
   3847 }
   3848 
   3849 /* Copy data from host to DEVICE.  */
   3850 
   3851 bool
   3852 GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
   3853 {
   3854   GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
   3855 	     device, dst);
   3856   hsa_memory_copy_wrapper (dst, src, n);
   3857   return true;
   3858 }
   3859 
   3860 /* Copy data within DEVICE.  Do the copy asynchronously, if appropriate.  */
   3861 
   3862 bool
   3863 GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
   3864 {
   3865   struct gcn_thread *thread_data = gcn_thread ();
   3866 
   3867   if (thread_data && !async_synchronous_p (thread_data->async))
   3868     {
   3869       struct agent_info *agent = get_agent_info (device);
   3870       maybe_init_omp_async (agent);
   3871       queue_push_copy (agent->omp_async_queue, dst, src, n, false);
   3872       return true;
   3873     }
   3874 
   3875   GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
   3876 	     device, src, device, dst);
   3877   hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
   3878   if (status != HSA_STATUS_SUCCESS)
   3879     GOMP_PLUGIN_error ("memory copy failed");
   3880   return true;
   3881 }
   3882 
   3883 /* }}}  */
   3884 /* {{{ OpenMP Plugin API  */
   3885 
   3886 /* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
   3887    in VARS as a parameter.  The kernel is identified by FN_PTR which must point
   3888    to a kernel_info structure, and must have previously been loaded to the
   3889    specified device.  */
   3890 
   3891 void
   3892 GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
   3893 {
   3894   struct agent_info *agent = get_agent_info (device);
   3895   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
   3896   struct GOMP_kernel_launch_attributes def;
   3897   struct GOMP_kernel_launch_attributes *kla;
   3898   assert (agent == kernel->agent);
   3899 
   3900   /* If we get here then the kernel must be OpenMP.  */
   3901   kernel->kind = KIND_OPENMP;
   3902 
   3903   if (!parse_target_attributes (args, &def, &kla, agent))
   3904     {
   3905       GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
   3906       return;
   3907     }
   3908   run_kernel (kernel, vars, kla, NULL, false);
   3909 }
   3910 
   3911 /* Run an asynchronous OpenMP kernel on DEVICE.  This is similar to
   3912    GOMP_OFFLOAD_run except that the launch is queued and there is a call to
   3913    GOMP_PLUGIN_target_task_completion when it has finished.  */
   3914 
   3915 void
   3916 GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
   3917 			void **args, void *async_data)
   3918 {
   3919   GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
   3920   struct agent_info *agent = get_agent_info (device);
   3921   struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
   3922   struct GOMP_kernel_launch_attributes def;
   3923   struct GOMP_kernel_launch_attributes *kla;
   3924   assert (agent == kernel->agent);
   3925 
   3926   /* If we get here then the kernel must be OpenMP.  */
   3927   kernel->kind = KIND_OPENMP;
   3928 
   3929   if (!parse_target_attributes (args, &def, &kla, agent))
   3930     {
   3931       GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
   3932       return;
   3933     }
   3934 
   3935   maybe_init_omp_async (agent);
   3936   queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
   3937   queue_push_callback (agent->omp_async_queue,
   3938 		       GOMP_PLUGIN_target_task_completion, async_data);
   3939 }
   3940 
   3941 /* }}} */
   3942 /* {{{ OpenACC Plugin API  */
   3943 
   3944 /* Run a synchronous OpenACC kernel.  The device number is inferred from the
   3945    already-loaded KERNEL.  */
   3946 
   3947 void
   3948 GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *), size_t mapnum,
   3949 			   void **hostaddrs, void **devaddrs, unsigned *dims,
   3950 			   void *targ_mem_desc)
   3951 {
   3952   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
   3953 
   3954   gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, false,
   3955 	    NULL);
   3956 }
   3957 
   3958 /* Run an asynchronous OpenACC kernel on the specified queue.  */
   3959 
   3960 void
   3961 GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *), size_t mapnum,
   3962 				 void **hostaddrs, void **devaddrs,
   3963 				 unsigned *dims, void *targ_mem_desc,
   3964 				 struct goacc_asyncqueue *aq)
   3965 {
   3966   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
   3967 
   3968   gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, true,
   3969 	    aq);
   3970 }
   3971 
   3972 /* Create a new asynchronous thread and queue for running future kernels.  */
   3973 
   3974 struct goacc_asyncqueue *
   3975 GOMP_OFFLOAD_openacc_async_construct (int device)
   3976 {
   3977   struct agent_info *agent = get_agent_info (device);
   3978 
   3979   pthread_mutex_lock (&agent->async_queues_mutex);
   3980 
   3981   struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
   3982   aq->agent = get_agent_info (device);
   3983   aq->prev = NULL;
   3984   aq->next = agent->async_queues;
   3985   if (aq->next)
   3986     {
   3987       aq->next->prev = aq;
   3988       aq->id = aq->next->id + 1;
   3989     }
   3990   else
   3991     aq->id = 1;
   3992   agent->async_queues = aq;
   3993 
   3994   aq->queue_first = 0;
   3995   aq->queue_n = 0;
   3996   aq->drain_queue_stop = 0;
   3997 
   3998   if (pthread_mutex_init (&aq->mutex, NULL))
   3999     {
   4000       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
   4001       return false;
   4002     }
   4003   if (pthread_cond_init (&aq->queue_cond_in, NULL))
   4004     {
   4005       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
   4006       return false;
   4007     }
   4008   if (pthread_cond_init (&aq->queue_cond_out, NULL))
   4009     {
   4010       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
   4011       return false;
   4012     }
   4013 
   4014   hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
   4015 						     ASYNC_QUEUE_SIZE,
   4016 						     HSA_QUEUE_TYPE_MULTI,
   4017 						     hsa_queue_callback, NULL,
   4018 						     UINT32_MAX, UINT32_MAX,
   4019 						     &aq->hsa_queue);
   4020   if (status != HSA_STATUS_SUCCESS)
   4021     hsa_fatal ("Error creating command queue", status);
   4022 
   4023   int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
   4024   if (err != 0)
   4025     GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
   4026 		       strerror (err));
   4027   GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
   4028 	     aq->id);
   4029 
   4030   pthread_mutex_unlock (&agent->async_queues_mutex);
   4031 
   4032   return aq;
   4033 }
   4034 
/* Destroy an existing asynchronous thread and queue.  Waits for any
   currently-running task to complete, but cancels any queued tasks.  */

bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = aq->agent;

  /* Stop the drain thread before tearing down its synchronization
     objects.  */
  finalize_async_thread (aq);

  pthread_mutex_lock (&agent->async_queues_mutex);

  int err;
  if ((err = pthread_mutex_destroy (&aq->mutex)))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_in))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_out))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Error destroying command queue", status);
      goto fail;
    }

  /* Unlink AQ from the agent's doubly-linked list of queues.  */
  if (aq->prev)
    aq->prev->next = aq->next;
  if (aq->next)
    aq->next->prev = aq->prev;
  if (agent->async_queues == aq)
    agent->async_queues = aq->next;

  GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);

  free (aq);
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return true;

fail:
  /* NOTE(review): on failure AQ stays on the agent's list in a partially
     destroyed state; confirm callers treat FALSE as fatal.  */
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return false;
}
   4087 
/* Return true if the specified async queue is currently empty.  */

int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  /* Delegates entirely to the queue helper; presumably queue_empty does
     its own locking — verify against its definition.  */
  return queue_empty (aq);
}
   4095 
/* Block until the specified queue has executed all its tasks and the
   queue is empty.  */

bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  /* wait_queue has no failure indication, so this always reports
     success.  */
  wait_queue (aq);
  return true;
}
   4105 
/* Add a serialization point across two async queues. Any new tasks added to
   AQ2, after this call, will not run until all tasks on AQ1, at the time
   of this call, have completed.  */

bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
				      struct goacc_asyncqueue *aq2)
{
  /* A queue is trivially ordered with respect to itself.  */
  if (aq1 == aq2)
    return true;

  /* Mark the current tail of AQ1 with a placeholder, and make AQ2 wait
     for that placeholder before running anything enqueued later.  */
  struct placeholder *marker = queue_push_placeholder (aq1);
  queue_push_asyncwait (aq2, marker);
  return true;
}
   4123 
/* Add an opaque callback to the given async queue.  FN will be invoked
   with DATA once everything already on AQ has completed.  */

void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
					   void (*fn) (void *), void *data)
{
  queue_push_callback (aq, fn, data);
}
   4132 
   4133 /* Queue up an asynchronous data copy from host to DEVICE.  */
   4134 
   4135 bool
   4136 GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
   4137 				     size_t n, struct goacc_asyncqueue *aq)
   4138 {
   4139   struct agent_info *agent = get_agent_info (device);
   4140   assert (agent == aq->agent);
   4141   /* The source data does not necessarily remain live until the deferred
   4142      copy happens.  Taking a snapshot of the data here avoids reading
   4143      uninitialised data later, but means that (a) data is copied twice and
   4144      (b) modifications to the copied data between the "spawning" point of
   4145      the asynchronous kernel and when it is executed will not be seen.
   4146      But, that is probably correct.  */
   4147   void *src_copy = GOMP_PLUGIN_malloc (n);
   4148   memcpy (src_copy, src, n);
   4149   queue_push_copy (aq, dst, src_copy, n, true);
   4150   return true;
   4151 }
   4152 
   4153 /* Queue up an asynchronous data copy from DEVICE to host.  */
   4154 
   4155 bool
   4156 GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
   4157 				     size_t n, struct goacc_asyncqueue *aq)
   4158 {
   4159   struct agent_info *agent = get_agent_info (device);
   4160   assert (agent == aq->agent);
   4161   queue_push_copy (aq, dst, src, n, false);
   4162   return true;
   4163 }
   4164 
   4165 union goacc_property_value
   4166 GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
   4167 {
   4168   struct agent_info *agent = get_agent_info (device);
   4169 
   4170   union goacc_property_value propval = { .val = 0 };
   4171 
   4172   switch (prop)
   4173     {
   4174     case GOACC_PROPERTY_FREE_MEMORY:
   4175       /* Not supported. */
   4176       break;
   4177     case GOACC_PROPERTY_MEMORY:
   4178       {
   4179 	size_t size;
   4180 	hsa_region_t region = agent->data_region;
   4181 	hsa_status_t status =
   4182 	  hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
   4183 	if (status == HSA_STATUS_SUCCESS)
   4184 	  propval.val = size;
   4185 	break;
   4186       }
   4187     case GOACC_PROPERTY_NAME:
   4188       propval.ptr = agent->name;
   4189       break;
   4190     case GOACC_PROPERTY_VENDOR:
   4191       propval.ptr = agent->vendor_name;
   4192       break;
   4193     case GOACC_PROPERTY_DRIVER:
   4194       propval.ptr = hsa_context.driver_version_s;
   4195       break;
   4196     }
   4197 
   4198   return propval;
   4199 }
   4200 
   4201 /* Set up plugin-specific thread-local-data (host-side).  */
   4202 
   4203 void *
   4204 GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
   4205 {
   4206   struct gcn_thread *thread_data
   4207     = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));
   4208 
   4209   thread_data->async = GOMP_ASYNC_SYNC;
   4210 
   4211   return (void *) thread_data;
   4212 }
   4213 
/* Clean up plugin-specific thread-local-data.  DATA is the gcn_thread
   allocated by GOMP_OFFLOAD_openacc_create_thread_data; free (NULL) is a
   harmless no-op.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}
   4221 
   4222 /* }}} */
   4223