Home | History | Annotate | Line # | Download | only in plugin
      1      1.1  mrg /* Plugin for AMD GCN execution.
      2      1.1  mrg 
      3  1.1.1.3  mrg    Copyright (C) 2013-2024 Free Software Foundation, Inc.
      4      1.1  mrg 
      5      1.1  mrg    Contributed by Mentor Embedded
      6      1.1  mrg 
      7      1.1  mrg    This file is part of the GNU Offloading and Multi Processing Library
      8      1.1  mrg    (libgomp).
      9      1.1  mrg 
     10      1.1  mrg    Libgomp is free software; you can redistribute it and/or modify it
     11      1.1  mrg    under the terms of the GNU General Public License as published by
     12      1.1  mrg    the Free Software Foundation; either version 3, or (at your option)
     13      1.1  mrg    any later version.
     14      1.1  mrg 
     15      1.1  mrg    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
     16      1.1  mrg    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
     17      1.1  mrg    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
     18      1.1  mrg    more details.
     19      1.1  mrg 
     20      1.1  mrg    Under Section 7 of GPL version 3, you are granted additional
     21      1.1  mrg    permissions described in the GCC Runtime Library Exception, version
     22      1.1  mrg    3.1, as published by the Free Software Foundation.
     23      1.1  mrg 
     24      1.1  mrg    You should have received a copy of the GNU General Public License and
     25      1.1  mrg    a copy of the GCC Runtime Library Exception along with this program;
     26      1.1  mrg    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     27      1.1  mrg    <http://www.gnu.org/licenses/>.  */
     28      1.1  mrg 
     29      1.1  mrg /* {{{ Includes and defines  */
     30      1.1  mrg 
     31      1.1  mrg #include "config.h"
     32  1.1.1.2  mrg #include "symcat.h"
     33      1.1  mrg #include <stdio.h>
     34      1.1  mrg #include <stdlib.h>
     35      1.1  mrg #include <string.h>
     36      1.1  mrg #include <pthread.h>
     37      1.1  mrg #include <inttypes.h>
     38      1.1  mrg #include <stdbool.h>
     39      1.1  mrg #include <limits.h>
     40      1.1  mrg #include <hsa.h>
     41  1.1.1.2  mrg #include <hsa_ext_amd.h>
     42      1.1  mrg #include <dlfcn.h>
     43      1.1  mrg #include <signal.h>
     44      1.1  mrg #include "libgomp-plugin.h"
     45  1.1.1.3  mrg #include "config/gcn/libgomp-gcn.h"  /* For struct output.  */
     46      1.1  mrg #include "gomp-constants.h"
     47      1.1  mrg #include <elf.h>
     48      1.1  mrg #include "oacc-plugin.h"
     49      1.1  mrg #include "oacc-int.h"
     50      1.1  mrg #include <assert.h>
     51      1.1  mrg 
/* These probably won't be in elf.h for a while.  The values and addend
   formulas appear to mirror LLVM's AMDGPU relocation types (note that
   value 12 is intentionally not defined here).  */
#ifndef R_AMDGPU_NONE
#define R_AMDGPU_NONE		0
#define R_AMDGPU_ABS32_LO	1	/* (S + A) & 0xFFFFFFFF  */
#define R_AMDGPU_ABS32_HI	2	/* (S + A) >> 32  */
#define R_AMDGPU_ABS64		3	/* S + A  */
#define R_AMDGPU_REL32		4	/* S + A - P  */
#define R_AMDGPU_REL64		5	/* S + A - P  */
#define R_AMDGPU_ABS32		6	/* S + A  */
#define R_AMDGPU_GOTPCREL	7	/* G + GOT + A - P  */
#define R_AMDGPU_GOTPCREL32_LO	8	/* (G + GOT + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_GOTPCREL32_HI	9	/* (G + GOT + A - P) >> 32  */
#define R_AMDGPU_REL32_LO	10	/* (S + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_REL32_HI	11	/* (S + A - P) >> 32  */
#define R_AMDGPU_RELATIVE64	13	/* B + A  */
#endif
     68      1.1  mrg 
/* GCN specific definitions for asynchronous queues.  */

#define ASYNC_QUEUE_SIZE 64	/* Capacity of each queue's ring buffer.  */
#define DRAIN_QUEUE_SYNCHRONOUS_P false	/* Compile-time queue-draining mode switch.  */
#define DEBUG_QUEUES 0		/* Compile-time debug toggle for queue operations.  */
#define DEBUG_THREAD_SLEEP 0	/* Compile-time debug toggle for thread sleep/wake.  */
#define DEBUG_THREAD_SIGNAL 0	/* Compile-time debug toggle for thread signalling.  */

/* Defaults.  */
#define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100MB.  */
     79      1.1  mrg 
     80      1.1  mrg /* Secure getenv() which returns NULL if running as SUID/SGID.  */
     81      1.1  mrg #ifndef HAVE_SECURE_GETENV
     82      1.1  mrg #ifdef HAVE___SECURE_GETENV
     83      1.1  mrg #define secure_getenv __secure_getenv
     84      1.1  mrg #elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
     85      1.1  mrg   && defined(HAVE_GETGID) && defined(HAVE_GETEGID)
     86      1.1  mrg 
     87      1.1  mrg #include <unistd.h>
     88      1.1  mrg 
     89      1.1  mrg /* Implementation of secure_getenv() for targets where it is not provided but
     90      1.1  mrg    we have at least means to test real and effective IDs. */
     91      1.1  mrg 
static char *
secure_getenv (const char *name)
{
  /* Refuse to consult the environment when the process may be running
     with elevated (SUID/SGID) privileges, i.e. when the real and
     effective user or group IDs differ.  */
  bool elevated = (getuid () != geteuid ()) || (getgid () != getegid ());

  return elevated ? NULL : getenv (name);
}
    100      1.1  mrg 
    101      1.1  mrg #else
    102      1.1  mrg #define secure_getenv getenv
    103      1.1  mrg #endif
    104      1.1  mrg #endif
    105      1.1  mrg 
    106      1.1  mrg /* }}}  */
    107      1.1  mrg /* {{{ Types  */
    108      1.1  mrg 
/* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.
   One of these exists per host thread that uses OpenACC async queues.  */

struct gcn_thread
{
  /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
  int async;
};
    116      1.1  mrg 
/* As an HSA runtime is dlopened, following structure defines function
   pointers utilized by the HSA plug-in.  Each member corresponds to the
   HSA/HSA-AMD API entry point of the same name, minus the "_fn" suffix;
   presumably they are resolved with dlsym at start-up -- TODO confirm.  */

struct hsa_runtime_fn_info
{
  /* HSA runtime.  */
  hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
					const char **status_string);
  hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
					 hsa_agent_info_t attribute,
					 void *value);
  hsa_status_t (*hsa_isa_get_info_fn)(hsa_isa_t isa,
				      hsa_isa_info_t attribute,
				      uint32_t index,
				      void *value);
  hsa_status_t (*hsa_init_fn) (void);
  hsa_status_t (*hsa_iterate_agents_fn)
    (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
  hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
					  hsa_region_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_queue_create_fn)
    (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
     void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
     void *data, uint32_t private_segment_size,
     uint32_t group_segment_size, hsa_queue_t **queue);
  hsa_status_t (*hsa_agent_iterate_regions_fn)
    (hsa_agent_t agent,
     hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
  hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
  hsa_status_t (*hsa_executable_create_fn)
    (hsa_profile_t profile, hsa_executable_state_t executable_state,
     const char *options, hsa_executable_t *executable);
  hsa_status_t (*hsa_executable_global_variable_define_fn)
    (hsa_executable_t executable, const char *variable_name, void *address);
  hsa_status_t (*hsa_executable_load_code_object_fn)
    (hsa_executable_t executable, hsa_agent_t agent,
     hsa_code_object_t code_object, const char *options);
  hsa_status_t (*hsa_executable_freeze_fn)(hsa_executable_t executable,
					   const char *options);
  hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
					uint32_t num_consumers,
					const hsa_agent_t *consumers,
					hsa_signal_t *signal);
  hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
					  void **ptr);
  hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
					      hsa_access_permission_t access);
  hsa_status_t (*hsa_memory_copy_fn)(void *dst, const void *src, size_t size);
  hsa_status_t (*hsa_memory_free_fn) (void *ptr);
  hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_executable_get_symbol_fn)
    (hsa_executable_t executable, const char *module_name,
     const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
     hsa_executable_symbol_t *symbol);
  hsa_status_t (*hsa_executable_symbol_get_info_fn)
    (hsa_executable_symbol_t executable_symbol,
     hsa_executable_symbol_info_t attribute, void *value);
  hsa_status_t (*hsa_executable_iterate_symbols_fn)
    (hsa_executable_t executable,
     hsa_status_t (*callback)(hsa_executable_t executable,
			      hsa_executable_symbol_t symbol, void *data),
     void *data);
  uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
						    uint64_t value);
  uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
  void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
    (hsa_signal_t signal, hsa_signal_condition_t condition,
     hsa_signal_value_t compare_value, uint64_t timeout_hint,
     hsa_wait_state_t wait_state_hint);
  hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);

  hsa_status_t (*hsa_code_object_deserialize_fn)
    (void *serialized_code_object, size_t serialized_code_object_size,
     const char *options, hsa_code_object_t *code_object);
  /* AMD vendor extensions (hsa_ext_amd.h).  */
  hsa_status_t (*hsa_amd_memory_lock_fn)
    (void *host_ptr, size_t size, hsa_agent_t *agents, int num_agent,
     void **agent_ptr);
  hsa_status_t (*hsa_amd_memory_unlock_fn) (void *host_ptr);
  hsa_status_t (*hsa_amd_memory_async_copy_rect_fn)
    (const hsa_pitched_ptr_t *dst, const hsa_dim3_t *dst_offset,
     const hsa_pitched_ptr_t *src, const hsa_dim3_t *src_offset,
     const hsa_dim3_t *range, hsa_agent_t copy_agent,
     hsa_amd_copy_direction_t dir, uint32_t num_dep_signals,
     const hsa_signal_t *dep_signals, hsa_signal_t completion_signal);
};
    210      1.1  mrg 
/* Structure describing the run-time and grid properties of an HSA kernel
   launch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */
    213      1.1  mrg 
/* The layout here is part of the interface with the compiler-generated
   launch code (see the format passed to GOMP_OFFLOAD_run); do not
   reorder or resize the fields.  */
struct GOMP_kernel_launch_attributes
{
  /* Number of dimensions the workload has.  Maximum number is 3.  */
  uint32_t ndim;
  /* Size of the grid in the three respective dimensions.  */
  uint32_t gdims[3];
  /* Size of work-groups in the respective dimensions.  */
  uint32_t wdims[3];
};
    223      1.1  mrg 
/* Collection of information needed for a dispatch of a kernel from a
   kernel.  */

struct kernel_dispatch
{
  /* The agent on which this dispatch will run.  */
  struct agent_info *agent;
  /* Pointer to a command queue associated with a kernel dispatch agent.  */
  void *queue;
  /* Pointer to a memory space used for kernel arguments passing.  */
  void *kernarg_address;
  /* Kernel object.  */
  uint64_t object;
  /* Synchronization signal used for dispatch synchronization.  */
  uint64_t signal;
  /* Private segment size.  */
  uint32_t private_segment_size;
  /* Group segment size.  */
  uint32_t group_segment_size;
};
    243      1.1  mrg 
/* Structure of the kernargs segment, supporting console output.

   This needs to match the definitions in Newlib, and the expectations
   in libgomp target code, so the field order and layout must not be
   changed here in isolation.  */

struct kernargs {
  /* ABI fields shared with the device side (see libgomp-gcn.h).  */
  struct kernargs_abi abi;

  /* Output data.  */
  struct output output_data;
};
    255      1.1  mrg 
/* A queue entry for a future asynchronous launch.  */

struct kernel_launch
{
  /* The kernel to launch.  */
  struct kernel_info *kernel;
  /* Pointer to the argument variables passed to the kernel.  */
  void *vars;
  /* Grid/work-group geometry for the launch.  */
  struct GOMP_kernel_launch_attributes kla;
};
    264      1.1  mrg 
/* A queue entry for a future callback.  */

struct callback
{
  /* The host function to invoke.  */
  void (*fn)(void *);
  /* Opaque argument passed to FN.  */
  void *data;
};
    272      1.1  mrg 
/* A data struct for the copy_data callback.  */

struct copy_data
{
  /* Destination of the copy.  */
  void *dst;
  /* Source of the copy.  */
  const void *src;
  /* Number of bytes to copy.  */
  size_t len;
  /* The async queue the copy was requested on.  */
  struct goacc_asyncqueue *aq;
};
    282      1.1  mrg 
/* A queue entry for a placeholder.  These correspond to a wait event.  */

struct placeholder
{
  /* Non-zero once the placeholder has executed; guarded by MUTEX.  */
  int executed;
  /* Condition variable used to wait for EXECUTED to change.  */
  pthread_cond_t cond;
  /* Mutex protecting EXECUTED and COND.  */
  pthread_mutex_t mutex;
};
    291      1.1  mrg 
/* A queue entry for a wait directive.  */

struct asyncwait_info
{
  /* The placeholder (wait event) this wait blocks on.  */
  struct placeholder *placeholderp;
};
    298      1.1  mrg 
/* Encode the type of an entry in an async queue.  Used as the
   discriminator for the union in struct queue_entry.  */

enum entry_type
{
  KERNEL_LAUNCH,	/* Run a kernel (struct kernel_launch).  */
  CALLBACK,		/* Invoke a host callback (struct callback).  */
  ASYNC_WAIT,		/* Wait on a placeholder (struct asyncwait_info).  */
  ASYNC_PLACEHOLDER	/* A wait event target (struct placeholder).  */
};
    308      1.1  mrg 
/* An entry in an async queue.  */

struct queue_entry
{
  /* Discriminator selecting which member of U is active.  */
  enum entry_type type;
  union {
    struct kernel_launch launch;
    struct callback callback;
    struct asyncwait_info asyncwait;
    struct placeholder placeholder;
  } u;
};
    321      1.1  mrg 
/* An async queue header.

   OpenMP may create one of these.
   OpenACC may create many.  */

struct goacc_asyncqueue
{
  /* The agent this queue belongs to.  */
  struct agent_info *agent;
  /* The underlying HSA queue used for this async queue's work.  */
  hsa_queue_t *hsa_queue;

  /* Thread that drains the entries in QUEUE below.  */
  pthread_t thread_drain_queue;
  /* Mutex protecting the ring buffer and condition variables below.  */
  pthread_mutex_t mutex;
  /* Condition variables coordinating producers and the drain thread.  */
  pthread_cond_t queue_cond_in;
  pthread_cond_t queue_cond_out;
  /* Ring buffer of pending entries.  */
  struct queue_entry queue[ASYNC_QUEUE_SIZE];
  /* Index of the oldest entry in QUEUE.  */
  int queue_first;
  /* Number of entries currently in QUEUE.  */
  int queue_n;
  /* Set non-zero to request that the drain thread stop.  */
  int drain_queue_stop;

  /* Queue identifier (presumably for debug output -- TODO confirm).  */
  int id;
  /* Links in the agent's doubly-linked list of queues.  */
  struct goacc_asyncqueue *prev;
  struct goacc_asyncqueue *next;
};
    345      1.1  mrg 
/* Mkoffload uses this structure to describe a kernel.

   OpenMP kernel dimensions are passed at runtime.
   OpenACC kernel dimensions are passed at compile time, here.  */

struct hsa_kernel_description
{
  /* The kernel's symbol name.  */
  const char *name;
  int oacc_dims[3];  /* Only present for GCN kernels.  */
  /* Number of scalar registers used.  */
  int sgpr_count;
  /* Number of vector registers used.  ("vpgr" is a historical misspelling
     of VGPR; the field name is kept because mkoffload emits this
     structure.)  */
  int vpgr_count;
};
    358      1.1  mrg 
/* Mkoffload uses this structure to describe an offload variable.  */

struct global_var_info
{
  /* The variable's symbol name.  */
  const char *name;
  /* The variable's address.  */
  void *address;
};
    366      1.1  mrg 
/* Mkoffload uses this structure to describe all the kernels in a
   loadable module.  These are passed to libgomp via static constructors.  */

struct gcn_image_desc
{
  struct gcn_image {
    /* Size in bytes of the code object IMAGE below.  */
    size_t size;
    /* The device code object for this module.  */
    void *image;
  } *gcn_image;
  /* Number of entries in KERNEL_INFOS.  */
  const unsigned kernel_count;
  struct hsa_kernel_description *kernel_infos;
  /* Number of indirect functions in the image.  */
  const unsigned ind_func_count;
  /* Number of offload variables described for this module.  */
  const unsigned global_variable_count;
};
    381      1.1  mrg 
/* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
   support.
   See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table */

typedef enum {
  EF_AMDGPU_MACH_UNSUPPORTED = -1,
  EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
  EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
  EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
  EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030,
  EF_AMDGPU_MACH_AMDGCN_GFX90a = 0x03f,
  EF_AMDGPU_MACH_AMDGCN_GFX90c = 0x032,
  EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036,
  EF_AMDGPU_MACH_AMDGCN_GFX1036 = 0x045,
  EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041,
  EF_AMDGPU_MACH_AMDGCN_GFX1103 = 0x044
} EF_AMDGPU_MACH;

/* Mask to extract the EF_AMDGPU_MACH field from the ELF header's e_flags.
   Note: use "static const" rather than "const static"; C11 6.11.5 marks
   storage-class specifiers in other than leading position obsolescent.  */
static const int EF_AMDGPU_MACH_MASK = 0x000000ff;
typedef EF_AMDGPU_MACH gcn_isa;
    402      1.1  mrg 
/* Description of an HSA GPU agent (device) and the program associated with
   it.  */

struct agent_info
{
  /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
  hsa_agent_t id;
  /* The user-visible device number.  */
  int device_id;
  /* Whether the agent has been initialized.  The fields below are usable only
     if it has been.  */
  bool initialized;

  /* The instruction set architecture of the device.  */
  gcn_isa device_isa;
  /* Name of the agent.  */
  char name[64];
  /* Name of the vendor of the agent.  */
  char vendor_name[64];
  /* Command queues of the agent: SYNC_QUEUE for synchronous launches, plus
     a list of async queues and the (single) OpenMP async queue.  */
  hsa_queue_t *sync_queue;
  struct goacc_asyncqueue *async_queues, *omp_async_queue;
  /* Mutex protecting the async queue list above.  */
  pthread_mutex_t async_queues_mutex;

  /* The HSA memory region from which to allocate kernel arguments.  */
  hsa_region_t kernarg_region;

  /* The HSA memory region from which to allocate device data.  */
  hsa_region_t data_region;

  /* Allocated ephemeral memories (team arena and stack space).  */
  struct ephemeral_memories_list *ephemeral_memories_list;
  /* Lock held while modifying EPHEMERAL_MEMORIES_LIST.  */
  pthread_mutex_t ephemeral_memories_write_lock;

  /* Read-write lock that protects kernels which are running or about to be run
     from interference with loading and unloading of images.  Needs to be
     locked for reading while a kernel is being run, and for writing if the
     list of modules is manipulated (and thus the HSA program invalidated).  */
  pthread_rwlock_t module_rwlock;

  /* The module associated with this kernel.  */
  struct module_info *module;

  /* Mutex enforcing that only one thread will finalize the HSA program.  A
     thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t prog_mutex;
  /* Flag whether the HSA program that consists of all the modules has been
     finalized.  */
  bool prog_finalized;
  /* HSA executable - the finalized program that is used to locate kernels.  */
  hsa_executable_t executable;
};
    456      1.1  mrg 
/* Information required to identify, finalize and run any given kernel.  */

/* The programming model a kernel was compiled for, if known.  */
enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};
    460      1.1  mrg 
struct kernel_info
{
  /* Name of the kernel, required to locate it within the GCN object-code
     module.  */
  const char *name;
  /* The specific agent the kernel has been or will be finalized for and run
     on.  */
  struct agent_info *agent;
  /* The specific module where the kernel takes place.  */
  struct module_info *module;
  /* Information provided by mkoffload associated with the kernel.  */
  struct hsa_kernel_description *description;
  /* Mutex enforcing that at most one thread ever initializes a kernel for
     use.  A thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t init_mutex;
  /* Flag indicating whether the kernel has been initialized and all fields
     below it contain valid data.  */
  bool initialized;
  /* Flag indicating that the kernel has a problem that blocks an execution.  */
  bool initialization_failed;
  /* The object to be put into the dispatch queue.  */
  uint64_t object;
  /* Required size of kernel arguments.  */
  uint32_t kernarg_segment_size;
  /* Required size of group segment.  */
  uint32_t group_segment_size;
  /* Required size of private segment.  */
  uint32_t private_segment_size;
  /* Set up for OpenMP or OpenACC?  */
  enum offload_kind kind;
};
    493      1.1  mrg 
/* Information about a particular GCN module, its image and kernels.  */

struct module_info
{
  /* The description with which the program has registered the image.  */
  struct gcn_image_desc *image_desc;
  /* GCN heap allocation.  */
  struct heap *heap;
  /* Physical boundaries of the loaded module.  */
  Elf64_Addr phys_address_start;
  Elf64_Addr phys_address_end;

  /* True once the module's constructors have been run.  */
  bool constructors_run_p;
  /* Kernels implementing the module's init_array/fini_array support, if
     present.  */
  struct kernel_info *init_array_func, *fini_array_func;

  /* Number of kernels in this module.  */
  int kernel_count;
  /* An array of kernel_info structures describing each kernel in this
     module.  (Flexible array member: allocated together with the enclosing
     structure.)  */
  struct kernel_info kernels[];
};
    515      1.1  mrg 
/* A linked list of memory arenas allocated on the device.
   These are used by OpenMP, as a means to optimize per-team malloc,
   and for host-accessible stack space.  */

struct ephemeral_memories_list
{
  /* The next list entry, or NULL at the tail.  */
  struct ephemeral_memories_list *next;

  /* The size is determined by the number of teams and threads.  */
  size_t size;
  /* The device address of the allocated memory.  */
  void *address;
  /* A flag to prevent two asynchronous kernels trying to use the same memory.
     The mutex is locked until the kernel exits.  */
  pthread_mutex_t in_use;
};
    532      1.1  mrg 
/* Information about the whole HSA environment and all of its agents.  */

struct hsa_context_info
{
  /* Whether the structure has been initialized.  */
  bool initialized;
  /* Number of usable GPU HSA agents in the system.  */
  int agent_count;
  /* Array of agent_info structures describing the individual HSA agents.  */
  struct agent_info *agents;
  /* Driver version string.  */
  char driver_version_s[30];
};
    546      1.1  mrg 
    547      1.1  mrg /* }}}  */
    548      1.1  mrg /* {{{ Global variables  */
/* Information about the whole HSA environment and all of its agents.  */

static struct hsa_context_info hsa_context;

/* HSA runtime functions that are initialized in init_hsa_context.  */

static struct hsa_runtime_fn_info hsa_fns;

/* Heap space, allocated target-side, provided for use of newlib malloc.
   Each module should have its own heap allocated.
   Beware that heap usage increases with OpenMP teams.  See also arenas.  */

static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;

/* Ephemeral memory sizes for each kernel launch.  */

static int team_arena_size = DEFAULT_TEAM_ARENA_SIZE;
static int stack_size = DEFAULT_GCN_STACK_SIZE;
/* -1 selects a default low-latency memory size (presumably resolved at
   device initialization -- TODO confirm).  */
static int lowlat_size = -1;

/* Flag to decide whether print to stderr information about what is going on.
   Set in init_debug depending on environment variables.  */

static bool debug;

/* Flag to decide if the runtime should suppress a possible fallback to host
   execution.  */

static bool suppress_host_fallback;

/* Flag to locate HSA runtime shared library that is dlopened
   by this plug-in.  */

static const char *hsa_runtime_lib;

/* Flag to decide if the runtime should support also CPU devices (can be
   a simulator).  */

static bool support_cpu_devices;

/* Runtime dimension overrides.  Zero indicates default.  */

static int override_x_dim = 0;
static int override_z_dim = 0;
    594      1.1  mrg 
    595      1.1  mrg /* }}}  */
    596      1.1  mrg /* {{{ Debug & Diagnostic  */
    597      1.1  mrg 
/* Print a message to stderr if GCN_DEBUG value is set to true.

   Expands to a single statement via the do/while (0) idiom; the trailing
   semicolon that used to follow "while (false)" has been removed so the
   macro behaves like an ordinary statement inside unbraced if/else
   bodies (callers terminate it with their own ';').  */

#define DEBUG_PRINT(...) \
  do \
  { \
    if (debug) \
      { \
	fprintf (stderr, __VA_ARGS__); \
      } \
  } \
  while (false)
    609      1.1  mrg 
/* Flush stderr, but only when GCN_DEBUG debugging output is enabled;
   otherwise a no-op.  */

#define DEBUG_FLUSH()				\
  do {						\
    if (debug)					\
      fflush (stderr);				\
  } while (false)
    617      1.1  mrg 
/* Print a logging message: PREFIX followed by the printf-style formatted
   message, then flush stderr.  All output is suppressed unless GCN_DEBUG
   debugging is enabled (see DEBUG_PRINT/DEBUG_FLUSH).  */

#define DEBUG_LOG(prefix, ...)			\
  do						\
    {						\
      DEBUG_PRINT (prefix);			\
      DEBUG_PRINT (__VA_ARGS__);			\
      DEBUG_FLUSH ();				\
    } while (false)
    628      1.1  mrg 
/* Print a debugging message, with a "GCN debug: " prefix, to stderr
   (only when GCN_DEBUG is enabled).  */

#define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)
    632      1.1  mrg 
/* Print a warning message, with a "GCN warning: " prefix, to stderr
   (only when GCN_DEBUG is enabled).  */

#define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
    636      1.1  mrg 
    637      1.1  mrg /* Print HSA warning STR with an HSA STATUS code.  */
    638      1.1  mrg 
    639      1.1  mrg static void
    640      1.1  mrg hsa_warn (const char *str, hsa_status_t status)
    641      1.1  mrg {
    642      1.1  mrg   if (!debug)
    643      1.1  mrg     return;
    644      1.1  mrg 
    645      1.1  mrg   const char *hsa_error_msg = "[unknown]";
    646      1.1  mrg   hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
    647      1.1  mrg 
    648      1.1  mrg   fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
    649      1.1  mrg 	   hsa_error_msg);
    650      1.1  mrg }
    651      1.1  mrg 
    652      1.1  mrg /* Report a fatal error STR together with the HSA error corresponding to STATUS
    653      1.1  mrg    and terminate execution of the current process.  */
    654      1.1  mrg 
    655      1.1  mrg static void
    656      1.1  mrg hsa_fatal (const char *str, hsa_status_t status)
    657      1.1  mrg {
    658      1.1  mrg   const char *hsa_error_msg = "[unknown]";
    659      1.1  mrg   hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
    660      1.1  mrg   GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
    661      1.1  mrg 		     hsa_error_msg);
    662      1.1  mrg }
    663      1.1  mrg 
    664      1.1  mrg /* Like hsa_fatal, except only report error message, and return FALSE
    665      1.1  mrg    for propagating error processing to outside of plugin.  */
    666      1.1  mrg 
    667      1.1  mrg static bool
    668      1.1  mrg hsa_error (const char *str, hsa_status_t status)
    669      1.1  mrg {
    670      1.1  mrg   const char *hsa_error_msg = "[unknown]";
    671      1.1  mrg   hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
    672      1.1  mrg   GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
    673      1.1  mrg 		     hsa_error_msg);
    674      1.1  mrg   return false;
    675      1.1  mrg }
    676      1.1  mrg 
    677      1.1  mrg /* Dump information about the available hardware.  */
    678      1.1  mrg 
    679      1.1  mrg static void
    680      1.1  mrg dump_hsa_system_info (void)
    681      1.1  mrg {
    682      1.1  mrg   hsa_status_t status;
    683      1.1  mrg 
    684      1.1  mrg   hsa_endianness_t endianness;
    685      1.1  mrg   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
    686      1.1  mrg 					   &endianness);
    687      1.1  mrg   if (status == HSA_STATUS_SUCCESS)
    688      1.1  mrg     switch (endianness)
    689      1.1  mrg       {
    690      1.1  mrg       case HSA_ENDIANNESS_LITTLE:
    691      1.1  mrg 	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
    692      1.1  mrg 	break;
    693      1.1  mrg       case HSA_ENDIANNESS_BIG:
    694      1.1  mrg 	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
    695      1.1  mrg 	break;
    696      1.1  mrg       default:
    697      1.1  mrg 	GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
    698      1.1  mrg       }
    699      1.1  mrg   else
    700      1.1  mrg     GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");
    701      1.1  mrg 
    702      1.1  mrg   uint8_t extensions[128];
    703      1.1  mrg   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
    704      1.1  mrg 					   &extensions);
    705      1.1  mrg   if (status == HSA_STATUS_SUCCESS)
    706      1.1  mrg     {
    707      1.1  mrg       if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
    708      1.1  mrg 	GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
    709      1.1  mrg     }
    710      1.1  mrg   else
    711      1.1  mrg     GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
    712      1.1  mrg }
    713      1.1  mrg 
    714      1.1  mrg /* Dump information about the available hardware.  */
    715      1.1  mrg 
    716      1.1  mrg static void
    717      1.1  mrg dump_machine_model (hsa_machine_model_t machine_model, const char *s)
    718      1.1  mrg {
    719      1.1  mrg   switch (machine_model)
    720      1.1  mrg     {
    721      1.1  mrg     case HSA_MACHINE_MODEL_SMALL:
    722      1.1  mrg       GCN_DEBUG ("%s: SMALL\n", s);
    723      1.1  mrg       break;
    724      1.1  mrg     case HSA_MACHINE_MODEL_LARGE:
    725      1.1  mrg       GCN_DEBUG ("%s: LARGE\n", s);
    726      1.1  mrg       break;
    727      1.1  mrg     default:
    728      1.1  mrg       GCN_WARNING ("%s: UNKNOWN\n", s);
    729      1.1  mrg       break;
    730      1.1  mrg     }
    731      1.1  mrg }
    732      1.1  mrg 
    733      1.1  mrg /* Dump information about the available hardware.  */
    734      1.1  mrg 
    735      1.1  mrg static void
    736      1.1  mrg dump_profile (hsa_profile_t profile, const char *s)
    737      1.1  mrg {
    738      1.1  mrg   switch (profile)
    739      1.1  mrg     {
    740      1.1  mrg     case HSA_PROFILE_FULL:
    741      1.1  mrg       GCN_DEBUG ("%s: FULL\n", s);
    742      1.1  mrg       break;
    743      1.1  mrg     case HSA_PROFILE_BASE:
    744      1.1  mrg       GCN_DEBUG ("%s: BASE\n", s);
    745      1.1  mrg       break;
    746      1.1  mrg     default:
    747      1.1  mrg       GCN_WARNING ("%s: UNKNOWN\n", s);
    748      1.1  mrg       break;
    749      1.1  mrg     }
    750      1.1  mrg }
    751      1.1  mrg 
    752      1.1  mrg /* Dump information about a device memory region.  */
    753      1.1  mrg 
    754      1.1  mrg static hsa_status_t
    755      1.1  mrg dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
    756      1.1  mrg {
    757      1.1  mrg   hsa_status_t status;
    758      1.1  mrg 
    759      1.1  mrg   hsa_region_segment_t segment;
    760      1.1  mrg   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
    761      1.1  mrg 					   &segment);
    762      1.1  mrg   if (status == HSA_STATUS_SUCCESS)
    763      1.1  mrg     {
    764      1.1  mrg       if (segment == HSA_REGION_SEGMENT_GLOBAL)
    765      1.1  mrg 	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
    766      1.1  mrg       else if (segment == HSA_REGION_SEGMENT_READONLY)
    767      1.1  mrg 	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
    768      1.1  mrg       else if (segment == HSA_REGION_SEGMENT_PRIVATE)
    769      1.1  mrg 	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
    770      1.1  mrg       else if (segment == HSA_REGION_SEGMENT_GROUP)
    771      1.1  mrg 	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
    772      1.1  mrg       else
    773      1.1  mrg 	GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
    774      1.1  mrg     }
    775      1.1  mrg   else
    776      1.1  mrg     GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");
    777      1.1  mrg 
    778      1.1  mrg   if (segment == HSA_REGION_SEGMENT_GLOBAL)
    779      1.1  mrg     {
    780      1.1  mrg       uint32_t flags;
    781      1.1  mrg       status
    782      1.1  mrg 	= hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
    783      1.1  mrg 					  &flags);
    784      1.1  mrg       if (status == HSA_STATUS_SUCCESS)
    785      1.1  mrg 	{
    786      1.1  mrg 	  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
    787      1.1  mrg 	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
    788      1.1  mrg 	  if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
    789      1.1  mrg 	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
    790      1.1  mrg 	  if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
    791      1.1  mrg 	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
    792      1.1  mrg 	}
    793      1.1  mrg       else
    794      1.1  mrg 	GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
    795      1.1  mrg     }
    796      1.1  mrg 
    797      1.1  mrg   size_t size;
    798      1.1  mrg   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
    799      1.1  mrg   if (status == HSA_STATUS_SUCCESS)
    800      1.1  mrg     GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
    801      1.1  mrg   else
    802      1.1  mrg     GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");
    803      1.1  mrg 
    804      1.1  mrg   status
    805      1.1  mrg     = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
    806      1.1  mrg 				      &size);
    807      1.1  mrg   if (status == HSA_STATUS_SUCCESS)
    808      1.1  mrg     GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
    809      1.1  mrg   else
    810      1.1  mrg     GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
    811      1.1  mrg 
    812      1.1  mrg   bool alloc_allowed;
    813      1.1  mrg   status
    814      1.1  mrg     = hsa_fns.hsa_region_get_info_fn (region,
    815      1.1  mrg 				      HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
    816      1.1  mrg 				      &alloc_allowed);
    817      1.1  mrg   if (status == HSA_STATUS_SUCCESS)
    818      1.1  mrg     GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
    819      1.1  mrg   else
    820      1.1  mrg     GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
    821      1.1  mrg 
    822      1.1  mrg   if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
    823      1.1  mrg     return HSA_STATUS_SUCCESS;
    824      1.1  mrg 
    825      1.1  mrg   status
    826      1.1  mrg     = hsa_fns.hsa_region_get_info_fn (region,
    827      1.1  mrg 				      HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
    828      1.1  mrg 				      &size);
    829      1.1  mrg   if (status == HSA_STATUS_SUCCESS)
    830      1.1  mrg     GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
    831      1.1  mrg   else
    832      1.1  mrg     GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
    833      1.1  mrg 
    834      1.1  mrg   size_t align;
    835      1.1  mrg   status
    836      1.1  mrg     = hsa_fns.hsa_region_get_info_fn (region,
    837      1.1  mrg 				      HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
    838      1.1  mrg 				      &align);
    839      1.1  mrg   if (status == HSA_STATUS_SUCCESS)
    840      1.1  mrg     GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
    841      1.1  mrg   else
    842      1.1  mrg     GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
    843      1.1  mrg 
    844      1.1  mrg   return HSA_STATUS_SUCCESS;
    845      1.1  mrg }
    846      1.1  mrg 
    847      1.1  mrg /* Dump information about all the device memory regions.  */
    848      1.1  mrg 
    849      1.1  mrg static void
    850      1.1  mrg dump_hsa_regions (hsa_agent_t agent)
    851      1.1  mrg {
    852      1.1  mrg   hsa_status_t status;
    853      1.1  mrg   status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
    854      1.1  mrg 						 dump_hsa_region,
    855      1.1  mrg 						 NULL);
    856      1.1  mrg   if (status != HSA_STATUS_SUCCESS)
    857      1.1  mrg     hsa_error ("Dumping hsa regions failed", status);
    858      1.1  mrg }
    859      1.1  mrg 
    860      1.1  mrg /* Dump information about the available devices.  */
    861      1.1  mrg 
    862      1.1  mrg static hsa_status_t
    863      1.1  mrg dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
    864      1.1  mrg {
    865      1.1  mrg   hsa_status_t status;
    866      1.1  mrg 
    867      1.1  mrg   char buf[64];
    868      1.1  mrg   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
    869      1.1  mrg 					  &buf);
    870      1.1  mrg   if (status == HSA_STATUS_SUCCESS)
    871      1.1  mrg     GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
    872      1.1  mrg   else
    873      1.1  mrg     GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");
    874      1.1  mrg 
    875      1.1  mrg   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
    876      1.1  mrg 					  &buf);
    877      1.1  mrg   if (status == HSA_STATUS_SUCCESS)
    878      1.1  mrg     GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
    879      1.1  mrg   else
    880      1.1  mrg     GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");
    881      1.1  mrg 
    882      1.1  mrg   hsa_machine_model_t machine_model;
    883      1.1  mrg   status
    884      1.1  mrg     = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
    885      1.1  mrg 				     &machine_model);
    886      1.1  mrg   if (status == HSA_STATUS_SUCCESS)
    887      1.1  mrg     dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
    888      1.1  mrg   else
    889      1.1  mrg     GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");
    890      1.1  mrg 
    891      1.1  mrg   hsa_profile_t profile;
    892      1.1  mrg   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
    893      1.1  mrg 					  &profile);
    894      1.1  mrg   if (status == HSA_STATUS_SUCCESS)
    895      1.1  mrg     dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
    896      1.1  mrg   else
    897      1.1  mrg     GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");
    898      1.1  mrg 
    899      1.1  mrg   hsa_device_type_t device_type;
    900      1.1  mrg   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
    901      1.1  mrg 					  &device_type);
    902      1.1  mrg   if (status == HSA_STATUS_SUCCESS)
    903      1.1  mrg     {
    904      1.1  mrg       switch (device_type)
    905      1.1  mrg 	{
    906      1.1  mrg 	case HSA_DEVICE_TYPE_CPU:
    907      1.1  mrg 	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
    908      1.1  mrg 	  break;
    909      1.1  mrg 	case HSA_DEVICE_TYPE_GPU:
    910      1.1  mrg 	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
    911      1.1  mrg 	  break;
    912      1.1  mrg 	case HSA_DEVICE_TYPE_DSP:
    913      1.1  mrg 	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
    914      1.1  mrg 	  break;
    915      1.1  mrg 	default:
    916      1.1  mrg 	  GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
    917      1.1  mrg 	  break;
    918      1.1  mrg 	}
    919      1.1  mrg     }
    920      1.1  mrg   else
    921      1.1  mrg     GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");
    922      1.1  mrg 
    923      1.1  mrg   uint32_t cu_count;
    924      1.1  mrg   status = hsa_fns.hsa_agent_get_info_fn
    925      1.1  mrg     (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
    926      1.1  mrg   if (status == HSA_STATUS_SUCCESS)
    927      1.1  mrg     GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
    928      1.1  mrg   else
    929      1.1  mrg     GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");
    930      1.1  mrg 
    931      1.1  mrg   uint32_t size;
    932      1.1  mrg   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
    933      1.1  mrg 					  &size);
    934      1.1  mrg   if (status == HSA_STATUS_SUCCESS)
    935      1.1  mrg     GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
    936      1.1  mrg   else
    937      1.1  mrg     GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");
    938      1.1  mrg 
    939      1.1  mrg   uint32_t max_dim;
    940      1.1  mrg   status = hsa_fns.hsa_agent_get_info_fn (agent,
    941      1.1  mrg 					  HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
    942      1.1  mrg 					  &max_dim);
    943      1.1  mrg   if (status == HSA_STATUS_SUCCESS)
    944      1.1  mrg     GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
    945      1.1  mrg   else
    946      1.1  mrg     GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");
    947      1.1  mrg 
    948      1.1  mrg   uint32_t max_size;
    949      1.1  mrg   status = hsa_fns.hsa_agent_get_info_fn (agent,
    950      1.1  mrg 					  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
    951      1.1  mrg 					  &max_size);
    952      1.1  mrg   if (status == HSA_STATUS_SUCCESS)
    953      1.1  mrg     GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
    954      1.1  mrg   else
    955      1.1  mrg     GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");
    956      1.1  mrg 
    957      1.1  mrg   uint32_t grid_max_dim;
    958      1.1  mrg   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
    959      1.1  mrg 					  &grid_max_dim);
    960      1.1  mrg   if (status == HSA_STATUS_SUCCESS)
    961      1.1  mrg     GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
    962      1.1  mrg   else
    963      1.1  mrg     GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");
    964      1.1  mrg 
    965      1.1  mrg   uint32_t grid_max_size;
    966      1.1  mrg   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
    967      1.1  mrg 					  &grid_max_size);
    968      1.1  mrg   if (status == HSA_STATUS_SUCCESS)
    969      1.1  mrg     GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
    970      1.1  mrg   else
    971      1.1  mrg     GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");
    972      1.1  mrg 
    973      1.1  mrg   dump_hsa_regions (agent);
    974      1.1  mrg 
    975      1.1  mrg   return HSA_STATUS_SUCCESS;
    976      1.1  mrg }
    977      1.1  mrg 
    978      1.1  mrg /* Forward reference.  */
    979      1.1  mrg 
    980      1.1  mrg static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);
    981      1.1  mrg 
    982      1.1  mrg /* Helper function for dump_executable_symbols.  */
    983      1.1  mrg 
    984      1.1  mrg static hsa_status_t
    985      1.1  mrg dump_executable_symbol (hsa_executable_t executable,
    986      1.1  mrg 			hsa_executable_symbol_t symbol,
    987      1.1  mrg 			void *data __attribute__((unused)))
    988      1.1  mrg {
    989      1.1  mrg   char *name = get_executable_symbol_name (symbol);
    990      1.1  mrg 
    991      1.1  mrg   if (name)
    992      1.1  mrg     {
    993      1.1  mrg       GCN_DEBUG ("executable symbol: %s\n", name);
    994      1.1  mrg       free (name);
    995      1.1  mrg     }
    996      1.1  mrg 
    997      1.1  mrg   return HSA_STATUS_SUCCESS;
    998      1.1  mrg }
    999      1.1  mrg 
   1000      1.1  mrg /* Dump all global symbol in an executable.  */
   1001      1.1  mrg 
   1002      1.1  mrg static void
   1003      1.1  mrg dump_executable_symbols (hsa_executable_t executable)
   1004      1.1  mrg {
   1005      1.1  mrg   hsa_status_t status;
   1006      1.1  mrg   status
   1007      1.1  mrg     = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
   1008      1.1  mrg 						 dump_executable_symbol,
   1009      1.1  mrg 						 NULL);
   1010      1.1  mrg   if (status != HSA_STATUS_SUCCESS)
   1011      1.1  mrg     hsa_fatal ("Could not dump HSA executable symbols", status);
   1012      1.1  mrg }
   1013      1.1  mrg 
   1014      1.1  mrg /* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */
   1015      1.1  mrg 
   1016      1.1  mrg static void
   1017      1.1  mrg print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
   1018      1.1  mrg {
   1019      1.1  mrg   struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;
   1020      1.1  mrg 
   1021      1.1  mrg   fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
   1022      1.1  mrg   fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
   1023      1.1  mrg   fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
   1024      1.1  mrg   fprintf (stderr, "%*sheap address: %p\n", indent, "",
   1025  1.1.1.3  mrg 	   (void*)kernargs->abi.heap_ptr);
   1026  1.1.1.3  mrg   fprintf (stderr, "%*sarena address: %p (%d bytes per workgroup)\n", indent,
   1027  1.1.1.3  mrg 	   "", (void*)kernargs->abi.arena_ptr,
   1028  1.1.1.3  mrg 	   kernargs->abi.arena_size_per_team);
   1029  1.1.1.3  mrg   fprintf (stderr, "%*sstack address: %p (%d bytes per wavefront)\n", indent,
   1030  1.1.1.3  mrg 	   "", (void*)kernargs->abi.stack_ptr,
   1031  1.1.1.3  mrg 	   kernargs->abi.stack_size_per_thread);
   1032      1.1  mrg   fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
   1033      1.1  mrg   fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
   1034      1.1  mrg 	   dispatch->private_segment_size);
   1035  1.1.1.3  mrg   fprintf (stderr, "%*sgroup_segment_size: %u (low-latency pool)\n", indent,
   1036  1.1.1.3  mrg 	   "", dispatch->group_segment_size);
   1037      1.1  mrg   fprintf (stderr, "\n");
   1038      1.1  mrg }
   1039      1.1  mrg 
   1040      1.1  mrg /* }}}  */
   1041      1.1  mrg /* {{{ Utility functions  */
   1042      1.1  mrg 
/* Return this thread's plugin-private storage (from GOMP_PLUGIN_acc_thread),
   viewed as a gcn_thread.  */

static inline struct gcn_thread *
gcn_thread (void)
{
  void *tls = GOMP_PLUGIN_acc_thread ();
  return (struct gcn_thread *) tls;
}
   1050      1.1  mrg 
   1051      1.1  mrg /* Initialize debug and suppress_host_fallback according to the environment.  */
   1052      1.1  mrg 
   1053      1.1  mrg static void
   1054      1.1  mrg init_environment_variables (void)
   1055      1.1  mrg {
   1056      1.1  mrg   if (secure_getenv ("GCN_DEBUG"))
   1057      1.1  mrg     debug = true;
   1058      1.1  mrg   else
   1059      1.1  mrg     debug = false;
   1060      1.1  mrg 
   1061      1.1  mrg   if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
   1062      1.1  mrg     suppress_host_fallback = true;
   1063      1.1  mrg   else
   1064      1.1  mrg     suppress_host_fallback = false;
   1065      1.1  mrg 
   1066      1.1  mrg   hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
   1067      1.1  mrg   if (hsa_runtime_lib == NULL)
   1068  1.1.1.2  mrg     hsa_runtime_lib = "libhsa-runtime64.so.1";
   1069      1.1  mrg 
   1070      1.1  mrg   support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
   1071      1.1  mrg 
   1072      1.1  mrg   const char *x = secure_getenv ("GCN_NUM_TEAMS");
   1073      1.1  mrg   if (!x)
   1074      1.1  mrg     x = secure_getenv ("GCN_NUM_GANGS");
   1075      1.1  mrg   if (x)
   1076      1.1  mrg     override_x_dim = atoi (x);
   1077      1.1  mrg 
   1078      1.1  mrg   const char *z = secure_getenv ("GCN_NUM_THREADS");
   1079      1.1  mrg   if (!z)
   1080      1.1  mrg     z = secure_getenv ("GCN_NUM_WORKERS");
   1081      1.1  mrg   if (z)
   1082      1.1  mrg     override_z_dim = atoi (z);
   1083      1.1  mrg 
   1084      1.1  mrg   const char *heap = secure_getenv ("GCN_HEAP_SIZE");
   1085      1.1  mrg   if (heap)
   1086      1.1  mrg     {
   1087      1.1  mrg       size_t tmp = atol (heap);
   1088      1.1  mrg       if (tmp)
   1089      1.1  mrg 	gcn_kernel_heap_size = tmp;
   1090      1.1  mrg     }
   1091  1.1.1.3  mrg 
   1092  1.1.1.3  mrg   const char *arena = secure_getenv ("GCN_TEAM_ARENA_SIZE");
   1093  1.1.1.3  mrg   if (arena)
   1094  1.1.1.3  mrg     {
   1095  1.1.1.3  mrg       int tmp = atoi (arena);
   1096  1.1.1.3  mrg       if (tmp)
   1097  1.1.1.3  mrg 	team_arena_size = tmp;;
   1098  1.1.1.3  mrg     }
   1099  1.1.1.3  mrg 
   1100  1.1.1.3  mrg   const char *stack = secure_getenv ("GCN_STACK_SIZE");
   1101  1.1.1.3  mrg   if (stack)
   1102  1.1.1.3  mrg     {
   1103  1.1.1.3  mrg       int tmp = atoi (stack);
   1104  1.1.1.3  mrg       if (tmp)
   1105  1.1.1.3  mrg 	stack_size = tmp;;
   1106  1.1.1.3  mrg     }
   1107  1.1.1.3  mrg 
   1108  1.1.1.3  mrg   const char *lowlat = secure_getenv ("GOMP_GCN_LOWLAT_POOL");
   1109  1.1.1.3  mrg   if (lowlat)
   1110  1.1.1.3  mrg     lowlat_size = atoi (lowlat);
   1111      1.1  mrg }
   1112      1.1  mrg 
   1113      1.1  mrg /* Return malloc'd string with name of SYMBOL.  */
   1114      1.1  mrg 
   1115      1.1  mrg static char *
   1116      1.1  mrg get_executable_symbol_name (hsa_executable_symbol_t symbol)
   1117      1.1  mrg {
   1118      1.1  mrg   hsa_status_t status;
   1119      1.1  mrg   char *res;
   1120      1.1  mrg   uint32_t len;
   1121      1.1  mrg   const hsa_executable_symbol_info_t info_name_length
   1122      1.1  mrg     = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;
   1123      1.1  mrg 
   1124      1.1  mrg   status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
   1125      1.1  mrg 						      &len);
   1126      1.1  mrg   if (status != HSA_STATUS_SUCCESS)
   1127      1.1  mrg     {
   1128      1.1  mrg       hsa_error ("Could not get length of symbol name", status);
   1129      1.1  mrg       return NULL;
   1130      1.1  mrg     }
   1131      1.1  mrg 
   1132      1.1  mrg   res = GOMP_PLUGIN_malloc (len + 1);
   1133      1.1  mrg 
   1134      1.1  mrg   const hsa_executable_symbol_info_t info_name
   1135      1.1  mrg     = HSA_EXECUTABLE_SYMBOL_INFO_NAME;
   1136      1.1  mrg 
   1137      1.1  mrg   status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);
   1138      1.1  mrg 
   1139      1.1  mrg   if (status != HSA_STATUS_SUCCESS)
   1140      1.1  mrg     {
   1141      1.1  mrg       hsa_error ("Could not get symbol name", status);
   1142      1.1  mrg       free (res);
   1143      1.1  mrg       return NULL;
   1144      1.1  mrg     }
   1145      1.1  mrg 
   1146      1.1  mrg   res[len] = '\0';
   1147      1.1  mrg 
   1148      1.1  mrg   return res;
   1149      1.1  mrg }
   1150      1.1  mrg 
   1151      1.1  mrg /* Get the number of GPU Compute Units.  */
   1152      1.1  mrg 
   1153      1.1  mrg static int
   1154      1.1  mrg get_cu_count (struct agent_info *agent)
   1155      1.1  mrg {
   1156      1.1  mrg   uint32_t cu_count;
   1157      1.1  mrg   hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
   1158      1.1  mrg     (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
   1159      1.1  mrg   if (status == HSA_STATUS_SUCCESS)
   1160      1.1  mrg     return cu_count;
   1161      1.1  mrg   else
   1162      1.1  mrg     return 64;  /* The usual number for older devices.  */
   1163      1.1  mrg }
   1164      1.1  mrg 
/* Clamp THREADS, the requested grid size for OMP threads / OACC workers,
   to the supported maximum.

   FIXME Do something more intelligent here.
   GCN can always run 4 threads within a Compute Unit, but
   more than that depends on register usage.  */

static int
limit_worker_threads (int threads)
{
  const int max_threads = 16;
  return threads > max_threads ? max_threads : threads;
}
   1178      1.1  mrg 
/* Clamp TEAMS to twice the number of AGENT's GPU Compute Units, to avoid
   memory waste and corresponding memory access faults.  */

static int
limit_teams (int teams, struct agent_info *agent)
{
  int cap = 2 * get_cu_count (agent);
  return teams > cap ? cap : teams;
}
   1190  1.1.1.3  mrg 
   1191      1.1  mrg /* Parse the target attributes INPUT provided by the compiler and return true
   1192      1.1  mrg    if we should run anything all.  If INPUT is NULL, fill DEF with default
   1193      1.1  mrg    values, then store INPUT or DEF into *RESULT.
   1194      1.1  mrg 
   1195      1.1  mrg    This is used for OpenMP only.  */
   1196      1.1  mrg 
   1197      1.1  mrg static bool
   1198      1.1  mrg parse_target_attributes (void **input,
   1199      1.1  mrg 			 struct GOMP_kernel_launch_attributes *def,
   1200      1.1  mrg 			 struct GOMP_kernel_launch_attributes **result,
   1201      1.1  mrg 			 struct agent_info *agent)
   1202      1.1  mrg {
   1203      1.1  mrg   if (!input)
   1204      1.1  mrg     GOMP_PLUGIN_fatal ("No target arguments provided");
   1205      1.1  mrg 
   1206      1.1  mrg   bool grid_attrs_found = false;
   1207      1.1  mrg   bool gcn_dims_found = false;
   1208      1.1  mrg   int gcn_teams = 0;
   1209      1.1  mrg   int gcn_threads = 0;
   1210      1.1  mrg   while (*input)
   1211      1.1  mrg     {
   1212      1.1  mrg       intptr_t id = (intptr_t) *input++, val;
   1213      1.1  mrg 
   1214      1.1  mrg       if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
   1215      1.1  mrg 	val = (intptr_t) *input++;
   1216      1.1  mrg       else
   1217      1.1  mrg 	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
   1218      1.1  mrg 
   1219      1.1  mrg       val = (val > INT_MAX) ? INT_MAX : val;
   1220      1.1  mrg 
   1221      1.1  mrg       if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
   1222      1.1  mrg 	  && ((id & GOMP_TARGET_ARG_ID_MASK)
   1223      1.1  mrg 	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
   1224      1.1  mrg 	{
   1225      1.1  mrg 	  grid_attrs_found = true;
   1226      1.1  mrg 	  break;
   1227      1.1  mrg 	}
   1228      1.1  mrg       else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
   1229      1.1  mrg 	       == GOMP_TARGET_ARG_DEVICE_ALL)
   1230      1.1  mrg 	{
   1231      1.1  mrg 	  gcn_dims_found = true;
   1232      1.1  mrg 	  switch (id & GOMP_TARGET_ARG_ID_MASK)
   1233      1.1  mrg 	    {
   1234      1.1  mrg 	    case GOMP_TARGET_ARG_NUM_TEAMS:
   1235  1.1.1.3  mrg 	      gcn_teams = limit_teams (val, agent);
   1236      1.1  mrg 	      break;
   1237      1.1  mrg 	    case GOMP_TARGET_ARG_THREAD_LIMIT:
   1238      1.1  mrg 	      gcn_threads = limit_worker_threads (val);
   1239      1.1  mrg 	      break;
   1240      1.1  mrg 	    default:
   1241      1.1  mrg 	      ;
   1242      1.1  mrg 	    }
   1243      1.1  mrg 	}
   1244      1.1  mrg     }
   1245      1.1  mrg 
   1246      1.1  mrg   if (gcn_dims_found)
   1247      1.1  mrg     {
   1248  1.1.1.2  mrg       bool gfx900_workaround_p = false;
   1249  1.1.1.2  mrg 
   1250      1.1  mrg       if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
   1251      1.1  mrg 	  && gcn_threads == 0 && override_z_dim == 0)
   1252      1.1  mrg 	{
   1253  1.1.1.2  mrg 	  gfx900_workaround_p = true;
   1254      1.1  mrg 	  GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
   1255  1.1.1.2  mrg 		       "threads to at most 4 per team.\n");
   1256      1.1  mrg 	  GCN_WARNING (" - If this is not a Vega 10 device, please use "
   1257      1.1  mrg 		       "GCN_NUM_THREADS=16\n");
   1258      1.1  mrg 	}
   1259      1.1  mrg 
   1260  1.1.1.2  mrg       /* Ideally, when a dimension isn't explicitly specified, we should
   1261  1.1.1.2  mrg 	 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
   1262  1.1.1.2  mrg 	 In practice, we tune for peak performance on BabelStream, which
   1263  1.1.1.2  mrg 	 for OpenACC is currently 32 threads per CU.  */
   1264      1.1  mrg       def->ndim = 3;
   1265  1.1.1.2  mrg       if (gcn_teams <= 0 && gcn_threads <= 0)
   1266  1.1.1.2  mrg 	{
   1267  1.1.1.2  mrg 	  /* Set up a reasonable number of teams and threads.  */
   1268  1.1.1.2  mrg 	  gcn_threads = gfx900_workaround_p ? 4 : 16; // 8;
   1269  1.1.1.2  mrg 	  def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
   1270  1.1.1.2  mrg 	  def->gdims[2] = gcn_threads;
   1271  1.1.1.2  mrg 	}
   1272  1.1.1.2  mrg       else if (gcn_teams <= 0 && gcn_threads > 0)
   1273  1.1.1.2  mrg 	{
   1274  1.1.1.2  mrg 	  /* Auto-scale the number of teams with the number of threads.  */
   1275  1.1.1.2  mrg 	  def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
   1276  1.1.1.2  mrg 	  def->gdims[2] = gcn_threads;
   1277  1.1.1.2  mrg 	}
   1278  1.1.1.2  mrg       else if (gcn_teams > 0 && gcn_threads <= 0)
   1279  1.1.1.2  mrg 	{
   1280  1.1.1.2  mrg 	  int max_threads = gfx900_workaround_p ? 4 : 16;
   1281  1.1.1.2  mrg 
   1282  1.1.1.2  mrg 	  /* Auto-scale the number of threads with the number of teams.  */
   1283  1.1.1.2  mrg 	  def->gdims[0] = gcn_teams;
   1284  1.1.1.2  mrg 	  def->gdims[2] = 16; // get_cu_count (agent) * 40 / gcn_teams;
   1285  1.1.1.2  mrg 	  if (def->gdims[2] == 0)
   1286  1.1.1.2  mrg 	    def->gdims[2] = 1;
   1287  1.1.1.2  mrg 	  else if (def->gdims[2] > max_threads)
   1288  1.1.1.2  mrg 	    def->gdims[2] = max_threads;
   1289  1.1.1.2  mrg 	}
   1290  1.1.1.2  mrg       else
   1291  1.1.1.2  mrg 	{
   1292  1.1.1.2  mrg 	  def->gdims[0] = gcn_teams;
   1293  1.1.1.2  mrg 	  def->gdims[2] = gcn_threads;
   1294  1.1.1.2  mrg 	}
   1295  1.1.1.2  mrg       def->gdims[1] = 64; /* Each thread is 64 work items wide.  */
   1296  1.1.1.2  mrg       def->wdims[0] = 1;  /* Single team per work-group.  */
   1297      1.1  mrg       def->wdims[1] = 64;
   1298      1.1  mrg       def->wdims[2] = 16;
   1299      1.1  mrg       *result = def;
   1300      1.1  mrg       return true;
   1301      1.1  mrg     }
   1302      1.1  mrg   else if (!grid_attrs_found)
   1303      1.1  mrg     {
   1304      1.1  mrg       def->ndim = 1;
   1305      1.1  mrg       def->gdims[0] = 1;
   1306      1.1  mrg       def->gdims[1] = 1;
   1307      1.1  mrg       def->gdims[2] = 1;
   1308      1.1  mrg       def->wdims[0] = 1;
   1309      1.1  mrg       def->wdims[1] = 1;
   1310      1.1  mrg       def->wdims[2] = 1;
   1311      1.1  mrg       *result = def;
   1312      1.1  mrg       GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
   1313      1.1  mrg       return true;
   1314      1.1  mrg     }
   1315      1.1  mrg 
   1316      1.1  mrg   struct GOMP_kernel_launch_attributes *kla;
   1317      1.1  mrg   kla = (struct GOMP_kernel_launch_attributes *) *input;
   1318      1.1  mrg   *result = kla;
   1319      1.1  mrg   if (kla->ndim == 0 || kla->ndim > 3)
   1320      1.1  mrg     GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);
   1321      1.1  mrg 
   1322      1.1  mrg   GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
   1323      1.1  mrg   unsigned i;
   1324      1.1  mrg   for (i = 0; i < kla->ndim; i++)
   1325      1.1  mrg     {
   1326      1.1  mrg       GCN_DEBUG ("  Dimension %u: grid size %u and group size %u\n", i,
   1327      1.1  mrg 		 kla->gdims[i], kla->wdims[i]);
   1328      1.1  mrg       if (kla->gdims[i] == 0)
   1329      1.1  mrg 	return false;
   1330      1.1  mrg     }
   1331      1.1  mrg   return true;
   1332      1.1  mrg }
   1333      1.1  mrg 
   1334      1.1  mrg /* Return the group size given the requested GROUP size, GRID size and number
   1335      1.1  mrg    of grid dimensions NDIM.  */
   1336      1.1  mrg 
   1337      1.1  mrg static uint32_t
   1338      1.1  mrg get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
   1339      1.1  mrg {
   1340      1.1  mrg   if (group == 0)
   1341      1.1  mrg     {
   1342      1.1  mrg       /* TODO: Provide a default via environment or device characteristics.  */
   1343      1.1  mrg       if (ndim == 1)
   1344      1.1  mrg 	group = 64;
   1345      1.1  mrg       else if (ndim == 2)
   1346      1.1  mrg 	group = 8;
   1347      1.1  mrg       else
   1348      1.1  mrg 	group = 4;
   1349      1.1  mrg     }
   1350      1.1  mrg 
   1351      1.1  mrg   if (group > grid)
   1352      1.1  mrg     group = grid;
   1353      1.1  mrg   return group;
   1354      1.1  mrg }
   1355      1.1  mrg 
   1356      1.1  mrg /* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */
   1357      1.1  mrg 
   1358      1.1  mrg static void
   1359      1.1  mrg packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
   1360      1.1  mrg {
   1361      1.1  mrg   __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
   1362      1.1  mrg }
   1363      1.1  mrg 
   1364      1.1  mrg /* A never-called callback for the HSA command queues.  These signal events
   1365      1.1  mrg    that we don't use, so we trigger an error.
   1366      1.1  mrg 
   1367      1.1  mrg    This "queue" is not to be confused with the async queues, below.  */
   1368      1.1  mrg 
   1369      1.1  mrg static void
   1370      1.1  mrg hsa_queue_callback (hsa_status_t status,
   1371      1.1  mrg 		hsa_queue_t *queue __attribute__ ((unused)),
   1372      1.1  mrg 		void *data __attribute__ ((unused)))
   1373      1.1  mrg {
   1374      1.1  mrg   hsa_fatal ("Asynchronous queue error", status);
   1375      1.1  mrg }
   1376      1.1  mrg 
   1377      1.1  mrg /* }}}  */
   1378      1.1  mrg /* {{{ HSA initialization  */
   1379      1.1  mrg 
   1380      1.1  mrg /* Populate hsa_fns with the function addresses from libhsa-runtime64.so.  */
   1381      1.1  mrg 
   1382      1.1  mrg static bool
   1383      1.1  mrg init_hsa_runtime_functions (void)
   1384      1.1  mrg {
   1385      1.1  mrg #define DLSYM_FN(function) \
   1386      1.1  mrg   hsa_fns.function##_fn = dlsym (handle, #function); \
   1387      1.1  mrg   if (hsa_fns.function##_fn == NULL) \
   1388  1.1.1.3  mrg     GOMP_PLUGIN_fatal ("'%s' is missing '%s'", hsa_runtime_lib, #function);
   1389  1.1.1.3  mrg #define DLSYM_OPT_FN(function) \
   1390  1.1.1.3  mrg   hsa_fns.function##_fn = dlsym (handle, #function);
   1391  1.1.1.3  mrg 
   1392      1.1  mrg   void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
   1393      1.1  mrg   if (handle == NULL)
   1394      1.1  mrg     return false;
   1395      1.1  mrg 
   1396      1.1  mrg   DLSYM_FN (hsa_status_string)
   1397      1.1  mrg   DLSYM_FN (hsa_system_get_info)
   1398      1.1  mrg   DLSYM_FN (hsa_agent_get_info)
   1399      1.1  mrg   DLSYM_FN (hsa_init)
   1400      1.1  mrg   DLSYM_FN (hsa_iterate_agents)
   1401      1.1  mrg   DLSYM_FN (hsa_region_get_info)
   1402      1.1  mrg   DLSYM_FN (hsa_queue_create)
   1403      1.1  mrg   DLSYM_FN (hsa_agent_iterate_regions)
   1404      1.1  mrg   DLSYM_FN (hsa_executable_destroy)
   1405      1.1  mrg   DLSYM_FN (hsa_executable_create)
   1406      1.1  mrg   DLSYM_FN (hsa_executable_global_variable_define)
   1407      1.1  mrg   DLSYM_FN (hsa_executable_load_code_object)
   1408      1.1  mrg   DLSYM_FN (hsa_executable_freeze)
   1409      1.1  mrg   DLSYM_FN (hsa_signal_create)
   1410      1.1  mrg   DLSYM_FN (hsa_memory_allocate)
   1411      1.1  mrg   DLSYM_FN (hsa_memory_assign_agent)
   1412      1.1  mrg   DLSYM_FN (hsa_memory_copy)
   1413      1.1  mrg   DLSYM_FN (hsa_memory_free)
   1414      1.1  mrg   DLSYM_FN (hsa_signal_destroy)
   1415      1.1  mrg   DLSYM_FN (hsa_executable_get_symbol)
   1416      1.1  mrg   DLSYM_FN (hsa_executable_symbol_get_info)
   1417      1.1  mrg   DLSYM_FN (hsa_executable_iterate_symbols)
   1418      1.1  mrg   DLSYM_FN (hsa_queue_add_write_index_release)
   1419      1.1  mrg   DLSYM_FN (hsa_queue_load_read_index_acquire)
   1420      1.1  mrg   DLSYM_FN (hsa_signal_wait_acquire)
   1421      1.1  mrg   DLSYM_FN (hsa_signal_store_relaxed)
   1422      1.1  mrg   DLSYM_FN (hsa_signal_store_release)
   1423      1.1  mrg   DLSYM_FN (hsa_signal_load_acquire)
   1424      1.1  mrg   DLSYM_FN (hsa_queue_destroy)
   1425      1.1  mrg   DLSYM_FN (hsa_code_object_deserialize)
   1426  1.1.1.3  mrg   DLSYM_OPT_FN (hsa_amd_memory_lock)
   1427  1.1.1.3  mrg   DLSYM_OPT_FN (hsa_amd_memory_unlock)
   1428  1.1.1.3  mrg   DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect)
   1429      1.1  mrg   return true;
   1430  1.1.1.3  mrg #undef DLSYM_OPT_FN
   1431      1.1  mrg #undef DLSYM_FN
   1432      1.1  mrg }
   1433      1.1  mrg 
   1434  1.1.1.3  mrg static gcn_isa isa_code (const char *isa);
   1435  1.1.1.3  mrg 
   1436      1.1  mrg /* Return true if the agent is a GPU and can accept of concurrent submissions
   1437      1.1  mrg    from different threads.  */
   1438      1.1  mrg 
   1439      1.1  mrg static bool
   1440      1.1  mrg suitable_hsa_agent_p (hsa_agent_t agent)
   1441      1.1  mrg {
   1442      1.1  mrg   hsa_device_type_t device_type;
   1443      1.1  mrg   hsa_status_t status
   1444      1.1  mrg     = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
   1445      1.1  mrg 				     &device_type);
   1446      1.1  mrg   if (status != HSA_STATUS_SUCCESS)
   1447      1.1  mrg     return false;
   1448      1.1  mrg 
   1449      1.1  mrg   switch (device_type)
   1450      1.1  mrg     {
   1451      1.1  mrg     case HSA_DEVICE_TYPE_GPU:
   1452  1.1.1.3  mrg       {
   1453  1.1.1.3  mrg 	char name[64];
   1454  1.1.1.3  mrg 	hsa_status_t status
   1455  1.1.1.3  mrg 	  = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME, name);
   1456  1.1.1.3  mrg 	if (status != HSA_STATUS_SUCCESS
   1457  1.1.1.3  mrg 	    || isa_code (name) == EF_AMDGPU_MACH_UNSUPPORTED)
   1458  1.1.1.3  mrg 	  {
   1459  1.1.1.3  mrg 	    GCN_DEBUG ("Ignoring unsupported agent '%s'\n",
   1460  1.1.1.3  mrg 		       status == HSA_STATUS_SUCCESS ? name : "invalid");
   1461  1.1.1.3  mrg 	    return false;
   1462  1.1.1.3  mrg 	  }
   1463  1.1.1.3  mrg       }
   1464      1.1  mrg       break;
   1465      1.1  mrg     case HSA_DEVICE_TYPE_CPU:
   1466      1.1  mrg       if (!support_cpu_devices)
   1467      1.1  mrg 	return false;
   1468      1.1  mrg       break;
   1469      1.1  mrg     default:
   1470      1.1  mrg       return false;
   1471      1.1  mrg     }
   1472      1.1  mrg 
   1473      1.1  mrg   uint32_t features = 0;
   1474      1.1  mrg   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
   1475      1.1  mrg 					  &features);
   1476      1.1  mrg   if (status != HSA_STATUS_SUCCESS
   1477      1.1  mrg       || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
   1478      1.1  mrg     return false;
   1479      1.1  mrg   hsa_queue_type_t queue_type;
   1480      1.1  mrg   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
   1481      1.1  mrg 					  &queue_type);
   1482      1.1  mrg   if (status != HSA_STATUS_SUCCESS
   1483      1.1  mrg       || (queue_type != HSA_QUEUE_TYPE_MULTI))
   1484      1.1  mrg     return false;
   1485      1.1  mrg 
   1486      1.1  mrg   return true;
   1487      1.1  mrg }
   1488      1.1  mrg 
   1489      1.1  mrg /* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
   1490      1.1  mrg    agent_count in hsa_context.  */
   1491      1.1  mrg 
   1492      1.1  mrg static hsa_status_t
   1493      1.1  mrg count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
   1494      1.1  mrg {
   1495      1.1  mrg   if (suitable_hsa_agent_p (agent))
   1496      1.1  mrg     hsa_context.agent_count++;
   1497      1.1  mrg   return HSA_STATUS_SUCCESS;
   1498      1.1  mrg }
   1499      1.1  mrg 
   1500      1.1  mrg /* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
   1501      1.1  mrg    id to the describing structure in the hsa context.  The index of the
   1502      1.1  mrg    structure is pointed to by DATA, increment it afterwards.  */
   1503      1.1  mrg 
   1504      1.1  mrg static hsa_status_t
   1505      1.1  mrg assign_agent_ids (hsa_agent_t agent, void *data)
   1506      1.1  mrg {
   1507      1.1  mrg   if (suitable_hsa_agent_p (agent))
   1508      1.1  mrg     {
   1509      1.1  mrg       int *agent_index = (int *) data;
   1510      1.1  mrg       hsa_context.agents[*agent_index].id = agent;
   1511      1.1  mrg       ++*agent_index;
   1512      1.1  mrg     }
   1513      1.1  mrg   return HSA_STATUS_SUCCESS;
   1514      1.1  mrg }
   1515      1.1  mrg 
   1516      1.1  mrg /* Initialize hsa_context if it has not already been done.
   1517  1.1.1.3  mrg    If !PROBE: returns TRUE on success.
   1518  1.1.1.3  mrg    If PROBE: returns TRUE on success or if the plugin/device shall be silently
   1519  1.1.1.3  mrg    ignored, and otherwise emits an error and returns FALSE.  */
   1520      1.1  mrg 
   1521      1.1  mrg static bool
   1522  1.1.1.3  mrg init_hsa_context (bool probe)
   1523      1.1  mrg {
   1524      1.1  mrg   hsa_status_t status;
   1525      1.1  mrg   int agent_index = 0;
   1526      1.1  mrg 
   1527      1.1  mrg   if (hsa_context.initialized)
   1528      1.1  mrg     return true;
   1529      1.1  mrg   init_environment_variables ();
   1530      1.1  mrg   if (!init_hsa_runtime_functions ())
   1531      1.1  mrg     {
   1532  1.1.1.3  mrg       const char *msg = "Run-time could not be dynamically opened";
   1533      1.1  mrg       if (suppress_host_fallback)
   1534  1.1.1.3  mrg 	GOMP_PLUGIN_fatal ("%s\n", msg);
   1535  1.1.1.3  mrg       else
   1536  1.1.1.3  mrg 	GCN_WARNING ("%s\n", msg);
   1537  1.1.1.3  mrg       return probe ? true : false;
   1538      1.1  mrg     }
   1539      1.1  mrg   status = hsa_fns.hsa_init_fn ();
   1540      1.1  mrg   if (status != HSA_STATUS_SUCCESS)
   1541      1.1  mrg     return hsa_error ("Run-time could not be initialized", status);
   1542      1.1  mrg   GCN_DEBUG ("HSA run-time initialized for GCN\n");
   1543      1.1  mrg 
   1544      1.1  mrg   if (debug)
   1545      1.1  mrg     dump_hsa_system_info ();
   1546      1.1  mrg 
   1547      1.1  mrg   status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
   1548      1.1  mrg   if (status != HSA_STATUS_SUCCESS)
   1549      1.1  mrg     return hsa_error ("GCN GPU devices could not be enumerated", status);
   1550      1.1  mrg   GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);
   1551      1.1  mrg 
   1552      1.1  mrg   hsa_context.agents
   1553      1.1  mrg     = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
   1554      1.1  mrg 				  * sizeof (struct agent_info));
   1555      1.1  mrg   status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
   1556      1.1  mrg   if (status != HSA_STATUS_SUCCESS)
   1557      1.1  mrg     return hsa_error ("Scanning compute agents failed", status);
   1558      1.1  mrg   if (agent_index != hsa_context.agent_count)
   1559      1.1  mrg     {
   1560      1.1  mrg       GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
   1561      1.1  mrg       return false;
   1562      1.1  mrg     }
   1563      1.1  mrg 
   1564      1.1  mrg   if (debug)
   1565      1.1  mrg     {
   1566      1.1  mrg       status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
   1567      1.1  mrg       if (status != HSA_STATUS_SUCCESS)
   1568      1.1  mrg 	GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
   1569      1.1  mrg     }
   1570      1.1  mrg 
   1571      1.1  mrg   uint16_t minor, major;
   1572      1.1  mrg   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
   1573      1.1  mrg 					   &minor);
   1574      1.1  mrg   if (status != HSA_STATUS_SUCCESS)
   1575      1.1  mrg     GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
   1576      1.1  mrg   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
   1577      1.1  mrg 					   &major);
   1578      1.1  mrg   if (status != HSA_STATUS_SUCCESS)
   1579      1.1  mrg     GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");
   1580      1.1  mrg 
   1581      1.1  mrg   size_t len = sizeof hsa_context.driver_version_s;
   1582      1.1  mrg   int printed = snprintf (hsa_context.driver_version_s, len,
   1583      1.1  mrg 			  "HSA Runtime %hu.%hu", (unsigned short int)major,
   1584      1.1  mrg 			  (unsigned short int)minor);
   1585      1.1  mrg   if (printed >= len)
   1586      1.1  mrg     GCN_WARNING ("HSA runtime version string was truncated."
   1587      1.1  mrg 		 "Version %hu.%hu is too long.", (unsigned short int)major,
   1588      1.1  mrg 		 (unsigned short int)minor);
   1589      1.1  mrg 
   1590      1.1  mrg   hsa_context.initialized = true;
   1591      1.1  mrg   return true;
   1592      1.1  mrg }
   1593      1.1  mrg 
   1594      1.1  mrg /* Verify that hsa_context has already been initialized and return the
   1595      1.1  mrg    agent_info structure describing device number N.  Return NULL on error.  */
   1596      1.1  mrg 
   1597      1.1  mrg static struct agent_info *
   1598      1.1  mrg get_agent_info (int n)
   1599      1.1  mrg {
   1600      1.1  mrg   if (!hsa_context.initialized)
   1601      1.1  mrg     {
   1602      1.1  mrg       GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
   1603      1.1  mrg       return NULL;
   1604      1.1  mrg     }
   1605      1.1  mrg   if (n >= hsa_context.agent_count)
   1606      1.1  mrg     {
   1607      1.1  mrg       GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
   1608      1.1  mrg       return NULL;
   1609      1.1  mrg     }
   1610      1.1  mrg   if (!hsa_context.agents[n].initialized)
   1611      1.1  mrg     {
   1612      1.1  mrg       GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
   1613      1.1  mrg       return NULL;
   1614      1.1  mrg     }
   1615      1.1  mrg   return &hsa_context.agents[n];
   1616      1.1  mrg }
   1617      1.1  mrg 
   1618      1.1  mrg /* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.
   1619      1.1  mrg 
   1620      1.1  mrg    Selects (breaks at) a suitable region of type KIND.  */
   1621      1.1  mrg 
   1622      1.1  mrg static hsa_status_t
   1623      1.1  mrg get_memory_region (hsa_region_t region, hsa_region_t *retval,
   1624      1.1  mrg 		   hsa_region_global_flag_t kind)
   1625      1.1  mrg {
   1626      1.1  mrg   hsa_status_t status;
   1627      1.1  mrg   hsa_region_segment_t segment;
   1628      1.1  mrg 
   1629      1.1  mrg   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
   1630      1.1  mrg 					   &segment);
   1631      1.1  mrg   if (status != HSA_STATUS_SUCCESS)
   1632      1.1  mrg     return status;
   1633      1.1  mrg   if (segment != HSA_REGION_SEGMENT_GLOBAL)
   1634      1.1  mrg     return HSA_STATUS_SUCCESS;
   1635      1.1  mrg 
   1636      1.1  mrg   uint32_t flags;
   1637      1.1  mrg   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
   1638      1.1  mrg 					   &flags);
   1639      1.1  mrg   if (status != HSA_STATUS_SUCCESS)
   1640      1.1  mrg     return status;
   1641      1.1  mrg   if (flags & kind)
   1642      1.1  mrg     {
   1643      1.1  mrg       *retval = region;
   1644      1.1  mrg       return HSA_STATUS_INFO_BREAK;
   1645      1.1  mrg     }
   1646      1.1  mrg   return HSA_STATUS_SUCCESS;
   1647      1.1  mrg }
   1648      1.1  mrg 
   1649      1.1  mrg /* Callback of hsa_agent_iterate_regions.
   1650      1.1  mrg 
   1651      1.1  mrg    Selects a kernargs memory region.  */
   1652      1.1  mrg 
   1653      1.1  mrg static hsa_status_t
   1654      1.1  mrg get_kernarg_memory_region (hsa_region_t region, void *data)
   1655      1.1  mrg {
   1656      1.1  mrg   return get_memory_region (region, (hsa_region_t *)data,
   1657      1.1  mrg 			    HSA_REGION_GLOBAL_FLAG_KERNARG);
   1658      1.1  mrg }
   1659      1.1  mrg 
   1660      1.1  mrg /* Callback of hsa_agent_iterate_regions.
   1661      1.1  mrg 
   1662      1.1  mrg    Selects a coarse-grained memory region suitable for the heap and
   1663      1.1  mrg    offload data.  */
   1664      1.1  mrg 
   1665      1.1  mrg static hsa_status_t
   1666      1.1  mrg get_data_memory_region (hsa_region_t region, void *data)
   1667      1.1  mrg {
   1668      1.1  mrg   return get_memory_region (region, (hsa_region_t *)data,
   1669      1.1  mrg 			    HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
   1670      1.1  mrg }
   1671      1.1  mrg 
   1672      1.1  mrg static int
   1673      1.1  mrg elf_gcn_isa_field (Elf64_Ehdr *image)
   1674      1.1  mrg {
   1675      1.1  mrg   return image->e_flags & EF_AMDGPU_MACH_MASK;
   1676      1.1  mrg }
   1677      1.1  mrg 
   1678      1.1  mrg const static char *gcn_gfx803_s = "gfx803";
   1679      1.1  mrg const static char *gcn_gfx900_s = "gfx900";
   1680      1.1  mrg const static char *gcn_gfx906_s = "gfx906";
   1681  1.1.1.2  mrg const static char *gcn_gfx908_s = "gfx908";
   1682  1.1.1.3  mrg const static char *gcn_gfx90a_s = "gfx90a";
   1683  1.1.1.3  mrg const static char *gcn_gfx90c_s = "gfx90c";
   1684  1.1.1.3  mrg const static char *gcn_gfx1030_s = "gfx1030";
   1685  1.1.1.3  mrg const static char *gcn_gfx1036_s = "gfx1036";
   1686  1.1.1.3  mrg const static char *gcn_gfx1100_s = "gfx1100";
   1687  1.1.1.3  mrg const static char *gcn_gfx1103_s = "gfx1103";
   1688  1.1.1.3  mrg const static int gcn_isa_name_len = 7;
   1689      1.1  mrg 
   1690      1.1  mrg /* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
   1691      1.1  mrg    support the ISA. */
   1692      1.1  mrg 
   1693      1.1  mrg static const char*
   1694      1.1  mrg isa_hsa_name (int isa) {
   1695      1.1  mrg   switch(isa)
   1696      1.1  mrg     {
   1697      1.1  mrg     case EF_AMDGPU_MACH_AMDGCN_GFX803:
   1698      1.1  mrg       return gcn_gfx803_s;
   1699      1.1  mrg     case EF_AMDGPU_MACH_AMDGCN_GFX900:
   1700      1.1  mrg       return gcn_gfx900_s;
   1701      1.1  mrg     case EF_AMDGPU_MACH_AMDGCN_GFX906:
   1702      1.1  mrg       return gcn_gfx906_s;
   1703  1.1.1.2  mrg     case EF_AMDGPU_MACH_AMDGCN_GFX908:
   1704  1.1.1.2  mrg       return gcn_gfx908_s;
   1705  1.1.1.3  mrg     case EF_AMDGPU_MACH_AMDGCN_GFX90a:
   1706  1.1.1.3  mrg       return gcn_gfx90a_s;
   1707  1.1.1.3  mrg     case EF_AMDGPU_MACH_AMDGCN_GFX90c:
   1708  1.1.1.3  mrg       return gcn_gfx90c_s;
   1709  1.1.1.3  mrg     case EF_AMDGPU_MACH_AMDGCN_GFX1030:
   1710  1.1.1.3  mrg       return gcn_gfx1030_s;
   1711  1.1.1.3  mrg     case EF_AMDGPU_MACH_AMDGCN_GFX1036:
   1712  1.1.1.3  mrg       return gcn_gfx1036_s;
   1713  1.1.1.3  mrg     case EF_AMDGPU_MACH_AMDGCN_GFX1100:
   1714  1.1.1.3  mrg       return gcn_gfx1100_s;
   1715  1.1.1.3  mrg     case EF_AMDGPU_MACH_AMDGCN_GFX1103:
   1716  1.1.1.3  mrg       return gcn_gfx1103_s;
   1717      1.1  mrg     }
   1718      1.1  mrg   return NULL;
   1719      1.1  mrg }
   1720      1.1  mrg 
   1721      1.1  mrg /* Returns the user-facing name that GCC uses to identify the architecture (e.g.
   1722      1.1  mrg    with -march) or NULL if we do not support the ISA.
   1723      1.1  mrg    Keep in sync with /gcc/config/gcn/gcn.{c,opt}.  */
   1724      1.1  mrg 
   1725      1.1  mrg static const char*
   1726      1.1  mrg isa_gcc_name (int isa) {
   1727      1.1  mrg   switch(isa)
   1728      1.1  mrg     {
   1729      1.1  mrg     case EF_AMDGPU_MACH_AMDGCN_GFX803:
   1730      1.1  mrg       return "fiji";
   1731      1.1  mrg     default:
   1732      1.1  mrg       return isa_hsa_name (isa);
   1733      1.1  mrg     }
   1734      1.1  mrg }
   1735      1.1  mrg 
   1736      1.1  mrg /* Returns the code which is used in the GCN object code to identify the ISA with
   1737      1.1  mrg    the given name (as used by the HSA runtime).  */
   1738      1.1  mrg 
   1739      1.1  mrg static gcn_isa
   1740      1.1  mrg isa_code(const char *isa) {
   1741      1.1  mrg   if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
   1742      1.1  mrg     return EF_AMDGPU_MACH_AMDGCN_GFX803;
   1743      1.1  mrg 
   1744      1.1  mrg   if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
   1745      1.1  mrg     return EF_AMDGPU_MACH_AMDGCN_GFX900;
   1746      1.1  mrg 
   1747      1.1  mrg   if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
   1748      1.1  mrg     return EF_AMDGPU_MACH_AMDGCN_GFX906;
   1749      1.1  mrg 
   1750  1.1.1.2  mrg   if (!strncmp (isa, gcn_gfx908_s, gcn_isa_name_len))
   1751  1.1.1.2  mrg     return EF_AMDGPU_MACH_AMDGCN_GFX908;
   1752  1.1.1.2  mrg 
   1753  1.1.1.3  mrg   if (!strncmp (isa, gcn_gfx90a_s, gcn_isa_name_len))
   1754  1.1.1.3  mrg     return EF_AMDGPU_MACH_AMDGCN_GFX90a;
   1755  1.1.1.3  mrg 
   1756  1.1.1.3  mrg   if (!strncmp (isa, gcn_gfx90c_s, gcn_isa_name_len))
   1757  1.1.1.3  mrg     return EF_AMDGPU_MACH_AMDGCN_GFX90c;
   1758  1.1.1.3  mrg 
   1759  1.1.1.3  mrg   if (!strncmp (isa, gcn_gfx1030_s, gcn_isa_name_len))
   1760  1.1.1.3  mrg     return EF_AMDGPU_MACH_AMDGCN_GFX1030;
   1761  1.1.1.3  mrg 
   1762  1.1.1.3  mrg   if (!strncmp (isa, gcn_gfx1036_s, gcn_isa_name_len))
   1763  1.1.1.3  mrg     return EF_AMDGPU_MACH_AMDGCN_GFX1036;
   1764  1.1.1.3  mrg 
   1765  1.1.1.3  mrg   if (!strncmp (isa, gcn_gfx1100_s, gcn_isa_name_len))
   1766  1.1.1.3  mrg     return EF_AMDGPU_MACH_AMDGCN_GFX1100;
   1767  1.1.1.3  mrg 
   1768  1.1.1.3  mrg   if (!strncmp (isa, gcn_gfx1103_s, gcn_isa_name_len))
   1769  1.1.1.3  mrg     return EF_AMDGPU_MACH_AMDGCN_GFX1103;
   1770  1.1.1.3  mrg 
   1771  1.1.1.3  mrg   return EF_AMDGPU_MACH_UNSUPPORTED;
   1772  1.1.1.3  mrg }
   1773  1.1.1.3  mrg 
   1774  1.1.1.3  mrg /* CDNA2 devices have twice as many VGPRs compared to older devices.  */
   1775  1.1.1.3  mrg 
   1776  1.1.1.3  mrg static int
   1777  1.1.1.3  mrg max_isa_vgprs (int isa)
   1778  1.1.1.3  mrg {
   1779  1.1.1.3  mrg   switch (isa)
   1780  1.1.1.3  mrg     {
   1781  1.1.1.3  mrg     case EF_AMDGPU_MACH_AMDGCN_GFX803:
   1782  1.1.1.3  mrg     case EF_AMDGPU_MACH_AMDGCN_GFX900:
   1783  1.1.1.3  mrg     case EF_AMDGPU_MACH_AMDGCN_GFX906:
   1784  1.1.1.3  mrg     case EF_AMDGPU_MACH_AMDGCN_GFX908:
   1785  1.1.1.3  mrg       return 256;
   1786  1.1.1.3  mrg     case EF_AMDGPU_MACH_AMDGCN_GFX90a:
   1787  1.1.1.3  mrg       return 512;
   1788  1.1.1.3  mrg     case EF_AMDGPU_MACH_AMDGCN_GFX90c:
   1789  1.1.1.3  mrg       return 256;
   1790  1.1.1.3  mrg     case EF_AMDGPU_MACH_AMDGCN_GFX1030:
   1791  1.1.1.3  mrg     case EF_AMDGPU_MACH_AMDGCN_GFX1036:
   1792  1.1.1.3  mrg       return 512;  /* 512 SIMD32 = 256 wavefrontsize64.  */
   1793  1.1.1.3  mrg     case EF_AMDGPU_MACH_AMDGCN_GFX1100:
   1794  1.1.1.3  mrg     case EF_AMDGPU_MACH_AMDGCN_GFX1103:
   1795  1.1.1.3  mrg       return 1536; /* 1536 SIMD32 = 768 wavefrontsize64.  */
   1796  1.1.1.3  mrg     }
   1797  1.1.1.3  mrg   GOMP_PLUGIN_fatal ("unhandled ISA in max_isa_vgprs");
   1798      1.1  mrg }
   1799      1.1  mrg 
   1800      1.1  mrg /* }}}  */
   1801      1.1  mrg /* {{{ Run  */
   1802      1.1  mrg 
   1803  1.1.1.3  mrg /* Create or reuse a team arena and stack space.
   1804      1.1  mrg 
   1805      1.1  mrg    Team arenas are used by OpenMP to avoid calling malloc multiple times
   1806      1.1  mrg    while setting up each team.  This is purely a performance optimization.
   1807      1.1  mrg 
   1808  1.1.1.3  mrg    The stack space is used by all kernels.  We must allocate it in such a
   1809  1.1.1.3  mrg    way that the reverse offload implmentation can access the data.
   1810      1.1  mrg 
   1811  1.1.1.3  mrg    Allocating this memory costs performance, so this function will reuse an
   1812  1.1.1.3  mrg    existing allocation if a large enough one is idle.
   1813  1.1.1.3  mrg    The memory lock is released, but not deallocated, when the kernel exits.  */
   1814  1.1.1.3  mrg 
   1815  1.1.1.3  mrg static void
   1816  1.1.1.3  mrg configure_ephemeral_memories (struct kernel_info *kernel,
   1817  1.1.1.3  mrg 			      struct kernargs_abi *kernargs, int num_teams,
   1818  1.1.1.3  mrg 			      int num_threads)
   1819      1.1  mrg {
   1820  1.1.1.3  mrg   struct agent_info *agent = kernel->agent;
   1821  1.1.1.3  mrg   struct ephemeral_memories_list **next_ptr = &agent->ephemeral_memories_list;
   1822  1.1.1.3  mrg   struct ephemeral_memories_list *item;
   1823  1.1.1.3  mrg 
   1824  1.1.1.3  mrg   int actual_arena_size = (kernel->kind == KIND_OPENMP
   1825  1.1.1.3  mrg 			   ? team_arena_size : 0);
   1826  1.1.1.3  mrg   int actual_arena_total_size = actual_arena_size * num_teams;
   1827  1.1.1.3  mrg   size_t size = (actual_arena_total_size
   1828  1.1.1.3  mrg 		 + num_teams * num_threads * stack_size);
   1829      1.1  mrg 
   1830      1.1  mrg   for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
   1831      1.1  mrg     {
   1832  1.1.1.3  mrg       if (item->size < size)
   1833      1.1  mrg 	continue;
   1834      1.1  mrg 
   1835  1.1.1.3  mrg       if (pthread_mutex_trylock (&item->in_use) == 0)
   1836  1.1.1.3  mrg 	break;
   1837      1.1  mrg     }
   1838      1.1  mrg 
   1839  1.1.1.3  mrg   if (!item)
   1840      1.1  mrg     {
   1841  1.1.1.3  mrg       GCN_DEBUG ("Creating a new %sstack for %d teams with %d threads"
   1842  1.1.1.3  mrg 		 " (%zd bytes)\n", (actual_arena_size ? "arena and " : ""),
   1843  1.1.1.3  mrg 		 num_teams, num_threads, size);
   1844      1.1  mrg 
   1845  1.1.1.3  mrg       if (pthread_mutex_lock (&agent->ephemeral_memories_write_lock))
   1846  1.1.1.3  mrg 	{
   1847  1.1.1.3  mrg 	  GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
   1848  1.1.1.3  mrg 	  return;
   1849  1.1.1.3  mrg 	}
   1850  1.1.1.3  mrg       item = malloc (sizeof (*item));
   1851  1.1.1.3  mrg       item->size = size;
   1852  1.1.1.3  mrg       item->next = NULL;
   1853  1.1.1.3  mrg       *next_ptr = item;
   1854      1.1  mrg 
   1855  1.1.1.3  mrg       if (pthread_mutex_init (&item->in_use, NULL))
   1856  1.1.1.3  mrg 	{
   1857  1.1.1.3  mrg 	  GOMP_PLUGIN_error ("Failed to initialize a GCN memory write mutex");
   1858  1.1.1.3  mrg 	  return;
   1859  1.1.1.3  mrg 	}
   1860  1.1.1.3  mrg       if (pthread_mutex_lock (&item->in_use))
   1861  1.1.1.3  mrg 	{
   1862  1.1.1.3  mrg 	  GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
   1863  1.1.1.3  mrg 	  return;
   1864  1.1.1.3  mrg 	}
   1865  1.1.1.3  mrg       if (pthread_mutex_unlock (&agent->ephemeral_memories_write_lock))
   1866  1.1.1.3  mrg 	{
   1867  1.1.1.3  mrg 	  GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
   1868  1.1.1.3  mrg 	  return;
   1869  1.1.1.3  mrg 	}
   1870  1.1.1.3  mrg 
   1871  1.1.1.3  mrg       hsa_status_t status;
   1872  1.1.1.3  mrg       status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, size,
   1873  1.1.1.3  mrg 					       &item->address);
   1874  1.1.1.3  mrg       if (status != HSA_STATUS_SUCCESS)
   1875  1.1.1.3  mrg 	hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
   1876  1.1.1.3  mrg       status = hsa_fns.hsa_memory_assign_agent_fn (item->address, agent->id,
   1877  1.1.1.3  mrg 						   HSA_ACCESS_PERMISSION_RW);
   1878  1.1.1.3  mrg       if (status != HSA_STATUS_SUCCESS)
   1879  1.1.1.3  mrg 	hsa_fatal ("Could not assign arena & stack memory to device", status);
   1880  1.1.1.3  mrg     }
   1881      1.1  mrg 
   1882  1.1.1.3  mrg   kernargs->arena_ptr = (actual_arena_total_size
   1883  1.1.1.3  mrg 			 ? (uint64_t)item->address
   1884  1.1.1.3  mrg 			 : 0);
   1885  1.1.1.3  mrg   kernargs->stack_ptr = (uint64_t)item->address + actual_arena_total_size;
   1886  1.1.1.3  mrg   kernargs->arena_size_per_team = actual_arena_size;
   1887  1.1.1.3  mrg   kernargs->stack_size_per_thread = stack_size;
   1888      1.1  mrg }
   1889      1.1  mrg 
   1890  1.1.1.3  mrg /* Mark an ephemeral memory space available for reuse.  */
   1891      1.1  mrg 
   1892      1.1  mrg static void
   1893  1.1.1.3  mrg release_ephemeral_memories (struct agent_info* agent, void *address)
   1894      1.1  mrg {
   1895  1.1.1.3  mrg   struct ephemeral_memories_list *item;
   1896      1.1  mrg 
   1897  1.1.1.3  mrg   for (item = agent->ephemeral_memories_list; item; item = item->next)
   1898      1.1  mrg     {
   1899  1.1.1.3  mrg       if (item->address == address)
   1900      1.1  mrg 	{
   1901      1.1  mrg 	  if (pthread_mutex_unlock (&item->in_use))
   1902      1.1  mrg 	    GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
   1903      1.1  mrg 	  return;
   1904      1.1  mrg 	}
   1905      1.1  mrg     }
   1906      1.1  mrg   GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
   1907      1.1  mrg }
   1908      1.1  mrg 
   1909      1.1  mrg /* Clean up all the allocated team arenas.  */
   1910      1.1  mrg 
   1911      1.1  mrg static bool
   1912  1.1.1.3  mrg destroy_ephemeral_memories (struct agent_info *agent)
   1913      1.1  mrg {
   1914  1.1.1.3  mrg   struct ephemeral_memories_list *item, *next;
   1915      1.1  mrg 
   1916  1.1.1.3  mrg   for (item = agent->ephemeral_memories_list; item; item = next)
   1917      1.1  mrg     {
   1918      1.1  mrg       next = item->next;
   1919  1.1.1.3  mrg       hsa_fns.hsa_memory_free_fn (item->address);
   1920      1.1  mrg       if (pthread_mutex_destroy (&item->in_use))
   1921      1.1  mrg 	{
   1922  1.1.1.3  mrg 	  GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
   1923      1.1  mrg 	  return false;
   1924      1.1  mrg 	}
   1925      1.1  mrg       free (item);
   1926      1.1  mrg     }
   1927  1.1.1.3  mrg   agent->ephemeral_memories_list = NULL;
   1928      1.1  mrg 
   1929      1.1  mrg   return true;
   1930      1.1  mrg }
   1931      1.1  mrg 
   1932      1.1  mrg /* Allocate memory on a specified device.  */
   1933      1.1  mrg 
   1934      1.1  mrg static void *
   1935      1.1  mrg alloc_by_agent (struct agent_info *agent, size_t size)
   1936      1.1  mrg {
   1937      1.1  mrg   GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);
   1938      1.1  mrg 
   1939      1.1  mrg   void *ptr;
   1940      1.1  mrg   hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
   1941      1.1  mrg 							size, &ptr);
   1942      1.1  mrg   if (status != HSA_STATUS_SUCCESS)
   1943      1.1  mrg     {
   1944      1.1  mrg       hsa_error ("Could not allocate device memory", status);
   1945      1.1  mrg       return NULL;
   1946      1.1  mrg     }
   1947      1.1  mrg 
   1948      1.1  mrg   status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
   1949      1.1  mrg 					       HSA_ACCESS_PERMISSION_RW);
   1950      1.1  mrg   if (status != HSA_STATUS_SUCCESS)
   1951      1.1  mrg     {
   1952      1.1  mrg       hsa_error ("Could not assign data memory to device", status);
   1953      1.1  mrg       return NULL;
   1954      1.1  mrg     }
   1955      1.1  mrg 
   1956      1.1  mrg   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
   1957      1.1  mrg   bool profiling_dispatch_p
   1958      1.1  mrg     = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
   1959      1.1  mrg   if (profiling_dispatch_p)
   1960      1.1  mrg     {
   1961      1.1  mrg       acc_prof_info *prof_info = thr->prof_info;
   1962      1.1  mrg       acc_event_info data_event_info;
   1963      1.1  mrg       acc_api_info *api_info = thr->api_info;
   1964      1.1  mrg 
   1965      1.1  mrg       prof_info->event_type = acc_ev_alloc;
   1966      1.1  mrg 
   1967      1.1  mrg       data_event_info.data_event.event_type = prof_info->event_type;
   1968      1.1  mrg       data_event_info.data_event.valid_bytes
   1969      1.1  mrg 	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
   1970      1.1  mrg       data_event_info.data_event.parent_construct
   1971      1.1  mrg 	= acc_construct_parallel;
   1972      1.1  mrg       data_event_info.data_event.implicit = 1;
   1973      1.1  mrg       data_event_info.data_event.tool_info = NULL;
   1974      1.1  mrg       data_event_info.data_event.var_name = NULL;
   1975      1.1  mrg       data_event_info.data_event.bytes = size;
   1976      1.1  mrg       data_event_info.data_event.host_ptr = NULL;
   1977      1.1  mrg       data_event_info.data_event.device_ptr = (void *) ptr;
   1978      1.1  mrg 
   1979      1.1  mrg       api_info->device_api = acc_device_api_other;
   1980      1.1  mrg 
   1981      1.1  mrg       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
   1982      1.1  mrg 					    api_info);
   1983      1.1  mrg     }
   1984      1.1  mrg 
   1985      1.1  mrg   return ptr;
   1986      1.1  mrg }
   1987      1.1  mrg 
   1988      1.1  mrg /* Create kernel dispatch data structure for given KERNEL, along with
   1989      1.1  mrg    the necessary device signals and memory allocations.  */
   1990      1.1  mrg 
   1991      1.1  mrg static struct kernel_dispatch *
   1992  1.1.1.3  mrg create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
   1993  1.1.1.3  mrg 			int num_threads)
   1994      1.1  mrg {
   1995      1.1  mrg   struct agent_info *agent = kernel->agent;
   1996      1.1  mrg   struct kernel_dispatch *shadow
   1997      1.1  mrg     = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));
   1998      1.1  mrg 
   1999      1.1  mrg   shadow->agent = kernel->agent;
   2000      1.1  mrg   shadow->object = kernel->object;
   2001      1.1  mrg 
   2002      1.1  mrg   hsa_signal_t sync_signal;
   2003      1.1  mrg   hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
   2004      1.1  mrg   if (status != HSA_STATUS_SUCCESS)
   2005      1.1  mrg     hsa_fatal ("Error creating the GCN sync signal", status);
   2006      1.1  mrg 
   2007      1.1  mrg   shadow->signal = sync_signal.handle;
   2008      1.1  mrg   shadow->private_segment_size = kernel->private_segment_size;
   2009  1.1.1.3  mrg 
   2010  1.1.1.3  mrg   if (lowlat_size < 0)
   2011  1.1.1.3  mrg     {
   2012  1.1.1.3  mrg       /* Divide the LDS between the number of running teams.
   2013  1.1.1.3  mrg 	 Allocate not less than is defined in the kernel metadata.  */
   2014  1.1.1.3  mrg       int teams_per_cu = num_teams / get_cu_count (agent);
   2015  1.1.1.3  mrg       int LDS_per_team = (teams_per_cu ? 65536 / teams_per_cu : 65536);
   2016  1.1.1.3  mrg       shadow->group_segment_size
   2017  1.1.1.3  mrg 	= (kernel->group_segment_size > LDS_per_team
   2018  1.1.1.3  mrg 	   ? kernel->group_segment_size
   2019  1.1.1.3  mrg 	   : LDS_per_team);;
   2020  1.1.1.3  mrg     }
   2021  1.1.1.3  mrg   else if (lowlat_size < GCN_LOWLAT_HEAP+8)
   2022  1.1.1.3  mrg     /* Ensure that there's space for the OpenMP libgomp data.  */
   2023  1.1.1.3  mrg     shadow->group_segment_size = GCN_LOWLAT_HEAP+8;
   2024  1.1.1.3  mrg   else
   2025  1.1.1.3  mrg     shadow->group_segment_size = (lowlat_size > 65536
   2026  1.1.1.3  mrg 				  ? 65536
   2027  1.1.1.3  mrg 				  : lowlat_size);
   2028      1.1  mrg 
   2029      1.1  mrg   /* We expect kernels to request a single pointer, explicitly, and the
   2030      1.1  mrg      rest of struct kernargs, implicitly.  If they request anything else
   2031      1.1  mrg      then something is wrong.  */
   2032      1.1  mrg   if (kernel->kernarg_segment_size > 8)
   2033      1.1  mrg     {
   2034      1.1  mrg       GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
   2035      1.1  mrg       return NULL;
   2036      1.1  mrg     }
   2037      1.1  mrg 
   2038      1.1  mrg   status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
   2039      1.1  mrg 					   sizeof (struct kernargs),
   2040      1.1  mrg 					   &shadow->kernarg_address);
   2041      1.1  mrg   if (status != HSA_STATUS_SUCCESS)
   2042      1.1  mrg     hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
   2043      1.1  mrg   struct kernargs *kernargs = shadow->kernarg_address;
   2044      1.1  mrg 
   2045      1.1  mrg   /* Zero-initialize the output_data (minimum needed).  */
   2046  1.1.1.3  mrg   kernargs->abi.out_ptr = (int64_t)&kernargs->output_data;
   2047      1.1  mrg   kernargs->output_data.next_output = 0;
   2048      1.1  mrg   for (unsigned i = 0;
   2049      1.1  mrg        i < (sizeof (kernargs->output_data.queue)
   2050      1.1  mrg 	    / sizeof (kernargs->output_data.queue[0]));
   2051      1.1  mrg        i++)
   2052      1.1  mrg     kernargs->output_data.queue[i].written = 0;
   2053      1.1  mrg   kernargs->output_data.consumed = 0;
   2054      1.1  mrg 
   2055      1.1  mrg   /* Pass in the heap location.  */
   2056  1.1.1.3  mrg   kernargs->abi.heap_ptr = (int64_t)kernel->module->heap;
   2057      1.1  mrg 
   2058  1.1.1.3  mrg   /* Create the ephemeral memory spaces.  */
   2059  1.1.1.3  mrg   configure_ephemeral_memories (kernel, &kernargs->abi, num_teams, num_threads);
   2060      1.1  mrg 
   2061      1.1  mrg   /* Ensure we can recognize unset return values.  */
   2062      1.1  mrg   kernargs->output_data.return_value = 0xcafe0000;
   2063      1.1  mrg 
   2064      1.1  mrg   return shadow;
   2065      1.1  mrg }
   2066      1.1  mrg 
   2067  1.1.1.3  mrg static void
   2068  1.1.1.3  mrg process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
   2069  1.1.1.3  mrg 			 uint64_t sizes, uint64_t kinds, uint64_t dev_num64)
   2070  1.1.1.3  mrg {
   2071  1.1.1.3  mrg   int dev_num = dev_num64;
   2072  1.1.1.3  mrg   GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
   2073  1.1.1.3  mrg 			  NULL);
   2074  1.1.1.3  mrg }
   2075  1.1.1.3  mrg 
   2076      1.1  mrg /* Output any data written to console output from the kernel.  It is expected
   2077      1.1  mrg    that this function is polled during kernel execution.
   2078      1.1  mrg 
   2079      1.1  mrg    We print all entries from the last item printed to the next entry without
   2080      1.1  mrg    a "written" flag.  If the "final" flag is set then it'll continue right to
   2081      1.1  mrg    the end.
   2082      1.1  mrg 
   2083      1.1  mrg    The print buffer is circular, but the from and to locations don't wrap when
   2084      1.1  mrg    the buffer does, so the output limit is UINT_MAX.  The target blocks on
   2085      1.1  mrg    output when the buffer is full.  */
   2086      1.1  mrg 
   2087      1.1  mrg static void
   2088      1.1  mrg console_output (struct kernel_info *kernel, struct kernargs *kernargs,
   2089      1.1  mrg 		bool final)
   2090      1.1  mrg {
   2091      1.1  mrg   unsigned int limit = (sizeof (kernargs->output_data.queue)
   2092      1.1  mrg 			/ sizeof (kernargs->output_data.queue[0]));
   2093      1.1  mrg 
   2094      1.1  mrg   unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
   2095      1.1  mrg 				       __ATOMIC_ACQUIRE);
   2096      1.1  mrg   unsigned int to = kernargs->output_data.next_output;
   2097      1.1  mrg 
   2098      1.1  mrg   if (from > to)
   2099      1.1  mrg     {
   2100      1.1  mrg       /* Overflow.  */
   2101      1.1  mrg       if (final)
   2102      1.1  mrg 	printf ("GCN print buffer overflowed.\n");
   2103      1.1  mrg       return;
   2104      1.1  mrg     }
   2105      1.1  mrg 
   2106      1.1  mrg   unsigned int i;
   2107      1.1  mrg   for (i = from; i < to; i++)
   2108      1.1  mrg     {
   2109      1.1  mrg       struct printf_data *data = &kernargs->output_data.queue[i%limit];
   2110      1.1  mrg 
   2111      1.1  mrg       if (!data->written && !final)
   2112      1.1  mrg 	break;
   2113      1.1  mrg 
   2114      1.1  mrg       switch (data->type)
   2115      1.1  mrg 	{
   2116      1.1  mrg 	case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
   2117      1.1  mrg 	case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
   2118      1.1  mrg 	case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
   2119      1.1  mrg 	case 3: printf ("%.128s%.128s", data->msg, data->text); break;
   2120  1.1.1.3  mrg 	case 4:
   2121  1.1.1.3  mrg 	  process_reverse_offload (data->value_u64[0], data->value_u64[1],
   2122  1.1.1.3  mrg 				   data->value_u64[2], data->value_u64[3],
   2123  1.1.1.3  mrg 				   data->value_u64[4], data->value_u64[5]);
   2124  1.1.1.3  mrg 	  break;
   2125      1.1  mrg 	default: printf ("GCN print buffer error!\n"); break;
   2126      1.1  mrg 	}
   2127      1.1  mrg       data->written = 0;
   2128      1.1  mrg       __atomic_store_n (&kernargs->output_data.consumed, i+1,
   2129      1.1  mrg 			__ATOMIC_RELEASE);
   2130      1.1  mrg     }
   2131      1.1  mrg   fflush (stdout);
   2132      1.1  mrg }
   2133      1.1  mrg 
   2134      1.1  mrg /* Release data structure created for a kernel dispatch in SHADOW argument,
   2135      1.1  mrg    and clean up the signal and memory allocations.  */
   2136      1.1  mrg 
   2137      1.1  mrg static void
   2138      1.1  mrg release_kernel_dispatch (struct kernel_dispatch *shadow)
   2139      1.1  mrg {
   2140      1.1  mrg   GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
   2141      1.1  mrg 
   2142      1.1  mrg   struct kernargs *kernargs = shadow->kernarg_address;
   2143  1.1.1.3  mrg   void *addr = (void *)kernargs->abi.arena_ptr;
   2144  1.1.1.3  mrg   if (!addr)
   2145  1.1.1.3  mrg     addr = (void *)kernargs->abi.stack_ptr;
   2146  1.1.1.3  mrg   release_ephemeral_memories (shadow->agent, addr);
   2147      1.1  mrg 
   2148      1.1  mrg   hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
   2149      1.1  mrg 
   2150      1.1  mrg   hsa_signal_t s;
   2151      1.1  mrg   s.handle = shadow->signal;
   2152      1.1  mrg   hsa_fns.hsa_signal_destroy_fn (s);
   2153      1.1  mrg 
   2154      1.1  mrg   free (shadow);
   2155      1.1  mrg }
   2156      1.1  mrg 
   2157      1.1  mrg /* Extract the properties from a kernel binary.  */
   2158      1.1  mrg 
   2159      1.1  mrg static void
   2160      1.1  mrg init_kernel_properties (struct kernel_info *kernel)
   2161      1.1  mrg {
   2162      1.1  mrg   hsa_status_t status;
   2163      1.1  mrg   struct agent_info *agent = kernel->agent;
   2164      1.1  mrg   hsa_executable_symbol_t kernel_symbol;
   2165  1.1.1.2  mrg   char *buf = alloca (strlen (kernel->name) + 4);
   2166  1.1.1.2  mrg   sprintf (buf, "%s.kd", kernel->name);
   2167      1.1  mrg   status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
   2168  1.1.1.2  mrg 						 buf, agent->id,
   2169      1.1  mrg 						 0, &kernel_symbol);
   2170      1.1  mrg   if (status != HSA_STATUS_SUCCESS)
   2171      1.1  mrg     {
   2172      1.1  mrg       hsa_warn ("Could not find symbol for kernel in the code object", status);
   2173  1.1.1.2  mrg       fprintf (stderr, "not found name: '%s'\n", buf);
   2174      1.1  mrg       dump_executable_symbols (agent->executable);
   2175      1.1  mrg       goto failure;
   2176      1.1  mrg     }
   2177      1.1  mrg   GCN_DEBUG ("Located kernel %s\n", kernel->name);
   2178      1.1  mrg   status = hsa_fns.hsa_executable_symbol_get_info_fn
   2179      1.1  mrg     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
   2180      1.1  mrg   if (status != HSA_STATUS_SUCCESS)
   2181      1.1  mrg     hsa_fatal ("Could not extract a kernel object from its symbol", status);
   2182      1.1  mrg   status = hsa_fns.hsa_executable_symbol_get_info_fn
   2183      1.1  mrg     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
   2184      1.1  mrg      &kernel->kernarg_segment_size);
   2185      1.1  mrg   if (status != HSA_STATUS_SUCCESS)
   2186      1.1  mrg     hsa_fatal ("Could not get info about kernel argument size", status);
   2187      1.1  mrg   status = hsa_fns.hsa_executable_symbol_get_info_fn
   2188      1.1  mrg     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
   2189      1.1  mrg      &kernel->group_segment_size);
   2190      1.1  mrg   if (status != HSA_STATUS_SUCCESS)
   2191      1.1  mrg     hsa_fatal ("Could not get info about kernel group segment size", status);
   2192      1.1  mrg   status = hsa_fns.hsa_executable_symbol_get_info_fn
   2193      1.1  mrg     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
   2194      1.1  mrg      &kernel->private_segment_size);
   2195      1.1  mrg   if (status != HSA_STATUS_SUCCESS)
   2196      1.1  mrg     hsa_fatal ("Could not get info about kernel private segment size",
   2197      1.1  mrg 	       status);
   2198      1.1  mrg 
   2199      1.1  mrg   /* The kernel type is not known until something tries to launch it.  */
   2200      1.1  mrg   kernel->kind = KIND_UNKNOWN;
   2201      1.1  mrg 
   2202      1.1  mrg   GCN_DEBUG ("Kernel structure for %s fully initialized with "
   2203      1.1  mrg 	     "following segment sizes: \n", kernel->name);
   2204      1.1  mrg   GCN_DEBUG ("  group_segment_size: %u\n",
   2205      1.1  mrg 	     (unsigned) kernel->group_segment_size);
   2206      1.1  mrg   GCN_DEBUG ("  private_segment_size: %u\n",
   2207      1.1  mrg 	     (unsigned) kernel->private_segment_size);
   2208      1.1  mrg   GCN_DEBUG ("  kernarg_segment_size: %u\n",
   2209      1.1  mrg 	     (unsigned) kernel->kernarg_segment_size);
   2210      1.1  mrg   return;
   2211      1.1  mrg 
   2212      1.1  mrg failure:
   2213      1.1  mrg   kernel->initialization_failed = true;
   2214      1.1  mrg }
   2215      1.1  mrg 
   2216      1.1  mrg /* Do all the work that is necessary before running KERNEL for the first time.
   2217      1.1  mrg    The function assumes the program has been created, finalized and frozen by
   2218      1.1  mrg    create_and_finalize_hsa_program.  */
   2219      1.1  mrg 
   2220      1.1  mrg static void
   2221      1.1  mrg init_kernel (struct kernel_info *kernel)
   2222      1.1  mrg {
   2223      1.1  mrg   if (pthread_mutex_lock (&kernel->init_mutex))
   2224      1.1  mrg     GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
   2225      1.1  mrg   if (kernel->initialized)
   2226      1.1  mrg     {
   2227      1.1  mrg       if (pthread_mutex_unlock (&kernel->init_mutex))
   2228      1.1  mrg 	GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
   2229      1.1  mrg 			   "mutex");
   2230      1.1  mrg 
   2231      1.1  mrg       return;
   2232      1.1  mrg     }
   2233      1.1  mrg 
   2234      1.1  mrg   init_kernel_properties (kernel);
   2235      1.1  mrg 
   2236      1.1  mrg   if (!kernel->initialization_failed)
   2237      1.1  mrg     {
   2238      1.1  mrg       GCN_DEBUG ("\n");
   2239      1.1  mrg 
   2240      1.1  mrg       kernel->initialized = true;
   2241      1.1  mrg     }
   2242      1.1  mrg   if (pthread_mutex_unlock (&kernel->init_mutex))
   2243      1.1  mrg     GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
   2244      1.1  mrg 		       "mutex");
   2245      1.1  mrg }
   2246      1.1  mrg 
   2247      1.1  mrg /* Run KERNEL on its agent, pass VARS to it as arguments and take
   2248      1.1  mrg    launch attributes from KLA.
   2249      1.1  mrg 
   2250      1.1  mrg    MODULE_LOCKED indicates that the caller already holds the lock and
   2251      1.1  mrg    run_kernel need not lock it again.
   2252      1.1  mrg    If AQ is NULL then agent->sync_queue will be used.  */
   2253      1.1  mrg 
   2254      1.1  mrg static void
   2255      1.1  mrg run_kernel (struct kernel_info *kernel, void *vars,
   2256      1.1  mrg 	    struct GOMP_kernel_launch_attributes *kla,
   2257      1.1  mrg 	    struct goacc_asyncqueue *aq, bool module_locked)
   2258      1.1  mrg {
   2259  1.1.1.3  mrg   struct agent_info *agent = kernel->agent;
   2260      1.1  mrg   GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
   2261      1.1  mrg 	     kernel->description->vpgr_count);
   2262      1.1  mrg 
   2263      1.1  mrg   /* Reduce the number of threads/workers if there are insufficient
   2264      1.1  mrg      VGPRs available to run the kernels together.  */
   2265      1.1  mrg   if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
   2266      1.1  mrg     {
   2267  1.1.1.3  mrg       int max_vgprs = max_isa_vgprs (agent->device_isa);
   2268      1.1  mrg       int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
   2269  1.1.1.3  mrg       int max_threads = (max_vgprs / granulated_vgprs) * 4;
   2270      1.1  mrg       if (kla->gdims[2] > max_threads)
   2271      1.1  mrg 	{
   2272      1.1  mrg 	  GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
   2273      1.1  mrg 		       " per team/gang - reducing to %d threads/workers.\n",
   2274      1.1  mrg 		       kla->gdims[2], max_threads);
   2275      1.1  mrg 	  kla->gdims[2] = max_threads;
   2276      1.1  mrg 	}
   2277      1.1  mrg     }
   2278      1.1  mrg 
   2279      1.1  mrg   GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
   2280      1.1  mrg 	     (aq ? aq->id : 0));
   2281      1.1  mrg   GCN_DEBUG ("GCN launch attribs: gdims:[");
   2282      1.1  mrg   int i;
   2283      1.1  mrg   for (i = 0; i < kla->ndim; ++i)
   2284      1.1  mrg     {
   2285      1.1  mrg       if (i)
   2286      1.1  mrg 	DEBUG_PRINT (", ");
   2287      1.1  mrg       DEBUG_PRINT ("%u", kla->gdims[i]);
   2288      1.1  mrg     }
   2289      1.1  mrg   DEBUG_PRINT ("], normalized gdims:[");
   2290      1.1  mrg   for (i = 0; i < kla->ndim; ++i)
   2291      1.1  mrg     {
   2292      1.1  mrg       if (i)
   2293      1.1  mrg 	DEBUG_PRINT (", ");
   2294      1.1  mrg       DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
   2295      1.1  mrg     }
   2296      1.1  mrg   DEBUG_PRINT ("], wdims:[");
   2297      1.1  mrg   for (i = 0; i < kla->ndim; ++i)
   2298      1.1  mrg     {
   2299      1.1  mrg       if (i)
   2300      1.1  mrg 	DEBUG_PRINT (", ");
   2301      1.1  mrg       DEBUG_PRINT ("%u", kla->wdims[i]);
   2302      1.1  mrg     }
   2303      1.1  mrg   DEBUG_PRINT ("]\n");
   2304      1.1  mrg   DEBUG_FLUSH ();
   2305      1.1  mrg 
   2306      1.1  mrg   if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
   2307      1.1  mrg     GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
   2308      1.1  mrg 
   2309      1.1  mrg   if (!agent->initialized)
   2310      1.1  mrg     GOMP_PLUGIN_fatal ("Agent must be initialized");
   2311      1.1  mrg 
   2312      1.1  mrg   if (!kernel->initialized)
   2313      1.1  mrg     GOMP_PLUGIN_fatal ("Called kernel must be initialized");
   2314      1.1  mrg 
   2315      1.1  mrg   hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);
   2316      1.1  mrg 
   2317      1.1  mrg   uint64_t index
   2318      1.1  mrg     = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
   2319      1.1  mrg   GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);
   2320      1.1  mrg 
   2321      1.1  mrg   /* Wait until the queue is not full before writing the packet.   */
   2322      1.1  mrg   while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
   2323      1.1  mrg 	 >= command_q->size)
   2324      1.1  mrg     ;
   2325      1.1  mrg 
   2326      1.1  mrg   /* Do not allow the dimensions to be overridden when running
   2327      1.1  mrg      constructors or destructors.  */
   2328      1.1  mrg   int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
   2329      1.1  mrg   int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;
   2330      1.1  mrg 
   2331      1.1  mrg   hsa_kernel_dispatch_packet_t *packet;
   2332      1.1  mrg   packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
   2333      1.1  mrg 	   + index % command_q->size;
   2334      1.1  mrg 
   2335      1.1  mrg   memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
   2336      1.1  mrg   packet->grid_size_x = override_x ? : kla->gdims[0];
   2337      1.1  mrg   packet->workgroup_size_x = get_group_size (kla->ndim,
   2338      1.1  mrg 					     packet->grid_size_x,
   2339      1.1  mrg 					     kla->wdims[0]);
   2340      1.1  mrg 
   2341      1.1  mrg   if (kla->ndim >= 2)
   2342      1.1  mrg     {
   2343      1.1  mrg       packet->grid_size_y = kla->gdims[1];
   2344      1.1  mrg       packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
   2345      1.1  mrg 						 kla->wdims[1]);
   2346      1.1  mrg     }
   2347      1.1  mrg   else
   2348      1.1  mrg     {
   2349      1.1  mrg       packet->grid_size_y = 1;
   2350      1.1  mrg       packet->workgroup_size_y = 1;
   2351      1.1  mrg     }
   2352      1.1  mrg 
   2353      1.1  mrg   if (kla->ndim == 3)
   2354      1.1  mrg     {
   2355      1.1  mrg       packet->grid_size_z = limit_worker_threads (override_z
   2356      1.1  mrg 						  ? : kla->gdims[2]);
   2357      1.1  mrg       packet->workgroup_size_z = get_group_size (kla->ndim,
   2358      1.1  mrg 						 packet->grid_size_z,
   2359      1.1  mrg 						 kla->wdims[2]);
   2360      1.1  mrg     }
   2361      1.1  mrg   else
   2362      1.1  mrg     {
   2363      1.1  mrg       packet->grid_size_z = 1;
   2364      1.1  mrg       packet->workgroup_size_z = 1;
   2365      1.1  mrg     }
   2366      1.1  mrg 
   2367      1.1  mrg   GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
   2368      1.1  mrg 	     " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
   2369      1.1  mrg 	     packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
   2370      1.1  mrg 	     packet->grid_size_x / packet->workgroup_size_x,
   2371      1.1  mrg 	     packet->grid_size_y / packet->workgroup_size_y,
   2372      1.1  mrg 	     packet->grid_size_z / packet->workgroup_size_z,
   2373      1.1  mrg 	     packet->workgroup_size_x, packet->workgroup_size_y,
   2374      1.1  mrg 	     packet->workgroup_size_z);
   2375      1.1  mrg 
   2376      1.1  mrg   struct kernel_dispatch *shadow
   2377  1.1.1.3  mrg     = create_kernel_dispatch (kernel, packet->grid_size_x,
   2378  1.1.1.3  mrg 			      packet->grid_size_z);
   2379      1.1  mrg   shadow->queue = command_q;
   2380      1.1  mrg 
   2381      1.1  mrg   if (debug)
   2382      1.1  mrg     {
   2383      1.1  mrg       fprintf (stderr, "\nKernel has following dependencies:\n");
   2384      1.1  mrg       print_kernel_dispatch (shadow, 2);
   2385      1.1  mrg     }
   2386      1.1  mrg 
   2387  1.1.1.3  mrg   packet->private_segment_size = shadow->private_segment_size;
   2388  1.1.1.3  mrg   packet->group_segment_size = shadow->group_segment_size;
   2389  1.1.1.3  mrg   packet->kernel_object = shadow->object;
   2390      1.1  mrg   packet->kernarg_address = shadow->kernarg_address;
   2391      1.1  mrg   hsa_signal_t s;
   2392      1.1  mrg   s.handle = shadow->signal;
   2393      1.1  mrg   packet->completion_signal = s;
   2394      1.1  mrg   hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
   2395      1.1  mrg   memcpy (shadow->kernarg_address, &vars, sizeof (vars));
   2396      1.1  mrg 
   2397      1.1  mrg   GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
   2398      1.1  mrg 
   2399      1.1  mrg   uint16_t header;
   2400      1.1  mrg   header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
   2401      1.1  mrg   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
   2402      1.1  mrg   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
   2403      1.1  mrg 
   2404      1.1  mrg   GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
   2405      1.1  mrg 	     agent->device_id);
   2406      1.1  mrg 
   2407      1.1  mrg   packet_store_release ((uint32_t *) packet, header,
   2408      1.1  mrg 			(uint16_t) kla->ndim
   2409      1.1  mrg 			<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
   2410      1.1  mrg 
   2411      1.1  mrg   hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
   2412      1.1  mrg 				       index);
   2413      1.1  mrg 
   2414      1.1  mrg   GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
   2415      1.1  mrg 
   2416      1.1  mrg   /* Root signal waits with 1ms timeout.  */
   2417      1.1  mrg   while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
   2418      1.1  mrg 					     1000 * 1000,
   2419      1.1  mrg 					     HSA_WAIT_STATE_BLOCKED) != 0)
   2420      1.1  mrg     {
   2421      1.1  mrg       console_output (kernel, shadow->kernarg_address, false);
   2422      1.1  mrg     }
   2423      1.1  mrg   console_output (kernel, shadow->kernarg_address, true);
   2424      1.1  mrg 
   2425      1.1  mrg   struct kernargs *kernargs = shadow->kernarg_address;
   2426      1.1  mrg   unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
   2427      1.1  mrg 
   2428      1.1  mrg   release_kernel_dispatch (shadow);
   2429      1.1  mrg 
   2430      1.1  mrg   if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
   2431      1.1  mrg     GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
   2432      1.1  mrg 
   2433      1.1  mrg   unsigned int upper = (return_value & ~0xffff) >> 16;
   2434      1.1  mrg   if (upper == 0xcafe)
   2435      1.1  mrg     ; // exit not called, normal termination.
   2436      1.1  mrg   else if (upper == 0xffff)
   2437      1.1  mrg     ; // exit called.
   2438      1.1  mrg   else
   2439      1.1  mrg     {
   2440      1.1  mrg       GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
   2441      1.1  mrg 			 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
   2442      1.1  mrg 			 return_value);
   2443      1.1  mrg       abort ();
   2444      1.1  mrg     }
   2445      1.1  mrg 
   2446      1.1  mrg   if (upper == 0xffff)
   2447      1.1  mrg     {
   2448      1.1  mrg       unsigned int signal = (return_value >> 8) & 0xff;
   2449      1.1  mrg 
   2450      1.1  mrg       if (signal == SIGABRT)
   2451      1.1  mrg 	{
   2452      1.1  mrg 	  GCN_WARNING ("GCN Kernel aborted\n");
   2453      1.1  mrg 	  abort ();
   2454      1.1  mrg 	}
   2455      1.1  mrg       else if (signal != 0)
   2456      1.1  mrg 	{
   2457      1.1  mrg 	  GCN_WARNING ("GCN Kernel received unknown signal\n");
   2458      1.1  mrg 	  abort ();
   2459      1.1  mrg 	}
   2460      1.1  mrg 
   2461      1.1  mrg       GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
   2462      1.1  mrg       exit (return_value & 0xff);
   2463      1.1  mrg     }
   2464      1.1  mrg }
   2465      1.1  mrg 
   2466      1.1  mrg /* }}}  */
   2467      1.1  mrg /* {{{ Load/Unload  */
   2468      1.1  mrg 
   2469      1.1  mrg /* Initialize KERNEL from D and other parameters.  Return true on success. */
   2470      1.1  mrg 
   2471      1.1  mrg static bool
   2472      1.1  mrg init_basic_kernel_info (struct kernel_info *kernel,
   2473      1.1  mrg 			struct hsa_kernel_description *d,
   2474      1.1  mrg 			struct agent_info *agent,
   2475      1.1  mrg 			struct module_info *module)
   2476      1.1  mrg {
   2477      1.1  mrg   kernel->agent = agent;
   2478      1.1  mrg   kernel->module = module;
   2479      1.1  mrg   kernel->name = d->name;
   2480      1.1  mrg   kernel->description = d;
   2481      1.1  mrg   if (pthread_mutex_init (&kernel->init_mutex, NULL))
   2482      1.1  mrg     {
   2483      1.1  mrg       GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
   2484      1.1  mrg       return false;
   2485      1.1  mrg     }
   2486      1.1  mrg   return true;
   2487      1.1  mrg }
   2488      1.1  mrg 
   2489      1.1  mrg /* Check that the GCN ISA of the given image matches the ISA of the agent. */
   2490      1.1  mrg 
   2491      1.1  mrg static bool
   2492      1.1  mrg isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
   2493      1.1  mrg {
   2494      1.1  mrg   int isa_field = elf_gcn_isa_field (image);
   2495      1.1  mrg   const char* isa_s = isa_hsa_name (isa_field);
   2496      1.1  mrg   if (!isa_s)
   2497      1.1  mrg     {
   2498      1.1  mrg       hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
   2499      1.1  mrg       return false;
   2500      1.1  mrg     }
   2501      1.1  mrg 
   2502      1.1  mrg   if (isa_field != agent->device_isa)
   2503      1.1  mrg     {
   2504      1.1  mrg       char msg[120];
   2505      1.1  mrg       const char *agent_isa_s = isa_hsa_name (agent->device_isa);
   2506      1.1  mrg       const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
   2507      1.1  mrg       assert (agent_isa_s);
   2508      1.1  mrg       assert (agent_isa_gcc_s);
   2509      1.1  mrg 
   2510      1.1  mrg       snprintf (msg, sizeof msg,
   2511      1.1  mrg 		"GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
   2512  1.1.1.3  mrg 		"Try to recompile with '-foffload-options=-march=%s'.\n",
   2513      1.1  mrg 		isa_s, agent_isa_s, agent_isa_gcc_s);
   2514      1.1  mrg 
   2515      1.1  mrg       hsa_error (msg, HSA_STATUS_ERROR);
   2516      1.1  mrg       return false;
   2517      1.1  mrg     }
   2518      1.1  mrg 
   2519      1.1  mrg   return true;
   2520      1.1  mrg }
   2521      1.1  mrg 
/* Create and finalize the program consisting of all loaded modules.

   Serialized by agent->prog_mutex; idempotent once agent->prog_finalized
   is set.  Deserializes the module's ELF image into an HSA code object,
   loads it into a freshly created executable, allocates and assigns the
   device heap on first use, and freezes the executable.  Returns true on
   success, false on any failure.  */

static bool
create_and_finalize_hsa_program (struct agent_info *agent)
{
  hsa_status_t status;
  bool res = true;
  if (pthread_mutex_lock (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
      return false;
    }
  /* Another thread may already have finalized the program.  */
  if (agent->prog_finalized)
    goto final;

  status
    = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
					HSA_EXECUTABLE_STATE_UNFROZEN,
					"", &agent->executable);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not create GCN executable", status);
      goto fail;
    }

  /* Load any GCN modules.  */
  struct module_info *module = agent->module;
  if (module)
    {
      Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;

      /* Refuse to load a code object built for a different GPU ISA.  */
      if (!isa_matches_agent (agent, image))
	goto fail;

      hsa_code_object_t co = { 0 };
      status = hsa_fns.hsa_code_object_deserialize_fn
	(module->image_desc->gcn_image->image,
	 module->image_desc->gcn_image->size,
	 NULL, &co);
      if (status != HSA_STATUS_SUCCESS)
	{
	  hsa_error ("Could not deserialize GCN code object", status);
	  goto fail;
	}

      status = hsa_fns.hsa_executable_load_code_object_fn
	(agent->executable, agent->id, co, "");
      if (status != HSA_STATUS_SUCCESS)
	{
	  hsa_error ("Could not load GCN code object", status);
	  goto fail;
	}

      /* Allocate the device-side heap once per module; freed in
	 destroy_hsa_program.  */
      if (!module->heap)
	{
	  status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
						   gcn_kernel_heap_size,
						   (void**)&module->heap);
	  if (status != HSA_STATUS_SUCCESS)
	    {
	      hsa_error ("Could not allocate memory for GCN heap", status);
	      goto fail;
	    }

	  status = hsa_fns.hsa_memory_assign_agent_fn
			(module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
	  if (status != HSA_STATUS_SUCCESS)
	    {
	      hsa_error ("Could not assign GCN heap memory to device", status);
	      goto fail;
	    }

	  /* Record the heap size in the heap's own header so device code
	     can see how much space it has.  */
	  hsa_fns.hsa_memory_copy_fn (&module->heap->size,
				      &gcn_kernel_heap_size,
				      sizeof (gcn_kernel_heap_size));
	}

    }

  if (debug)
    dump_executable_symbols (agent->executable);

  status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not freeze the GCN executable", status);
      goto fail;
    }

final:
  /* NOTE(review): prog_finalized is set even on the failure path (fail:
     jumps here), so a failed finalization is not retried on a later call —
     presumably deliberate, but confirm.  */
  agent->prog_finalized = true;

  if (pthread_mutex_unlock (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
      res = false;
    }

  return res;

fail:
  res = false;
  goto final;
}
   2626      1.1  mrg 
   2627      1.1  mrg /* Free the HSA program in agent and everything associated with it and set
   2628      1.1  mrg    agent->prog_finalized and the initialized flags of all kernels to false.
   2629      1.1  mrg    Return TRUE on success.  */
   2630      1.1  mrg 
   2631      1.1  mrg static bool
   2632      1.1  mrg destroy_hsa_program (struct agent_info *agent)
   2633      1.1  mrg {
   2634      1.1  mrg   if (!agent->prog_finalized)
   2635      1.1  mrg     return true;
   2636      1.1  mrg 
   2637      1.1  mrg   hsa_status_t status;
   2638      1.1  mrg 
   2639      1.1  mrg   GCN_DEBUG ("Destroying the current GCN program.\n");
   2640      1.1  mrg 
   2641      1.1  mrg   status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
   2642      1.1  mrg   if (status != HSA_STATUS_SUCCESS)
   2643      1.1  mrg     return hsa_error ("Could not destroy GCN executable", status);
   2644      1.1  mrg 
   2645      1.1  mrg   if (agent->module)
   2646      1.1  mrg     {
   2647      1.1  mrg       int i;
   2648      1.1  mrg       for (i = 0; i < agent->module->kernel_count; i++)
   2649      1.1  mrg 	agent->module->kernels[i].initialized = false;
   2650      1.1  mrg 
   2651      1.1  mrg       if (agent->module->heap)
   2652      1.1  mrg 	{
   2653      1.1  mrg 	  hsa_fns.hsa_memory_free_fn (agent->module->heap);
   2654      1.1  mrg 	  agent->module->heap = NULL;
   2655      1.1  mrg 	}
   2656      1.1  mrg     }
   2657      1.1  mrg   agent->prog_finalized = false;
   2658      1.1  mrg   return true;
   2659      1.1  mrg }
   2660      1.1  mrg 
   2661      1.1  mrg /* Deinitialize all information associated with MODULE and kernels within
   2662      1.1  mrg    it.  Return TRUE on success.  */
   2663      1.1  mrg 
   2664      1.1  mrg static bool
   2665      1.1  mrg destroy_module (struct module_info *module, bool locked)
   2666      1.1  mrg {
   2667      1.1  mrg   /* Run destructors before destroying module.  */
   2668      1.1  mrg   struct GOMP_kernel_launch_attributes kla =
   2669      1.1  mrg     { 3,
   2670      1.1  mrg       /* Grid size.  */
   2671      1.1  mrg       { 1, 64, 1 },
   2672      1.1  mrg       /* Work-group size.  */
   2673      1.1  mrg       { 1, 64, 1 }
   2674      1.1  mrg     };
   2675      1.1  mrg 
   2676      1.1  mrg   if (module->fini_array_func)
   2677      1.1  mrg     {
   2678      1.1  mrg       init_kernel (module->fini_array_func);
   2679      1.1  mrg       run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
   2680      1.1  mrg     }
   2681      1.1  mrg   module->constructors_run_p = false;
   2682      1.1  mrg 
   2683      1.1  mrg   int i;
   2684      1.1  mrg   for (i = 0; i < module->kernel_count; i++)
   2685      1.1  mrg     if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
   2686      1.1  mrg       {
   2687      1.1  mrg 	GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
   2688      1.1  mrg 			   "mutex");
   2689      1.1  mrg 	return false;
   2690      1.1  mrg       }
   2691      1.1  mrg 
   2692      1.1  mrg   return true;
   2693      1.1  mrg }
   2694      1.1  mrg 
   2695      1.1  mrg /* }}}  */
   2696      1.1  mrg /* {{{ Async  */
   2697      1.1  mrg 
/* Execute a single queue entry, INDEX, of queue AQ.  Called from the
   queue-draining thread (or drain_queue_synchronous).  (The previous
   comment here, "Callback of dispatch queues to report errors", described
   a different function.)  */

static void
execute_queue_entry (struct goacc_asyncqueue *aq, int index)
{
  struct queue_entry *entry = &aq->queue[index];

  switch (entry->type)
    {
    case KERNEL_LAUNCH:
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
		   aq->agent->device_id, aq->id, index);
      /* Synchronous kernel run; returns when the kernel has completed.  */
      run_kernel (entry->u.launch.kernel,
		  entry->u.launch.vars,
		  &entry->u.launch.kla, aq, false);
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
		   aq->agent->device_id, aq->id, index);
      break;

    case CALLBACK:
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
		   aq->agent->device_id, aq->id, index);
      entry->u.callback.fn (entry->u.callback.data);
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
		   aq->agent->device_id, aq->id, index);
      break;

    case ASYNC_WAIT:
      {
	/* FIXME: is it safe to access a placeholder that may already have
	   been executed?  */
	struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;

	if (DEBUG_QUEUES)
	  GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
		     aq->agent->device_id, aq->id, index);

	pthread_mutex_lock (&placeholderp->mutex);

	/* Block until some other queue's thread reaches the placeholder
	   and sets 'executed'; loop guards against spurious wake-ups.  */
	while (!placeholderp->executed)
	  pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);

	pthread_mutex_unlock (&placeholderp->mutex);

	/* The placeholder's synchronization objects are single-use; tear
	   them down now that the wait has completed.  */
	if (pthread_cond_destroy (&placeholderp->cond))
	  GOMP_PLUGIN_error ("Failed to destroy serialization cond");

	if (pthread_mutex_destroy (&placeholderp->mutex))
	  GOMP_PLUGIN_error ("Failed to destroy serialization mutex");

	if (DEBUG_QUEUES)
	  GCN_DEBUG ("Async thread %d:%d: Executing async wait "
		     "entry (%d) done\n", aq->agent->device_id, aq->id, index);
      }
      break;

    case ASYNC_PLACEHOLDER:
      /* Flag the placeholder as reached and wake any ASYNC_WAIT entry
	 blocked on it.  */
      pthread_mutex_lock (&entry->u.placeholder.mutex);
      entry->u.placeholder.executed = 1;
      pthread_cond_signal (&entry->u.placeholder.cond);
      pthread_mutex_unlock (&entry->u.placeholder.mutex);
      break;

    default:
      GOMP_PLUGIN_fatal ("Unknown queue element");
    }
}
   2769      1.1  mrg 
/* This function is run as a thread to service an async queue in the
   background.  It runs continuously until the stop flag is set.

   THREAD_ARG is the queue's struct goacc_asyncqueue.  Entries are executed
   with aq->mutex dropped (so producers can enqueue concurrently) and the
   queue indices are only updated under the lock afterwards.  On exit,
   drain_queue_stop is set to 2 so finalize_async_thread can observe
   termination.  */

static void *
drain_queue (void *thread_arg)
{
  struct goacc_asyncqueue *aq = thread_arg;

  if (DRAIN_QUEUE_SYNCHRONOUS_P)
    {
      /* Debug mode: entries are executed by drain_queue_synchronous on the
	 producer's thread instead; this thread exits immediately.  */
      aq->drain_queue_stop = 2;
      return NULL;
    }

  pthread_mutex_lock (&aq->mutex);

  while (true)
    {
      if (aq->drain_queue_stop)
	break;

      if (aq->queue_n > 0)
	{
	  /* Release the lock while the (possibly long-running) entry
	     executes, so producers are not blocked.  */
	  pthread_mutex_unlock (&aq->mutex);
	  execute_queue_entry (aq, aq->queue_first);

	  pthread_mutex_lock (&aq->mutex);
	  aq->queue_first = ((aq->queue_first + 1)
			     % ASYNC_QUEUE_SIZE);
	  aq->queue_n--;

	  /* Wake any producer waiting in wait_for_queue_nonfull.  */
	  if (DEBUG_THREAD_SIGNAL)
	    GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
		       aq->agent->device_id, aq->id);
	  pthread_cond_broadcast (&aq->queue_cond_out);
	  pthread_mutex_unlock (&aq->mutex);

	  if (DEBUG_QUEUES)
	    GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
		       aq->id);
	  pthread_mutex_lock (&aq->mutex);
	}
      else
	{
	  /* Queue empty: sleep until a producer signals cond_in.  */
	  if (DEBUG_THREAD_SLEEP)
	    GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
		       aq->agent->device_id, aq->id);
	  pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
	  if (DEBUG_THREAD_SLEEP)
	    GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
		       aq->agent->device_id, aq->id);
	}
    }

  /* Publish termination and wake anyone waiting in finalize_async_thread.  */
  aq->drain_queue_stop = 2;
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_broadcast (&aq->queue_cond_out);
  pthread_mutex_unlock (&aq->mutex);

  GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
  return NULL;
}
   2834      1.1  mrg 
   2835      1.1  mrg /* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
   2836      1.1  mrg    is not usually the case.  This is just a debug tool.  */
   2837      1.1  mrg 
   2838      1.1  mrg static void
   2839      1.1  mrg drain_queue_synchronous (struct goacc_asyncqueue *aq)
   2840      1.1  mrg {
   2841      1.1  mrg   pthread_mutex_lock (&aq->mutex);
   2842      1.1  mrg 
   2843      1.1  mrg   while (aq->queue_n > 0)
   2844      1.1  mrg     {
   2845      1.1  mrg       execute_queue_entry (aq, aq->queue_first);
   2846      1.1  mrg 
   2847      1.1  mrg       aq->queue_first = ((aq->queue_first + 1)
   2848      1.1  mrg 			 % ASYNC_QUEUE_SIZE);
   2849      1.1  mrg       aq->queue_n--;
   2850      1.1  mrg     }
   2851      1.1  mrg 
   2852      1.1  mrg   pthread_mutex_unlock (&aq->mutex);
   2853      1.1  mrg }
   2854      1.1  mrg 
   2855      1.1  mrg /* Block the current thread until an async queue is writable.  The aq->mutex
   2856      1.1  mrg    lock should be held on entry, and remains locked on exit.  */
   2857      1.1  mrg 
   2858      1.1  mrg static void
   2859      1.1  mrg wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
   2860      1.1  mrg {
   2861      1.1  mrg   if (aq->queue_n == ASYNC_QUEUE_SIZE)
   2862      1.1  mrg     {
   2863      1.1  mrg       /* Queue is full.  Wait for it to not be full.  */
   2864      1.1  mrg       while (aq->queue_n == ASYNC_QUEUE_SIZE)
   2865      1.1  mrg 	pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
   2866      1.1  mrg     }
   2867      1.1  mrg }
   2868      1.1  mrg 
   2869      1.1  mrg /* Request an asynchronous kernel launch on the specified queue.  This
   2870      1.1  mrg    may block if the queue is full, but returns without waiting for the
   2871      1.1  mrg    kernel to run.  */
   2872      1.1  mrg 
   2873      1.1  mrg static void
   2874      1.1  mrg queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
   2875      1.1  mrg 		   void *vars, struct GOMP_kernel_launch_attributes *kla)
   2876      1.1  mrg {
   2877      1.1  mrg   assert (aq->agent == kernel->agent);
   2878      1.1  mrg 
   2879      1.1  mrg   pthread_mutex_lock (&aq->mutex);
   2880      1.1  mrg 
   2881      1.1  mrg   wait_for_queue_nonfull (aq);
   2882      1.1  mrg 
   2883      1.1  mrg   int queue_last = ((aq->queue_first + aq->queue_n)
   2884      1.1  mrg 		    % ASYNC_QUEUE_SIZE);
   2885      1.1  mrg   if (DEBUG_QUEUES)
   2886      1.1  mrg     GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
   2887      1.1  mrg 	       aq->id, queue_last);
   2888      1.1  mrg 
   2889      1.1  mrg   aq->queue[queue_last].type = KERNEL_LAUNCH;
   2890      1.1  mrg   aq->queue[queue_last].u.launch.kernel = kernel;
   2891      1.1  mrg   aq->queue[queue_last].u.launch.vars = vars;
   2892      1.1  mrg   aq->queue[queue_last].u.launch.kla = *kla;
   2893      1.1  mrg 
   2894      1.1  mrg   aq->queue_n++;
   2895      1.1  mrg 
   2896      1.1  mrg   if (DEBUG_THREAD_SIGNAL)
   2897      1.1  mrg     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
   2898      1.1  mrg 	       aq->agent->device_id, aq->id);
   2899      1.1  mrg   pthread_cond_signal (&aq->queue_cond_in);
   2900      1.1  mrg 
   2901      1.1  mrg   pthread_mutex_unlock (&aq->mutex);
   2902      1.1  mrg }
   2903      1.1  mrg 
   2904      1.1  mrg /* Request an asynchronous callback on the specified queue.  The callback
   2905      1.1  mrg    function will be called, with the given opaque data, from the appropriate
   2906      1.1  mrg    async thread, when all previous items on that queue are complete.  */
   2907      1.1  mrg 
   2908      1.1  mrg static void
   2909      1.1  mrg queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
   2910      1.1  mrg 		     void *data)
   2911      1.1  mrg {
   2912      1.1  mrg   pthread_mutex_lock (&aq->mutex);
   2913      1.1  mrg 
   2914      1.1  mrg   wait_for_queue_nonfull (aq);
   2915      1.1  mrg 
   2916      1.1  mrg   int queue_last = ((aq->queue_first + aq->queue_n)
   2917      1.1  mrg 		    % ASYNC_QUEUE_SIZE);
   2918      1.1  mrg   if (DEBUG_QUEUES)
   2919      1.1  mrg     GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
   2920      1.1  mrg 	       aq->id, queue_last);
   2921      1.1  mrg 
   2922      1.1  mrg   aq->queue[queue_last].type = CALLBACK;
   2923      1.1  mrg   aq->queue[queue_last].u.callback.fn = fn;
   2924      1.1  mrg   aq->queue[queue_last].u.callback.data = data;
   2925      1.1  mrg 
   2926      1.1  mrg   aq->queue_n++;
   2927      1.1  mrg 
   2928      1.1  mrg   if (DEBUG_THREAD_SIGNAL)
   2929      1.1  mrg     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
   2930      1.1  mrg 	       aq->agent->device_id, aq->id);
   2931      1.1  mrg   pthread_cond_signal (&aq->queue_cond_in);
   2932      1.1  mrg 
   2933      1.1  mrg   pthread_mutex_unlock (&aq->mutex);
   2934      1.1  mrg }
   2935      1.1  mrg 
   2936      1.1  mrg /* Request that a given async thread wait for another thread (unspecified) to
   2937      1.1  mrg    reach the given placeholder.  The wait will occur when all previous entries
   2938      1.1  mrg    on the queue are complete.  A placeholder is effectively a kind of signal
   2939      1.1  mrg    which simply sets a flag when encountered in a queue.  */
   2940      1.1  mrg 
   2941      1.1  mrg static void
   2942      1.1  mrg queue_push_asyncwait (struct goacc_asyncqueue *aq,
   2943      1.1  mrg 		      struct placeholder *placeholderp)
   2944      1.1  mrg {
   2945      1.1  mrg   pthread_mutex_lock (&aq->mutex);
   2946      1.1  mrg 
   2947      1.1  mrg   wait_for_queue_nonfull (aq);
   2948      1.1  mrg 
   2949      1.1  mrg   int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
   2950      1.1  mrg   if (DEBUG_QUEUES)
   2951      1.1  mrg     GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
   2952      1.1  mrg 	       aq->id, queue_last);
   2953      1.1  mrg 
   2954      1.1  mrg   aq->queue[queue_last].type = ASYNC_WAIT;
   2955      1.1  mrg   aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
   2956      1.1  mrg 
   2957      1.1  mrg   aq->queue_n++;
   2958      1.1  mrg 
   2959      1.1  mrg   if (DEBUG_THREAD_SIGNAL)
   2960      1.1  mrg     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
   2961      1.1  mrg 	       aq->agent->device_id, aq->id);
   2962      1.1  mrg   pthread_cond_signal (&aq->queue_cond_in);
   2963      1.1  mrg 
   2964      1.1  mrg   pthread_mutex_unlock (&aq->mutex);
   2965      1.1  mrg }
   2966      1.1  mrg 
/* Add a placeholder into an async queue.  When the async thread reaches the
   placeholder it will set the "executed" flag to true and continue.
   Another thread may be waiting on this thread reaching the placeholder.
   Returns a pointer to the placeholder, which lives inside the queue entry
   itself.  */

static struct placeholder *
queue_push_placeholder (struct goacc_asyncqueue *aq)
{
  struct placeholder *placeholderp;

  pthread_mutex_lock (&aq->mutex);

  wait_for_queue_nonfull (aq);

  int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
	       aq->id, queue_last);

  aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
  placeholderp = &aq->queue[queue_last].u.placeholder;

  /* NOTE(review): GOMP_PLUGIN_error returns (it is not fatal), so on an
     init failure below execution continues with aq->mutex already
     unlocked; the unconditional unlock at the end would then operate on a
     mutex this thread no longer holds (undefined behavior).  Presumably
     these init calls are not expected to fail in practice — confirm.  */
  if (pthread_mutex_init (&placeholderp->mutex, NULL))
    {
      pthread_mutex_unlock (&aq->mutex);
      GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
    }

  if (pthread_cond_init (&placeholderp->cond, NULL))
    {
      pthread_mutex_unlock (&aq->mutex);
      GOMP_PLUGIN_error ("Failed to initialize serialization cond");
    }

  placeholderp->executed = 0;

  aq->queue_n++;

  /* Wake the drain thread, which may be sleeping on an empty queue.  */
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);

  return placeholderp;
}
   3013      1.1  mrg 
/* Signal an asynchronous thread to terminate, and wait for it to do so.
   Sets drain_queue_stop to 1, waits (on queue_cond_out) for drain_queue to
   acknowledge with 2, then joins the thread.  Safe to call on an
   already-stopped queue.  */

static void
finalize_async_thread (struct goacc_asyncqueue *aq)
{
  pthread_mutex_lock (&aq->mutex);
  /* Already stopped (drain_queue has exited its loop): nothing to do but
     note the thread is joined below only on the normal path — an
     already-stopped queue was presumably joined earlier or never ran.  */
  if (aq->drain_queue_stop == 2)
    {
      pthread_mutex_unlock (&aq->mutex);
      return;
    }

  /* Request termination...  */
  aq->drain_queue_stop = 1;

  /* ...and wake the drain thread in case it is sleeping on an empty
     queue.  */
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  /* Wait for drain_queue to acknowledge by setting the flag to 2; the loop
     guards against spurious wake-ups.  */
  while (aq->drain_queue_stop != 2)
    {
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
		   " to sleep\n", aq->agent->device_id, aq->id);
      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("Waiting, woke up thread %d:%d.  Rechecking\n",
		   aq->agent->device_id, aq->id);
    }

  GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
	     aq->id);
  pthread_mutex_unlock (&aq->mutex);

  int err = pthread_join (aq->thread_drain_queue, NULL);
  if (err != 0)
    GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
		       aq->agent->device_id, aq->id, strerror (err));
  GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
}
   3054      1.1  mrg 
   3055      1.1  mrg /* Set up an async queue for OpenMP.  There will be only one.  The
   3056      1.1  mrg    implementation simply uses an OpenACC async queue.
   3057      1.1  mrg    FIXME: is this thread-safe if two threads call this function?  */
   3058      1.1  mrg 
   3059      1.1  mrg static void
   3060      1.1  mrg maybe_init_omp_async (struct agent_info *agent)
   3061      1.1  mrg {
   3062      1.1  mrg   if (!agent->omp_async_queue)
   3063      1.1  mrg     agent->omp_async_queue
   3064      1.1  mrg       = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
   3065      1.1  mrg }
   3066      1.1  mrg 
   3067      1.1  mrg /* A wrapper that works around an issue in the HSA runtime with host-to-device
   3068      1.1  mrg    copies from read-only pages.  */
   3069      1.1  mrg 
   3070      1.1  mrg static void
   3071      1.1  mrg hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
   3072      1.1  mrg {
   3073      1.1  mrg   hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
   3074      1.1  mrg 
   3075      1.1  mrg   if (status == HSA_STATUS_SUCCESS)
   3076      1.1  mrg     return;
   3077      1.1  mrg 
   3078      1.1  mrg   /* It appears that the copy fails if the source data is in a read-only page.
   3079      1.1  mrg      We can't detect that easily, so try copying the data to a temporary buffer
   3080      1.1  mrg      and doing the copy again if we got an error above.  */
   3081      1.1  mrg 
   3082      1.1  mrg   GCN_WARNING ("Read-only data transfer bug workaround triggered for "
   3083      1.1  mrg 	       "[%p:+%d]\n", (void *) src, (int) len);
   3084      1.1  mrg 
   3085      1.1  mrg   void *src_copy = malloc (len);
   3086      1.1  mrg   memcpy (src_copy, src, len);
   3087      1.1  mrg   status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
   3088      1.1  mrg   free (src_copy);
   3089      1.1  mrg   if (status != HSA_STATUS_SUCCESS)
   3090      1.1  mrg     GOMP_PLUGIN_error ("memory copy failed");
   3091      1.1  mrg }
   3092      1.1  mrg 
   3093      1.1  mrg /* Copy data to or from a device.  This is intended for use as an async
   3094      1.1  mrg    callback event.  */
   3095      1.1  mrg 
   3096      1.1  mrg static void
   3097      1.1  mrg copy_data (void *data_)
   3098      1.1  mrg {
   3099      1.1  mrg   struct copy_data *data = (struct copy_data *)data_;
   3100      1.1  mrg   GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
   3101      1.1  mrg 	     data->aq->agent->device_id, data->aq->id, data->len, data->src,
   3102      1.1  mrg 	     data->dst);
   3103      1.1  mrg   hsa_memory_copy_wrapper (data->dst, data->src, data->len);
   3104      1.1  mrg   free (data);
   3105      1.1  mrg }
   3106      1.1  mrg 
   3107      1.1  mrg /* Request an asynchronous data copy, to or from a device, on a given queue.
   3108  1.1.1.2  mrg    The event will be registered as a callback.  */
   3109      1.1  mrg 
   3110      1.1  mrg static void
   3111      1.1  mrg queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
   3112  1.1.1.2  mrg 		 size_t len)
   3113      1.1  mrg {
   3114      1.1  mrg   if (DEBUG_QUEUES)
   3115      1.1  mrg     GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
   3116      1.1  mrg 	       aq->agent->device_id, aq->id, len, src, dst);
   3117      1.1  mrg   struct copy_data *data
   3118      1.1  mrg     = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
   3119      1.1  mrg   data->dst = dst;
   3120      1.1  mrg   data->src = src;
   3121      1.1  mrg   data->len = len;
   3122      1.1  mrg   data->aq = aq;
   3123      1.1  mrg   queue_push_callback (aq, copy_data, data);
   3124      1.1  mrg }
   3125      1.1  mrg 
   3126      1.1  mrg /* Return true if the given queue is currently empty.  */
   3127      1.1  mrg 
   3128      1.1  mrg static int
   3129      1.1  mrg queue_empty (struct goacc_asyncqueue *aq)
   3130      1.1  mrg {
   3131      1.1  mrg   pthread_mutex_lock (&aq->mutex);
   3132      1.1  mrg   int res = aq->queue_n == 0 ? 1 : 0;
   3133      1.1  mrg   pthread_mutex_unlock (&aq->mutex);
   3134      1.1  mrg 
   3135      1.1  mrg   return res;
   3136      1.1  mrg }
   3137      1.1  mrg 
   3138      1.1  mrg /* Wait for a given queue to become empty.  This implements an OpenACC wait
   3139      1.1  mrg    directive.  */
   3140      1.1  mrg 
   3141      1.1  mrg static void
   3142      1.1  mrg wait_queue (struct goacc_asyncqueue *aq)
   3143      1.1  mrg {
   3144      1.1  mrg   if (DRAIN_QUEUE_SYNCHRONOUS_P)
   3145      1.1  mrg     {
   3146      1.1  mrg       drain_queue_synchronous (aq);
   3147      1.1  mrg       return;
   3148      1.1  mrg     }
   3149      1.1  mrg 
   3150      1.1  mrg   pthread_mutex_lock (&aq->mutex);
   3151      1.1  mrg 
   3152      1.1  mrg   while (aq->queue_n > 0)
   3153      1.1  mrg     {
   3154      1.1  mrg       if (DEBUG_THREAD_SLEEP)
   3155      1.1  mrg 	GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
   3156      1.1  mrg 		   aq->agent->device_id, aq->id);
   3157      1.1  mrg       pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
   3158      1.1  mrg       if (DEBUG_THREAD_SLEEP)
   3159      1.1  mrg 	GCN_DEBUG ("thread %d:%d woke up.  Rechecking\n", aq->agent->device_id,
   3160      1.1  mrg 		   aq->id);
   3161      1.1  mrg     }
   3162      1.1  mrg 
   3163      1.1  mrg   pthread_mutex_unlock (&aq->mutex);
   3164      1.1  mrg   GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
   3165      1.1  mrg }
   3166      1.1  mrg 
   3167      1.1  mrg /* }}}  */
   3168      1.1  mrg /* {{{ OpenACC support  */
   3169      1.1  mrg 
   3170      1.1  mrg /* Execute an OpenACC kernel, synchronously or asynchronously.  */
   3171      1.1  mrg 
   3172      1.1  mrg static void
   3173  1.1.1.3  mrg gcn_exec (struct kernel_info *kernel,
   3174      1.1  mrg 	  void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
   3175      1.1  mrg 	  struct goacc_asyncqueue *aq)
   3176      1.1  mrg {
   3177      1.1  mrg   if (!GOMP_OFFLOAD_can_run (kernel))
   3178      1.1  mrg     GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
   3179      1.1  mrg 
   3180      1.1  mrg   /* If we get here then this must be an OpenACC kernel.  */
   3181      1.1  mrg   kernel->kind = KIND_OPENACC;
   3182      1.1  mrg 
   3183      1.1  mrg   struct hsa_kernel_description *hsa_kernel_desc = NULL;
   3184      1.1  mrg   for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
   3185      1.1  mrg     {
   3186      1.1  mrg       struct hsa_kernel_description *d
   3187      1.1  mrg 	= &kernel->module->image_desc->kernel_infos[i];
   3188      1.1  mrg       if (d->name == kernel->name)
   3189      1.1  mrg 	{
   3190      1.1  mrg 	  hsa_kernel_desc = d;
   3191      1.1  mrg 	  break;
   3192      1.1  mrg 	}
   3193      1.1  mrg     }
   3194      1.1  mrg 
   3195      1.1  mrg   /* We may have statically-determined dimensions in
   3196      1.1  mrg      hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
   3197      1.1  mrg      invocation at runtime in dims[].  We allow static dimensions to take
   3198      1.1  mrg      priority over dynamic dimensions when present (non-zero).  */
   3199      1.1  mrg   if (hsa_kernel_desc->oacc_dims[0] > 0)
   3200      1.1  mrg     dims[0] = hsa_kernel_desc->oacc_dims[0];
   3201      1.1  mrg   if (hsa_kernel_desc->oacc_dims[1] > 0)
   3202      1.1  mrg     dims[1] = hsa_kernel_desc->oacc_dims[1];
   3203      1.1  mrg   if (hsa_kernel_desc->oacc_dims[2] > 0)
   3204      1.1  mrg     dims[2] = hsa_kernel_desc->oacc_dims[2];
   3205      1.1  mrg 
   3206  1.1.1.2  mrg   /* Ideally, when a dimension isn't explicitly specified, we should
   3207  1.1.1.2  mrg      tune it to run 40 (or 32?) threads per CU with no threads getting queued.
   3208  1.1.1.2  mrg      In practice, we tune for peak performance on BabelStream, which
   3209  1.1.1.2  mrg      for OpenACC is currently 32 threads per CU.  */
   3210  1.1.1.2  mrg   if (dims[0] == 0 && dims[1] == 0)
   3211  1.1.1.2  mrg     {
   3212  1.1.1.2  mrg       /* If any of the OpenACC dimensions remain 0 then we get to pick a
   3213  1.1.1.2  mrg 	 number.  There isn't really a correct answer for this without a clue
   3214  1.1.1.2  mrg 	 about the problem size, so let's do a reasonable number of workers
   3215  1.1.1.2  mrg 	 and gangs.  */
   3216  1.1.1.2  mrg 
   3217  1.1.1.2  mrg       dims[0] = get_cu_count (kernel->agent) * 4; /* Gangs.  */
   3218  1.1.1.2  mrg       dims[1] = 8; /* Workers.  */
   3219  1.1.1.2  mrg     }
   3220  1.1.1.2  mrg   else if (dims[0] == 0 && dims[1] > 0)
   3221  1.1.1.2  mrg     {
   3222  1.1.1.2  mrg       /* Auto-scale the number of gangs with the requested number of workers.  */
   3223  1.1.1.2  mrg       dims[0] = get_cu_count (kernel->agent) * (32 / dims[1]);
   3224  1.1.1.2  mrg     }
   3225  1.1.1.2  mrg   else if (dims[0] > 0 && dims[1] == 0)
   3226  1.1.1.2  mrg     {
   3227  1.1.1.2  mrg       /* Auto-scale the number of workers with the requested number of gangs.  */
   3228  1.1.1.2  mrg       dims[1] = get_cu_count (kernel->agent) * 32 / dims[0];
   3229  1.1.1.2  mrg       if (dims[1] == 0)
   3230  1.1.1.2  mrg 	dims[1] = 1;
   3231  1.1.1.2  mrg       if (dims[1] > 16)
   3232  1.1.1.2  mrg 	dims[1] = 16;
   3233  1.1.1.2  mrg     }
   3234      1.1  mrg 
   3235      1.1  mrg   /* The incoming dimensions are expressed in terms of gangs, workers, and
   3236      1.1  mrg      vectors.  The HSA dimensions are expressed in terms of "work-items",
   3237      1.1  mrg      which means multiples of vector lanes.
   3238      1.1  mrg 
   3239      1.1  mrg      The "grid size" specifies the size of the problem space, and the
   3240      1.1  mrg      "work-group size" specifies how much of that we want a single compute
   3241      1.1  mrg      unit to chew on at once.
   3242      1.1  mrg 
   3243      1.1  mrg      The three dimensions do not really correspond to hardware, but the
   3244      1.1  mrg      important thing is that the HSA runtime will launch as many
   3245      1.1  mrg      work-groups as it takes to process the entire grid, and each
   3246      1.1  mrg      work-group will contain as many wave-fronts as it takes to process
   3247      1.1  mrg      the work-items in that group.
   3248      1.1  mrg 
   3249      1.1  mrg      Essentially, as long as we set the Y dimension to 64 (the number of
   3250      1.1  mrg      vector lanes in hardware), and the Z group size to the maximum (16),
   3251      1.1  mrg      then we will get the gangs (X) and workers (Z) launched as we expect.
   3252      1.1  mrg 
   3253      1.1  mrg      The reason for the apparent reversal of vector and worker dimension
   3254      1.1  mrg      order is to do with the way the run-time distributes work-items across
   3255      1.1  mrg      v1 and v2.  */
   3256      1.1  mrg   struct GOMP_kernel_launch_attributes kla =
   3257      1.1  mrg     {3,
   3258      1.1  mrg      /* Grid size.  */
   3259      1.1  mrg      {dims[0], 64, dims[1]},
   3260      1.1  mrg      /* Work-group size.  */
   3261      1.1  mrg      {1,       64, 16}
   3262      1.1  mrg     };
   3263      1.1  mrg 
   3264      1.1  mrg   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
   3265      1.1  mrg   acc_prof_info *prof_info = thr->prof_info;
   3266      1.1  mrg   acc_event_info enqueue_launch_event_info;
   3267      1.1  mrg   acc_api_info *api_info = thr->api_info;
   3268      1.1  mrg   bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
   3269      1.1  mrg   if (profiling_dispatch_p)
   3270      1.1  mrg     {
   3271      1.1  mrg       prof_info->event_type = acc_ev_enqueue_launch_start;
   3272      1.1  mrg 
   3273      1.1  mrg       enqueue_launch_event_info.launch_event.event_type
   3274      1.1  mrg 	= prof_info->event_type;
   3275      1.1  mrg       enqueue_launch_event_info.launch_event.valid_bytes
   3276      1.1  mrg 	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
   3277      1.1  mrg       enqueue_launch_event_info.launch_event.parent_construct
   3278      1.1  mrg 	= acc_construct_parallel;
   3279      1.1  mrg       enqueue_launch_event_info.launch_event.implicit = 1;
   3280      1.1  mrg       enqueue_launch_event_info.launch_event.tool_info = NULL;
   3281      1.1  mrg       enqueue_launch_event_info.launch_event.kernel_name
   3282      1.1  mrg 	= (char *) kernel->name;
   3283      1.1  mrg       enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
   3284      1.1  mrg       enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
   3285      1.1  mrg       enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
   3286      1.1  mrg 
   3287      1.1  mrg       api_info->device_api = acc_device_api_other;
   3288      1.1  mrg 
   3289      1.1  mrg       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
   3290      1.1  mrg 	&enqueue_launch_event_info, api_info);
   3291      1.1  mrg     }
   3292      1.1  mrg 
   3293      1.1  mrg   if (!async)
   3294  1.1.1.3  mrg     run_kernel (kernel, devaddrs, &kla, NULL, false);
   3295      1.1  mrg   else
   3296  1.1.1.3  mrg     queue_push_launch (aq, kernel, devaddrs, &kla);
   3297      1.1  mrg 
   3298      1.1  mrg   if (profiling_dispatch_p)
   3299      1.1  mrg     {
   3300      1.1  mrg       prof_info->event_type = acc_ev_enqueue_launch_end;
   3301      1.1  mrg       enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
   3302      1.1  mrg       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
   3303      1.1  mrg 					    &enqueue_launch_event_info,
   3304      1.1  mrg 					    api_info);
   3305      1.1  mrg     }
   3306      1.1  mrg }
   3307      1.1  mrg 
   3308      1.1  mrg /* }}}  */
   3309      1.1  mrg /* {{{ Generic Plugin API  */
   3310      1.1  mrg 
   3311      1.1  mrg /* Return the name of the accelerator, which is "gcn".  */
   3312      1.1  mrg 
const char *
GOMP_OFFLOAD_get_name (void)
{
  /* The canonical name libgomp uses to identify this accelerator.  */
  static const char name[] = "gcn";
  return name;
}
   3318      1.1  mrg 
   3319      1.1  mrg /* Return the specific capabilities the HSA accelerator have.  */
   3320      1.1  mrg 
   3321      1.1  mrg unsigned int
   3322      1.1  mrg GOMP_OFFLOAD_get_caps (void)
   3323      1.1  mrg {
   3324      1.1  mrg   /* FIXME: Enable shared memory for APU, but not discrete GPU.  */
   3325      1.1  mrg   return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
   3326      1.1  mrg 	    | GOMP_OFFLOAD_CAP_OPENACC_200;
   3327      1.1  mrg }
   3328      1.1  mrg 
   3329      1.1  mrg /* Identify as GCN accelerator.  */
   3330      1.1  mrg 
   3331      1.1  mrg int
   3332      1.1  mrg GOMP_OFFLOAD_get_type (void)
   3333      1.1  mrg {
   3334      1.1  mrg   return OFFLOAD_TARGET_TYPE_GCN;
   3335      1.1  mrg }
   3336      1.1  mrg 
   3337      1.1  mrg /* Return the libgomp version number we're compatible with.  There is
   3338      1.1  mrg    no requirement for cross-version compatibility.  */
   3339      1.1  mrg 
   3340      1.1  mrg unsigned
   3341      1.1  mrg GOMP_OFFLOAD_version (void)
   3342      1.1  mrg {
   3343      1.1  mrg   return GOMP_VERSION;
   3344      1.1  mrg }
   3345      1.1  mrg 
   3346      1.1  mrg /* Return the number of GCN devices on the system.  */
   3347      1.1  mrg 
int
GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
{
  /* Initialize the HSA runtime lazily; without a working context no
     meaningful device count can be reported, so a failure is fatal.  */
  if (!init_hsa_context (true))
    exit (EXIT_FAILURE);
  /* Return -1 if the omp_requires_mask cannot be fulfilled even though
     devices are present: only the unified-address and reverse-offload
     requirements are supported by this plugin, so any other set bit
     disqualifies the whole device set.  */
  if (hsa_context.agent_count > 0
      && ((omp_requires_mask
	   & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
	       | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
    return -1;
  return hsa_context.agent_count;
}
   3362      1.1  mrg 
   3363      1.1  mrg /* Initialize device (agent) number N so that it can be used for computation.
   3364      1.1  mrg    Return TRUE on success.  */
   3365      1.1  mrg 
bool
GOMP_OFFLOAD_init_device (int n)
{
  /* Make sure the HSA runtime itself is up before touching any agent.  */
  if (!init_hsa_context (false))
    return false;
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
      return false;
    }
  struct agent_info *agent = &hsa_context.agents[n];

  /* Initialization is idempotent: a second call for the same device is a
     successful no-op.  */
  if (agent->initialized)
    return true;

  agent->device_id = n;

  /* Set up the locks guarding this agent's module, program, async queues
     and ephemeral memories before anything can race on them.  */
  if (pthread_rwlock_init (&agent->module_rwlock, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
      return false;
    }
  if (pthread_mutex_init (&agent->prog_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
      return false;
    }
  if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
      return false;
    }
  if (pthread_mutex_init (&agent->ephemeral_memories_write_lock, NULL))
    {
      /* NOTE(review): message still says "team arena" although the lock is
	 now named ephemeral_memories_write_lock — looks like a stale string
	 from a rename; confirm against upstream before changing.  */
      GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
      return false;
    }
  agent->async_queues = NULL;
  agent->omp_async_queue = NULL;
  agent->ephemeral_memories_list = NULL;

  /* Query device properties needed to create the synchronous command
     queue: maximum queue size, device name/ISA and vendor.  */
  uint32_t queue_size;
  hsa_status_t status;
  status = hsa_fns.hsa_agent_get_info_fn (agent->id,
					  HSA_AGENT_INFO_QUEUE_MAX_SIZE,
					  &queue_size);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error requesting maximum queue size of the GCN agent",
		      status);

  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
					  &agent->name);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error querying the name of the agent", status);

  /* Map the device name to an ELF ISA code; unknown devices are rejected
     here rather than failing later at code-object load time.  */
  agent->device_isa = isa_code (agent->name);
  if (agent->device_isa == EF_AMDGPU_MACH_UNSUPPORTED)
    return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);

  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
					  &agent->vendor_name);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error querying the vendor name of the agent", status);

  /* Create the queue used for synchronous kernel launches and copies.  */
  status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
					HSA_QUEUE_TYPE_MULTI,
					hsa_queue_callback, NULL, UINT32_MAX,
					UINT32_MAX, &agent->sync_queue);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error creating command queue", status);

  /* Find a memory region suitable for kernel arguments.  The handle is
     pre-set to -1 so we can tell whether the iterator selected one.  */
  agent->kernarg_region.handle = (uint64_t) -1;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
						 get_kernarg_memory_region,
						 &agent->kernarg_region);
  /* HSA_STATUS_INFO_BREAK means the callback stopped the iteration early
     because it found a match, so it is not an error here.  */
  if (status != HSA_STATUS_SUCCESS
      && status != HSA_STATUS_INFO_BREAK)
    hsa_error ("Scanning memory regions failed", status);
  if (agent->kernarg_region.handle == (uint64_t) -1)
    {
      GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
			 "arguments");
      return false;
    }
  GCN_DEBUG ("Selected kernel arguments memory region:\n");
  dump_hsa_region (agent->kernarg_region, NULL);

  /* Likewise find a region for general device data allocations.  */
  agent->data_region.handle = (uint64_t) -1;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
						 get_data_memory_region,
						 &agent->data_region);
  if (status != HSA_STATUS_SUCCESS
      && status != HSA_STATUS_INFO_BREAK)
    hsa_error ("Scanning memory regions failed", status);
  if (agent->data_region.handle == (uint64_t) -1)
    {
      GOMP_PLUGIN_error ("Could not find suitable memory region for device "
			 "data");
      return false;
    }
  GCN_DEBUG ("Selected device data memory region:\n");
  dump_hsa_region (agent->data_region, NULL);

  GCN_DEBUG ("GCN agent %d initialized\n", n);

  agent->initialized = true;
  return true;
}
   3474      1.1  mrg 
   3475      1.1  mrg /* Load GCN object-code module described by struct gcn_image_desc in
   3476      1.1  mrg    TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
   3477  1.1.1.3  mrg    If there are any constructors then run them.  If not NULL, REV_FN_TABLE will
   3478  1.1.1.3  mrg    contain the on-device addresses of the functions for reverse offload.  To be
   3479  1.1.1.3  mrg    freed by the caller.  */
   3480      1.1  mrg 
   3481      1.1  mrg int
   3482      1.1  mrg GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
   3483  1.1.1.3  mrg 			 struct addr_pair **target_table,
   3484  1.1.1.3  mrg 			 uint64_t **rev_fn_table,
   3485  1.1.1.3  mrg 			 uint64_t *host_ind_fn_table)
   3486      1.1  mrg {
   3487      1.1  mrg   if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
   3488      1.1  mrg     {
   3489      1.1  mrg       GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
   3490      1.1  mrg 			 " (expected %u, received %u)",
   3491      1.1  mrg 			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
   3492      1.1  mrg       return -1;
   3493      1.1  mrg     }
   3494      1.1  mrg 
   3495      1.1  mrg   struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
   3496      1.1  mrg   struct agent_info *agent;
   3497      1.1  mrg   struct addr_pair *pair;
   3498      1.1  mrg   struct module_info *module;
   3499      1.1  mrg   struct kernel_info *kernel;
   3500      1.1  mrg   int kernel_count = image_desc->kernel_count;
   3501  1.1.1.3  mrg   unsigned ind_func_count = GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version)
   3502  1.1.1.3  mrg 			      ? image_desc->ind_func_count : 0;
   3503      1.1  mrg   unsigned var_count = image_desc->global_variable_count;
   3504  1.1.1.3  mrg   /* Currently, "others" is a struct of ICVS.  */
   3505  1.1.1.2  mrg   int other_count = 1;
   3506      1.1  mrg 
   3507      1.1  mrg   agent = get_agent_info (ord);
   3508      1.1  mrg   if (!agent)
   3509      1.1  mrg     return -1;
   3510      1.1  mrg 
   3511      1.1  mrg   if (pthread_rwlock_wrlock (&agent->module_rwlock))
   3512      1.1  mrg     {
   3513      1.1  mrg       GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
   3514      1.1  mrg       return -1;
   3515      1.1  mrg     }
   3516      1.1  mrg   if (agent->prog_finalized
   3517      1.1  mrg       && !destroy_hsa_program (agent))
   3518      1.1  mrg     return -1;
   3519      1.1  mrg 
   3520      1.1  mrg   GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
   3521  1.1.1.3  mrg   GCN_DEBUG ("Encountered %d indirect functions in an image\n", ind_func_count);
   3522      1.1  mrg   GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
   3523  1.1.1.2  mrg   GCN_DEBUG ("Expect %d other variables in an image\n", other_count);
   3524  1.1.1.2  mrg   pair = GOMP_PLUGIN_malloc ((kernel_count + var_count + other_count - 2)
   3525      1.1  mrg 			     * sizeof (struct addr_pair));
   3526      1.1  mrg   *target_table = pair;
   3527      1.1  mrg   module = (struct module_info *)
   3528      1.1  mrg     GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
   3529      1.1  mrg 				+ kernel_count * sizeof (struct kernel_info));
   3530      1.1  mrg   module->image_desc = image_desc;
   3531      1.1  mrg   module->kernel_count = kernel_count;
   3532      1.1  mrg   module->heap = NULL;
   3533      1.1  mrg   module->constructors_run_p = false;
   3534      1.1  mrg 
   3535      1.1  mrg   kernel = &module->kernels[0];
   3536      1.1  mrg 
   3537      1.1  mrg   /* Allocate memory for kernel dependencies.  */
   3538      1.1  mrg   for (unsigned i = 0; i < kernel_count; i++)
   3539      1.1  mrg     {
   3540      1.1  mrg       struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
   3541      1.1  mrg       if (!init_basic_kernel_info (kernel, d, agent, module))
   3542      1.1  mrg 	return -1;
   3543      1.1  mrg       if (strcmp (d->name, "_init_array") == 0)
   3544      1.1  mrg 	module->init_array_func = kernel;
   3545      1.1  mrg       else if (strcmp (d->name, "_fini_array") == 0)
   3546      1.1  mrg         module->fini_array_func = kernel;
   3547      1.1  mrg       else
   3548      1.1  mrg 	{
   3549      1.1  mrg 	  pair->start = (uintptr_t) kernel;
   3550      1.1  mrg 	  pair->end = (uintptr_t) (kernel + 1);
   3551      1.1  mrg 	  pair++;
   3552      1.1  mrg 	}
   3553      1.1  mrg       kernel++;
   3554      1.1  mrg     }
   3555      1.1  mrg 
   3556      1.1  mrg   agent->module = module;
   3557      1.1  mrg   if (pthread_rwlock_unlock (&agent->module_rwlock))
   3558      1.1  mrg     {
   3559      1.1  mrg       GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
   3560      1.1  mrg       return -1;
   3561      1.1  mrg     }
   3562      1.1  mrg 
   3563      1.1  mrg   if (!create_and_finalize_hsa_program (agent))
   3564      1.1  mrg     return -1;
   3565      1.1  mrg 
   3566  1.1.1.2  mrg   if (var_count > 0)
   3567      1.1  mrg     {
   3568      1.1  mrg       hsa_status_t status;
   3569      1.1  mrg       hsa_executable_symbol_t var_symbol;
   3570      1.1  mrg       status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
   3571  1.1.1.2  mrg 						     ".offload_var_table",
   3572  1.1.1.2  mrg 						     agent->id,
   3573      1.1  mrg 						     0, &var_symbol);
   3574      1.1  mrg 
   3575      1.1  mrg       if (status != HSA_STATUS_SUCCESS)
   3576      1.1  mrg 	hsa_fatal ("Could not find symbol for variable in the code object",
   3577      1.1  mrg 		   status);
   3578      1.1  mrg 
   3579  1.1.1.2  mrg       uint64_t var_table_addr;
   3580  1.1.1.2  mrg       status = hsa_fns.hsa_executable_symbol_get_info_fn
   3581  1.1.1.2  mrg 	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
   3582  1.1.1.2  mrg 	 &var_table_addr);
   3583  1.1.1.2  mrg       if (status != HSA_STATUS_SUCCESS)
   3584  1.1.1.2  mrg 	hsa_fatal ("Could not extract a variable from its symbol", status);
   3585  1.1.1.2  mrg 
   3586  1.1.1.2  mrg       struct {
   3587  1.1.1.2  mrg 	uint64_t addr;
   3588  1.1.1.2  mrg 	uint64_t size;
   3589  1.1.1.2  mrg       } var_table[var_count];
   3590  1.1.1.2  mrg       GOMP_OFFLOAD_dev2host (agent->device_id, var_table,
   3591  1.1.1.2  mrg 			     (void*)var_table_addr, sizeof (var_table));
   3592  1.1.1.2  mrg 
   3593  1.1.1.2  mrg       for (unsigned i = 0; i < var_count; i++)
   3594  1.1.1.2  mrg 	{
   3595  1.1.1.2  mrg 	  pair->start = var_table[i].addr;
   3596  1.1.1.2  mrg 	  pair->end = var_table[i].addr + var_table[i].size;
   3597  1.1.1.2  mrg 	  GCN_DEBUG ("Found variable at %p with size %lu\n",
   3598  1.1.1.2  mrg 		     (void *)var_table[i].addr, var_table[i].size);
   3599  1.1.1.2  mrg 	  pair++;
   3600  1.1.1.2  mrg 	}
   3601  1.1.1.2  mrg     }
   3602  1.1.1.2  mrg 
   3603  1.1.1.3  mrg   if (ind_func_count > 0)
   3604  1.1.1.3  mrg     {
   3605  1.1.1.3  mrg       hsa_status_t status;
   3606  1.1.1.3  mrg 
   3607  1.1.1.3  mrg       /* Read indirect function table from image.  */
   3608  1.1.1.3  mrg       hsa_executable_symbol_t ind_funcs_symbol;
   3609  1.1.1.3  mrg       status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
   3610  1.1.1.3  mrg 						     ".offload_ind_func_table",
   3611  1.1.1.3  mrg 						     agent->id,
   3612  1.1.1.3  mrg 						     0, &ind_funcs_symbol);
   3613  1.1.1.3  mrg 
   3614  1.1.1.3  mrg       if (status != HSA_STATUS_SUCCESS)
   3615  1.1.1.3  mrg 	hsa_fatal ("Could not find .offload_ind_func_table symbol in the "
   3616  1.1.1.3  mrg 		   "code object", status);
   3617  1.1.1.3  mrg 
   3618  1.1.1.3  mrg       uint64_t ind_funcs_table_addr;
   3619  1.1.1.3  mrg       status = hsa_fns.hsa_executable_symbol_get_info_fn
   3620  1.1.1.3  mrg 	(ind_funcs_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
   3621  1.1.1.3  mrg 	 &ind_funcs_table_addr);
   3622  1.1.1.3  mrg       if (status != HSA_STATUS_SUCCESS)
   3623  1.1.1.3  mrg 	hsa_fatal ("Could not extract a variable from its symbol", status);
   3624  1.1.1.3  mrg 
   3625  1.1.1.3  mrg       uint64_t ind_funcs_table[ind_func_count];
   3626  1.1.1.3  mrg       GOMP_OFFLOAD_dev2host (agent->device_id, ind_funcs_table,
   3627  1.1.1.3  mrg 			     (void*) ind_funcs_table_addr,
   3628  1.1.1.3  mrg 			     sizeof (ind_funcs_table));
   3629  1.1.1.3  mrg 
   3630  1.1.1.3  mrg       /* Build host->target address map for indirect functions.  */
   3631  1.1.1.3  mrg       uint64_t ind_fn_map[ind_func_count * 2 + 1];
   3632  1.1.1.3  mrg       for (unsigned i = 0; i < ind_func_count; i++)
   3633  1.1.1.3  mrg 	{
   3634  1.1.1.3  mrg 	  ind_fn_map[i * 2] = host_ind_fn_table[i];
   3635  1.1.1.3  mrg 	  ind_fn_map[i * 2 + 1] = ind_funcs_table[i];
   3636  1.1.1.3  mrg 	  GCN_DEBUG ("Indirect function %d: %lx->%lx\n",
   3637  1.1.1.3  mrg 		     i, host_ind_fn_table[i], ind_funcs_table[i]);
   3638  1.1.1.3  mrg 	}
   3639  1.1.1.3  mrg       ind_fn_map[ind_func_count * 2] = 0;
   3640  1.1.1.3  mrg 
   3641  1.1.1.3  mrg       /* Write the map onto the target.  */
   3642  1.1.1.3  mrg       void *map_target_addr
   3643  1.1.1.3  mrg 	= GOMP_OFFLOAD_alloc (agent->device_id, sizeof (ind_fn_map));
   3644  1.1.1.3  mrg       GCN_DEBUG ("Allocated indirect map at %p\n", map_target_addr);
   3645  1.1.1.3  mrg 
   3646  1.1.1.3  mrg       GOMP_OFFLOAD_host2dev (agent->device_id, map_target_addr,
   3647  1.1.1.3  mrg 			     (void*) ind_fn_map,
   3648  1.1.1.3  mrg 			     sizeof (ind_fn_map));
   3649  1.1.1.3  mrg 
   3650  1.1.1.3  mrg       /* Write address of the map onto the target.  */
   3651  1.1.1.3  mrg       hsa_executable_symbol_t symbol;
   3652  1.1.1.3  mrg 
   3653  1.1.1.3  mrg       status
   3654  1.1.1.3  mrg 	= hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
   3655  1.1.1.3  mrg 						XSTRING (GOMP_INDIRECT_ADDR_MAP),
   3656  1.1.1.3  mrg 						agent->id, 0, &symbol);
   3657  1.1.1.3  mrg       if (status != HSA_STATUS_SUCCESS)
   3658  1.1.1.3  mrg 	hsa_fatal ("Could not find GOMP_INDIRECT_ADDR_MAP in code object",
   3659  1.1.1.3  mrg 		   status);
   3660  1.1.1.3  mrg 
   3661  1.1.1.3  mrg       uint64_t varptr;
   3662  1.1.1.3  mrg       uint32_t varsize;
   3663  1.1.1.3  mrg 
   3664  1.1.1.3  mrg       status = hsa_fns.hsa_executable_symbol_get_info_fn
   3665  1.1.1.3  mrg 	(symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
   3666  1.1.1.3  mrg 	 &varptr);
   3667  1.1.1.3  mrg       if (status != HSA_STATUS_SUCCESS)
   3668  1.1.1.3  mrg 	hsa_fatal ("Could not extract a variable from its symbol", status);
   3669  1.1.1.3  mrg       status = hsa_fns.hsa_executable_symbol_get_info_fn
   3670  1.1.1.3  mrg 	(symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
   3671  1.1.1.3  mrg 	&varsize);
   3672  1.1.1.3  mrg       if (status != HSA_STATUS_SUCCESS)
   3673  1.1.1.3  mrg 	hsa_fatal ("Could not extract a variable size from its symbol",
   3674  1.1.1.3  mrg 		   status);
   3675  1.1.1.3  mrg 
   3676  1.1.1.3  mrg       GCN_DEBUG ("Found GOMP_INDIRECT_ADDR_MAP at %lx with size %d\n",
   3677  1.1.1.3  mrg 		 varptr, varsize);
   3678  1.1.1.3  mrg 
   3679  1.1.1.3  mrg       GOMP_OFFLOAD_host2dev (agent->device_id, (void *) varptr,
   3680  1.1.1.3  mrg 			     &map_target_addr,
   3681  1.1.1.3  mrg 			     sizeof (map_target_addr));
   3682  1.1.1.3  mrg     }
   3683  1.1.1.3  mrg 
   3684  1.1.1.3  mrg   GCN_DEBUG ("Looking for variable %s\n", XSTRING (GOMP_ADDITIONAL_ICVS));
   3685  1.1.1.2  mrg 
   3686  1.1.1.2  mrg   hsa_status_t status;
   3687  1.1.1.2  mrg   hsa_executable_symbol_t var_symbol;
   3688  1.1.1.2  mrg   status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
   3689  1.1.1.3  mrg 						 XSTRING (GOMP_ADDITIONAL_ICVS),
   3690  1.1.1.2  mrg 						 agent->id, 0, &var_symbol);
   3691  1.1.1.2  mrg   if (status == HSA_STATUS_SUCCESS)
   3692  1.1.1.2  mrg     {
   3693  1.1.1.3  mrg       uint64_t varptr;
   3694  1.1.1.3  mrg       uint32_t varsize;
   3695  1.1.1.2  mrg 
   3696      1.1  mrg       status = hsa_fns.hsa_executable_symbol_get_info_fn
   3697  1.1.1.2  mrg 	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
   3698  1.1.1.3  mrg 	 &varptr);
   3699      1.1  mrg       if (status != HSA_STATUS_SUCCESS)
   3700      1.1  mrg 	hsa_fatal ("Could not extract a variable from its symbol", status);
   3701      1.1  mrg       status = hsa_fns.hsa_executable_symbol_get_info_fn
   3702  1.1.1.2  mrg 	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
   3703  1.1.1.3  mrg 	 &varsize);
   3704      1.1  mrg       if (status != HSA_STATUS_SUCCESS)
   3705  1.1.1.3  mrg 	hsa_fatal ("Could not extract a variable size from its symbol",
   3706  1.1.1.3  mrg 		   status);
   3707      1.1  mrg 
   3708  1.1.1.3  mrg       pair->start = varptr;
   3709  1.1.1.3  mrg       pair->end = varptr + varsize;
   3710      1.1  mrg     }
   3711  1.1.1.2  mrg   else
   3712  1.1.1.3  mrg     {
   3713  1.1.1.3  mrg       /* The variable was not in this image.  */
   3714  1.1.1.3  mrg       GCN_DEBUG ("Variable not found in image: %s\n",
   3715  1.1.1.3  mrg 		 XSTRING (GOMP_ADDITIONAL_ICVS));
   3716  1.1.1.3  mrg       pair->start = pair->end = 0;
   3717  1.1.1.3  mrg     }
   3718      1.1  mrg 
   3719      1.1  mrg   /* Ensure that constructors are run first.  */
   3720      1.1  mrg   struct GOMP_kernel_launch_attributes kla =
   3721      1.1  mrg     { 3,
   3722      1.1  mrg       /* Grid size.  */
   3723      1.1  mrg       { 1, 64, 1 },
   3724      1.1  mrg       /* Work-group size.  */
   3725      1.1  mrg       { 1, 64, 1 }
   3726      1.1  mrg     };
   3727      1.1  mrg 
   3728      1.1  mrg   if (module->init_array_func)
   3729      1.1  mrg     {
   3730      1.1  mrg       init_kernel (module->init_array_func);
   3731      1.1  mrg       run_kernel (module->init_array_func, NULL, &kla, NULL, false);
   3732      1.1  mrg     }
   3733      1.1  mrg   module->constructors_run_p = true;
   3734      1.1  mrg 
   3735      1.1  mrg   /* Don't report kernels that libgomp need not know about.  */
   3736      1.1  mrg   if (module->init_array_func)
   3737      1.1  mrg     kernel_count--;
   3738      1.1  mrg   if (module->fini_array_func)
   3739      1.1  mrg     kernel_count--;
   3740      1.1  mrg 
   3741  1.1.1.3  mrg   if (rev_fn_table != NULL && kernel_count == 0)
   3742  1.1.1.3  mrg     *rev_fn_table = NULL;
   3743  1.1.1.3  mrg   else if (rev_fn_table != NULL)
   3744  1.1.1.3  mrg     {
   3745  1.1.1.3  mrg       hsa_status_t status;
   3746  1.1.1.3  mrg       hsa_executable_symbol_t var_symbol;
   3747  1.1.1.3  mrg       status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
   3748  1.1.1.3  mrg 						     ".offload_func_table",
   3749  1.1.1.3  mrg 						     agent->id, 0, &var_symbol);
   3750  1.1.1.3  mrg       if (status != HSA_STATUS_SUCCESS)
   3751  1.1.1.3  mrg 	hsa_fatal ("Could not find symbol for variable in the code object",
   3752  1.1.1.3  mrg 		   status);
   3753  1.1.1.3  mrg       uint64_t fn_table_addr;
   3754  1.1.1.3  mrg       status = hsa_fns.hsa_executable_symbol_get_info_fn
   3755  1.1.1.3  mrg 	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
   3756  1.1.1.3  mrg 	 &fn_table_addr);
   3757  1.1.1.3  mrg       if (status != HSA_STATUS_SUCCESS)
   3758  1.1.1.3  mrg 	hsa_fatal ("Could not extract a variable from its symbol", status);
   3759  1.1.1.3  mrg       *rev_fn_table = GOMP_PLUGIN_malloc (kernel_count * sizeof (uint64_t));
   3760  1.1.1.3  mrg       GOMP_OFFLOAD_dev2host (agent->device_id, *rev_fn_table,
   3761  1.1.1.3  mrg 			     (void*) fn_table_addr,
   3762  1.1.1.3  mrg 			     kernel_count * sizeof (uint64_t));
   3763  1.1.1.3  mrg     }
   3764  1.1.1.3  mrg 
   3765  1.1.1.2  mrg   return kernel_count + var_count + other_count;
   3766      1.1  mrg }
   3767      1.1  mrg 
   3768      1.1  mrg /* Unload GCN object-code module described by struct gcn_image_desc in
   3769      1.1  mrg    TARGET_DATA from agent number N.  Return TRUE on success.  */
   3770      1.1  mrg 
   3771      1.1  mrg bool
   3772      1.1  mrg GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
   3773      1.1  mrg {
   3774      1.1  mrg   if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
   3775      1.1  mrg     {
   3776      1.1  mrg       GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
   3777      1.1  mrg 			 " (expected %u, received %u)",
   3778      1.1  mrg 			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
   3779      1.1  mrg       return false;
   3780      1.1  mrg     }
   3781      1.1  mrg 
   3782      1.1  mrg   struct agent_info *agent;
   3783      1.1  mrg   agent = get_agent_info (n);
   3784      1.1  mrg   if (!agent)
   3785      1.1  mrg     return false;
   3786      1.1  mrg 
   3787      1.1  mrg   if (pthread_rwlock_wrlock (&agent->module_rwlock))
   3788      1.1  mrg     {
   3789      1.1  mrg       GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
   3790      1.1  mrg       return false;
   3791      1.1  mrg     }
   3792      1.1  mrg 
   3793      1.1  mrg   if (!agent->module || agent->module->image_desc != target_data)
   3794      1.1  mrg     {
   3795      1.1  mrg       GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
   3796      1.1  mrg 			 "loaded before");
   3797      1.1  mrg       return false;
   3798      1.1  mrg     }
   3799      1.1  mrg 
   3800      1.1  mrg   if (!destroy_module (agent->module, true))
   3801      1.1  mrg     return false;
   3802      1.1  mrg   free (agent->module);
   3803      1.1  mrg   agent->module = NULL;
   3804      1.1  mrg   if (!destroy_hsa_program (agent))
   3805      1.1  mrg     return false;
   3806      1.1  mrg   if (pthread_rwlock_unlock (&agent->module_rwlock))
   3807      1.1  mrg     {
   3808      1.1  mrg       GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
   3809      1.1  mrg       return false;
   3810      1.1  mrg     }
   3811      1.1  mrg   return true;
   3812      1.1  mrg }
   3813      1.1  mrg 
   3814      1.1  mrg /* Deinitialize all information and status associated with agent number N.  We
   3815      1.1  mrg    do not attempt any synchronization, assuming the user and libgomp will not
   3816      1.1  mrg    attempt deinitialization of a device that is in any way being used at the
   3817      1.1  mrg    same time.  Return TRUE on success.  */
   3818      1.1  mrg 
   3819      1.1  mrg bool
   3820      1.1  mrg GOMP_OFFLOAD_fini_device (int n)
   3821      1.1  mrg {
   3822      1.1  mrg   struct agent_info *agent = get_agent_info (n);
   3823      1.1  mrg   if (!agent)
   3824      1.1  mrg     return false;
   3825      1.1  mrg 
   3826      1.1  mrg   if (!agent->initialized)
   3827      1.1  mrg     return true;
   3828      1.1  mrg 
   3829      1.1  mrg   if (agent->omp_async_queue)
   3830      1.1  mrg     {
   3831      1.1  mrg       GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
   3832      1.1  mrg       agent->omp_async_queue = NULL;
   3833      1.1  mrg     }
   3834      1.1  mrg 
   3835      1.1  mrg   if (agent->module)
   3836      1.1  mrg     {
   3837      1.1  mrg       if (!destroy_module (agent->module, false))
   3838      1.1  mrg 	return false;
   3839      1.1  mrg       free (agent->module);
   3840      1.1  mrg       agent->module = NULL;
   3841      1.1  mrg     }
   3842      1.1  mrg 
   3843  1.1.1.3  mrg   if (!destroy_ephemeral_memories (agent))
   3844      1.1  mrg     return false;
   3845      1.1  mrg 
   3846      1.1  mrg   if (!destroy_hsa_program (agent))
   3847      1.1  mrg     return false;
   3848      1.1  mrg 
   3849      1.1  mrg   hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
   3850      1.1  mrg   if (status != HSA_STATUS_SUCCESS)
   3851      1.1  mrg     return hsa_error ("Error destroying command queue", status);
   3852      1.1  mrg 
   3853      1.1  mrg   if (pthread_mutex_destroy (&agent->prog_mutex))
   3854      1.1  mrg     {
   3855      1.1  mrg       GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
   3856      1.1  mrg       return false;
   3857      1.1  mrg     }
   3858      1.1  mrg   if (pthread_rwlock_destroy (&agent->module_rwlock))
   3859      1.1  mrg     {
   3860      1.1  mrg       GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
   3861      1.1  mrg       return false;
   3862      1.1  mrg     }
   3863      1.1  mrg 
   3864      1.1  mrg   if (pthread_mutex_destroy (&agent->async_queues_mutex))
   3865      1.1  mrg     {
   3866      1.1  mrg       GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
   3867      1.1  mrg       return false;
   3868      1.1  mrg     }
   3869  1.1.1.3  mrg   if (pthread_mutex_destroy (&agent->ephemeral_memories_write_lock))
   3870      1.1  mrg     {
   3871  1.1.1.3  mrg       GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
   3872      1.1  mrg       return false;
   3873      1.1  mrg     }
   3874      1.1  mrg   agent->initialized = false;
   3875      1.1  mrg   return true;
   3876      1.1  mrg }
   3877      1.1  mrg 
   3878      1.1  mrg /* Return true if the HSA runtime can run function FN_PTR.  */
   3879      1.1  mrg 
   3880      1.1  mrg bool
   3881      1.1  mrg GOMP_OFFLOAD_can_run (void *fn_ptr)
   3882      1.1  mrg {
   3883      1.1  mrg   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
   3884      1.1  mrg 
   3885      1.1  mrg   init_kernel (kernel);
   3886      1.1  mrg   if (kernel->initialization_failed)
   3887  1.1.1.3  mrg     GOMP_PLUGIN_fatal ("kernel initialization failed");
   3888      1.1  mrg 
   3889      1.1  mrg   return true;
   3890      1.1  mrg }
   3891      1.1  mrg 
   3892      1.1  mrg /* Allocate memory on device N.  */
   3893      1.1  mrg 
   3894      1.1  mrg void *
   3895      1.1  mrg GOMP_OFFLOAD_alloc (int n, size_t size)
   3896      1.1  mrg {
   3897      1.1  mrg   struct agent_info *agent = get_agent_info (n);
   3898      1.1  mrg   return alloc_by_agent (agent, size);
   3899      1.1  mrg }
   3900      1.1  mrg 
   3901      1.1  mrg /* Free memory from device N.  */
   3902      1.1  mrg 
   3903      1.1  mrg bool
   3904      1.1  mrg GOMP_OFFLOAD_free (int device, void *ptr)
   3905      1.1  mrg {
   3906      1.1  mrg   GCN_DEBUG ("Freeing memory on device %d\n", device);
   3907      1.1  mrg 
   3908      1.1  mrg   hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
   3909      1.1  mrg   if (status != HSA_STATUS_SUCCESS)
   3910      1.1  mrg     {
   3911      1.1  mrg       hsa_error ("Could not free device memory", status);
   3912      1.1  mrg       return false;
   3913      1.1  mrg     }
   3914      1.1  mrg 
   3915      1.1  mrg   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
   3916      1.1  mrg   bool profiling_dispatch_p
   3917      1.1  mrg     = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
   3918      1.1  mrg   if (profiling_dispatch_p)
   3919      1.1  mrg     {
   3920      1.1  mrg       acc_prof_info *prof_info = thr->prof_info;
   3921      1.1  mrg       acc_event_info data_event_info;
   3922      1.1  mrg       acc_api_info *api_info = thr->api_info;
   3923      1.1  mrg 
   3924      1.1  mrg       prof_info->event_type = acc_ev_free;
   3925      1.1  mrg 
   3926      1.1  mrg       data_event_info.data_event.event_type = prof_info->event_type;
   3927      1.1  mrg       data_event_info.data_event.valid_bytes
   3928      1.1  mrg 	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
   3929      1.1  mrg       data_event_info.data_event.parent_construct
   3930      1.1  mrg 	= acc_construct_parallel;
   3931      1.1  mrg       data_event_info.data_event.implicit = 1;
   3932      1.1  mrg       data_event_info.data_event.tool_info = NULL;
   3933      1.1  mrg       data_event_info.data_event.var_name = NULL;
   3934      1.1  mrg       data_event_info.data_event.bytes = 0;
   3935      1.1  mrg       data_event_info.data_event.host_ptr = NULL;
   3936      1.1  mrg       data_event_info.data_event.device_ptr = (void *) ptr;
   3937      1.1  mrg 
   3938      1.1  mrg       api_info->device_api = acc_device_api_other;
   3939      1.1  mrg 
   3940      1.1  mrg       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
   3941      1.1  mrg 					    api_info);
   3942      1.1  mrg     }
   3943      1.1  mrg 
   3944      1.1  mrg   return true;
   3945      1.1  mrg }
   3946      1.1  mrg 
   3947      1.1  mrg /* Copy data from DEVICE to host.  */
   3948      1.1  mrg 
   3949      1.1  mrg bool
   3950      1.1  mrg GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
   3951      1.1  mrg {
   3952      1.1  mrg   GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
   3953      1.1  mrg 	     src, dst);
   3954      1.1  mrg   hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
   3955      1.1  mrg   if (status != HSA_STATUS_SUCCESS)
   3956      1.1  mrg     GOMP_PLUGIN_error ("memory copy failed");
   3957      1.1  mrg   return true;
   3958      1.1  mrg }
   3959      1.1  mrg 
   3960      1.1  mrg /* Copy data from host to DEVICE.  */
   3961      1.1  mrg 
   3962      1.1  mrg bool
   3963      1.1  mrg GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
   3964      1.1  mrg {
   3965      1.1  mrg   GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
   3966      1.1  mrg 	     device, dst);
   3967      1.1  mrg   hsa_memory_copy_wrapper (dst, src, n);
   3968      1.1  mrg   return true;
   3969      1.1  mrg }
   3970      1.1  mrg 
   3971      1.1  mrg /* Copy data within DEVICE.  Do the copy asynchronously, if appropriate.  */
   3972      1.1  mrg 
   3973      1.1  mrg bool
   3974      1.1  mrg GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
   3975      1.1  mrg {
   3976      1.1  mrg   struct gcn_thread *thread_data = gcn_thread ();
   3977      1.1  mrg 
   3978      1.1  mrg   if (thread_data && !async_synchronous_p (thread_data->async))
   3979      1.1  mrg     {
   3980      1.1  mrg       struct agent_info *agent = get_agent_info (device);
   3981      1.1  mrg       maybe_init_omp_async (agent);
   3982  1.1.1.2  mrg       queue_push_copy (agent->omp_async_queue, dst, src, n);
   3983      1.1  mrg       return true;
   3984      1.1  mrg     }
   3985      1.1  mrg 
   3986      1.1  mrg   GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
   3987      1.1  mrg 	     device, src, device, dst);
   3988      1.1  mrg   hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
   3989      1.1  mrg   if (status != HSA_STATUS_SUCCESS)
   3990      1.1  mrg     GOMP_PLUGIN_error ("memory copy failed");
   3991      1.1  mrg   return true;
   3992      1.1  mrg }
   3993      1.1  mrg 
   3994  1.1.1.3  mrg /* Here <quantity>_size refers to <quantity> multiplied by size -- i.e.
   3995  1.1.1.3  mrg    measured in bytes.  So we have:
   3996  1.1.1.3  mrg 
   3997  1.1.1.3  mrg    dim1_size: number of bytes to copy on innermost dimension ("row")
   3998  1.1.1.3  mrg    dim0_len: number of rows to copy
   3999  1.1.1.3  mrg    dst: base pointer for destination of copy
   4000  1.1.1.3  mrg    dst_offset1_size: innermost row offset (for dest), in bytes
   4001  1.1.1.3  mrg    dst_offset0_len: offset, number of rows (for dest)
   4002  1.1.1.3  mrg    dst_dim1_size: whole-array dest row length, in bytes (pitch)
   4003  1.1.1.3  mrg    src: base pointer for source of copy
   4004  1.1.1.3  mrg    src_offset1_size: innermost row offset (for source), in bytes
   4005  1.1.1.3  mrg    src_offset0_len: offset, number of rows (for source)
   4006  1.1.1.3  mrg    src_dim1_size: whole-array source row length, in bytes (pitch)
   4007  1.1.1.3  mrg */
   4008  1.1.1.3  mrg 
   4009  1.1.1.3  mrg int
   4010  1.1.1.3  mrg GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
   4011  1.1.1.3  mrg 		       size_t dim0_len, void *dst, size_t dst_offset1_size,
   4012  1.1.1.3  mrg 		       size_t dst_offset0_len, size_t dst_dim1_size,
   4013  1.1.1.3  mrg 		       const void *src, size_t src_offset1_size,
   4014  1.1.1.3  mrg 		       size_t src_offset0_len, size_t src_dim1_size)
   4015  1.1.1.3  mrg {
   4016  1.1.1.3  mrg   if (!hsa_fns.hsa_amd_memory_lock_fn
   4017  1.1.1.3  mrg       || !hsa_fns.hsa_amd_memory_unlock_fn
   4018  1.1.1.3  mrg       || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
   4019  1.1.1.3  mrg     return -1;
   4020  1.1.1.3  mrg 
   4021  1.1.1.3  mrg   /* GCN hardware requires 4-byte alignment for base addresses & pitches.  Bail
   4022  1.1.1.3  mrg      out quietly if we have anything oddly-aligned rather than letting the
   4023  1.1.1.3  mrg      driver raise an error.  */
   4024  1.1.1.3  mrg   if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
   4025  1.1.1.3  mrg     return -1;
   4026  1.1.1.3  mrg 
   4027  1.1.1.3  mrg   if ((dst_dim1_size & 3) != 0 || (src_dim1_size & 3) != 0)
   4028  1.1.1.3  mrg     return -1;
   4029  1.1.1.3  mrg 
   4030  1.1.1.3  mrg   /* Only handle host to device or device to host transfers here.  */
   4031  1.1.1.3  mrg   if ((dst_ord == -1 && src_ord == -1)
   4032  1.1.1.3  mrg       || (dst_ord != -1 && src_ord != -1))
   4033  1.1.1.3  mrg     return -1;
   4034  1.1.1.3  mrg 
   4035  1.1.1.3  mrg   hsa_amd_copy_direction_t dir
   4036  1.1.1.3  mrg     = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
   4037  1.1.1.3  mrg   hsa_agent_t copy_agent;
   4038  1.1.1.3  mrg 
   4039  1.1.1.3  mrg   /* We need to pin (lock) host memory before we start the transfer.  Try to
   4040  1.1.1.3  mrg      lock the minimum size necessary, i.e. using partial first/last rows of the
   4041  1.1.1.3  mrg      whole array.  Something like this:
   4042  1.1.1.3  mrg 
   4043  1.1.1.3  mrg 	 rows -->
   4044  1.1.1.3  mrg 	 ..............
   4045  1.1.1.3  mrg      c | ..#######+++++ <- first row apart from {src,dst}_offset1_size
   4046  1.1.1.3  mrg      o | ++#######+++++ <- whole row
   4047  1.1.1.3  mrg      l | ++#######+++++ <-     "
   4048  1.1.1.3  mrg      s v ++#######..... <- last row apart from trailing remainder
   4049  1.1.1.3  mrg 	 ..............
   4050  1.1.1.3  mrg 
   4051  1.1.1.3  mrg      We could split very large transfers into several rectangular copies, but
   4052  1.1.1.3  mrg      that is unimplemented for now.  */
   4053  1.1.1.3  mrg 
   4054  1.1.1.3  mrg   size_t bounded_size_host, first_elem_offset_host;
   4055  1.1.1.3  mrg   void *host_ptr;
   4056  1.1.1.3  mrg   if (dir == hsaHostToDevice)
   4057  1.1.1.3  mrg     {
   4058  1.1.1.3  mrg       bounded_size_host = src_dim1_size * (dim0_len - 1) + dim1_size;
   4059  1.1.1.3  mrg       first_elem_offset_host = src_offset0_len * src_dim1_size
   4060  1.1.1.3  mrg 			       + src_offset1_size;
   4061  1.1.1.3  mrg       host_ptr = (void *) src;
   4062  1.1.1.3  mrg       struct agent_info *agent = get_agent_info (dst_ord);
   4063  1.1.1.3  mrg       copy_agent = agent->id;
   4064  1.1.1.3  mrg     }
   4065  1.1.1.3  mrg   else
   4066  1.1.1.3  mrg     {
   4067  1.1.1.3  mrg       bounded_size_host = dst_dim1_size * (dim0_len - 1) + dim1_size;
   4068  1.1.1.3  mrg       first_elem_offset_host = dst_offset0_len * dst_dim1_size
   4069  1.1.1.3  mrg 			       + dst_offset1_size;
   4070  1.1.1.3  mrg       host_ptr = dst;
   4071  1.1.1.3  mrg       struct agent_info *agent = get_agent_info (src_ord);
   4072  1.1.1.3  mrg       copy_agent = agent->id;
   4073  1.1.1.3  mrg     }
   4074  1.1.1.3  mrg 
   4075  1.1.1.3  mrg   void *agent_ptr;
   4076  1.1.1.3  mrg 
   4077  1.1.1.3  mrg   hsa_status_t status
   4078  1.1.1.3  mrg     = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
   4079  1.1.1.3  mrg 				      bounded_size_host, NULL, 0, &agent_ptr);
   4080  1.1.1.3  mrg   /* We can't lock the host memory: don't give up though, we might still be
   4081  1.1.1.3  mrg      able to use the slow path in our caller.  So, don't make this an
   4082  1.1.1.3  mrg      error.  */
   4083  1.1.1.3  mrg   if (status != HSA_STATUS_SUCCESS)
   4084  1.1.1.3  mrg     return -1;
   4085  1.1.1.3  mrg 
   4086  1.1.1.3  mrg   hsa_pitched_ptr_t dstpp, srcpp;
   4087  1.1.1.3  mrg   hsa_dim3_t dst_offsets, src_offsets, ranges;
   4088  1.1.1.3  mrg 
   4089  1.1.1.3  mrg   int retval = 1;
   4090  1.1.1.3  mrg 
   4091  1.1.1.3  mrg   hsa_signal_t completion_signal;
   4092  1.1.1.3  mrg   status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
   4093  1.1.1.3  mrg   if (status != HSA_STATUS_SUCCESS)
   4094  1.1.1.3  mrg     {
   4095  1.1.1.3  mrg       retval = -1;
   4096  1.1.1.3  mrg       goto unlock;
   4097  1.1.1.3  mrg     }
   4098  1.1.1.3  mrg 
   4099  1.1.1.3  mrg   if (dir == hsaHostToDevice)
   4100  1.1.1.3  mrg     {
   4101  1.1.1.3  mrg       srcpp.base = agent_ptr - first_elem_offset_host;
   4102  1.1.1.3  mrg       dstpp.base = dst;
   4103  1.1.1.3  mrg     }
   4104  1.1.1.3  mrg   else
   4105  1.1.1.3  mrg     {
   4106  1.1.1.3  mrg       srcpp.base = (void *) src;
   4107  1.1.1.3  mrg       dstpp.base = agent_ptr - first_elem_offset_host;
   4108  1.1.1.3  mrg     }
   4109  1.1.1.3  mrg 
   4110  1.1.1.3  mrg   srcpp.pitch = src_dim1_size;
   4111  1.1.1.3  mrg   srcpp.slice = 0;
   4112  1.1.1.3  mrg 
   4113  1.1.1.3  mrg   src_offsets.x = src_offset1_size;
   4114  1.1.1.3  mrg   src_offsets.y = src_offset0_len;
   4115  1.1.1.3  mrg   src_offsets.z = 0;
   4116  1.1.1.3  mrg 
   4117  1.1.1.3  mrg   dstpp.pitch = dst_dim1_size;
   4118  1.1.1.3  mrg   dstpp.slice = 0;
   4119  1.1.1.3  mrg 
   4120  1.1.1.3  mrg   dst_offsets.x = dst_offset1_size;
   4121  1.1.1.3  mrg   dst_offsets.y = dst_offset0_len;
   4122  1.1.1.3  mrg   dst_offsets.z = 0;
   4123  1.1.1.3  mrg 
   4124  1.1.1.3  mrg   ranges.x = dim1_size;
   4125  1.1.1.3  mrg   ranges.y = dim0_len;
   4126  1.1.1.3  mrg   ranges.z = 1;
   4127  1.1.1.3  mrg 
   4128  1.1.1.3  mrg   status
   4129  1.1.1.3  mrg     = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
   4130  1.1.1.3  mrg 						 &src_offsets, &ranges,
   4131  1.1.1.3  mrg 						 copy_agent, dir, 0, NULL,
   4132  1.1.1.3  mrg 						 completion_signal);
   4133  1.1.1.3  mrg   /* If the rectangular copy fails, we might still be able to use the slow
   4134  1.1.1.3  mrg      path.  We need to unlock the host memory though, so don't return
   4135  1.1.1.3  mrg      immediately.  */
   4136  1.1.1.3  mrg   if (status != HSA_STATUS_SUCCESS)
   4137  1.1.1.3  mrg     retval = -1;
   4138  1.1.1.3  mrg   else
   4139  1.1.1.3  mrg     hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
   4140  1.1.1.3  mrg 					HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
   4141  1.1.1.3  mrg 					HSA_WAIT_STATE_ACTIVE);
   4142  1.1.1.3  mrg 
   4143  1.1.1.3  mrg   hsa_fns.hsa_signal_destroy_fn (completion_signal);
   4144  1.1.1.3  mrg 
   4145  1.1.1.3  mrg unlock:
   4146  1.1.1.3  mrg   status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
   4147  1.1.1.3  mrg   if (status != HSA_STATUS_SUCCESS)
   4148  1.1.1.3  mrg     hsa_fatal ("Could not unlock host memory", status);
   4149  1.1.1.3  mrg 
   4150  1.1.1.3  mrg   return retval;
   4151  1.1.1.3  mrg }
   4152  1.1.1.3  mrg 
   4153  1.1.1.3  mrg /* As above, <quantity>_size refers to <quantity> multiplied by size -- i.e.
   4154  1.1.1.3  mrg    measured in bytes.  So we have:
   4155  1.1.1.3  mrg 
   4156  1.1.1.3  mrg    dim2_size: number of bytes to copy on innermost dimension ("row")
   4157  1.1.1.3  mrg    dim1_len: number of rows per slice to copy
   4158  1.1.1.3  mrg    dim0_len: number of slices to copy
   4159  1.1.1.3  mrg    dst: base pointer for destination of copy
   4160  1.1.1.3  mrg    dst_offset2_size: innermost row offset (for dest), in bytes
   4161  1.1.1.3  mrg    dst_offset1_len: offset, number of rows (for dest)
   4162  1.1.1.3  mrg    dst_offset0_len: offset, number of slices (for dest)
   4163  1.1.1.3  mrg    dst_dim2_size: whole-array dest row length, in bytes (pitch)
   4164  1.1.1.3  mrg    dst_dim1_len: whole-array number of rows in slice (for dest)
   4165  1.1.1.3  mrg    src: base pointer for source of copy
   4166  1.1.1.3  mrg    src_offset2_size: innermost row offset (for source), in bytes
   4167  1.1.1.3  mrg    src_offset1_len: offset, number of rows (for source)
   4168  1.1.1.3  mrg    src_offset0_len: offset, number of slices (for source)
   4169  1.1.1.3  mrg    src_dim2_size: whole-array source row length, in bytes (pitch)
   4170  1.1.1.3  mrg    src_dim1_len: whole-array number of rows in slice (for source)
   4171  1.1.1.3  mrg */
   4172  1.1.1.3  mrg 
   4173  1.1.1.3  mrg int
   4174  1.1.1.3  mrg GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
   4175  1.1.1.3  mrg 		       size_t dim1_len, size_t dim0_len, void *dst,
   4176  1.1.1.3  mrg 		       size_t dst_offset2_size, size_t dst_offset1_len,
   4177  1.1.1.3  mrg 		       size_t dst_offset0_len, size_t dst_dim2_size,
   4178  1.1.1.3  mrg 		       size_t dst_dim1_len, const void *src,
   4179  1.1.1.3  mrg 		       size_t src_offset2_size, size_t src_offset1_len,
   4180  1.1.1.3  mrg 		       size_t src_offset0_len, size_t src_dim2_size,
   4181  1.1.1.3  mrg 		       size_t src_dim1_len)
   4182  1.1.1.3  mrg {
   4183  1.1.1.3  mrg   if (!hsa_fns.hsa_amd_memory_lock_fn
   4184  1.1.1.3  mrg       || !hsa_fns.hsa_amd_memory_unlock_fn
   4185  1.1.1.3  mrg       || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
   4186  1.1.1.3  mrg     return -1;
   4187  1.1.1.3  mrg 
   4188  1.1.1.3  mrg   /* GCN hardware requires 4-byte alignment for base addresses & pitches.  Bail
   4189  1.1.1.3  mrg      out quietly if we have anything oddly-aligned rather than letting the
   4190  1.1.1.3  mrg      driver raise an error.  */
   4191  1.1.1.3  mrg   if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
   4192  1.1.1.3  mrg     return -1;
   4193  1.1.1.3  mrg 
   4194  1.1.1.3  mrg   if ((dst_dim2_size & 3) != 0 || (src_dim2_size & 3) != 0)
   4195  1.1.1.3  mrg     return -1;
   4196  1.1.1.3  mrg 
   4197  1.1.1.3  mrg   /* Only handle host to device or device to host transfers here.  */
   4198  1.1.1.3  mrg   if ((dst_ord == -1 && src_ord == -1)
   4199  1.1.1.3  mrg       || (dst_ord != -1 && src_ord != -1))
   4200  1.1.1.3  mrg     return -1;
   4201  1.1.1.3  mrg 
   4202  1.1.1.3  mrg   hsa_amd_copy_direction_t dir
   4203  1.1.1.3  mrg     = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
   4204  1.1.1.3  mrg   hsa_agent_t copy_agent;
   4205  1.1.1.3  mrg 
   4206  1.1.1.3  mrg   /* We need to pin (lock) host memory before we start the transfer.  Try to
   4207  1.1.1.3  mrg      lock the minimum size necessary, i.e. using partial first/last slices of
   4208  1.1.1.3  mrg      the whole 3D array.  Something like this:
   4209  1.1.1.3  mrg 
   4210  1.1.1.3  mrg 	 slice 0:          slice 1:	     slice 2:
   4211  1.1.1.3  mrg 	     __________        __________        __________
   4212  1.1.1.3  mrg        ^    /+++++++++/    :  /+++++++++/    :	/         /
   4213  1.1.1.3  mrg     column /+++##++++/|    | /+++##++++/|    | /+++##    /  # = subarray
   4214  1.1.1.3  mrg      /	  /   ##++++/ |    |/+++##++++/ |    |/+++##++++/   + = area to pin
   4215  1.1.1.3  mrg 	 /_________/  :    /_________/  :    /_________/
   4216  1.1.1.3  mrg 	row --->
   4217  1.1.1.3  mrg 
   4218  1.1.1.3  mrg      We could split very large transfers into several rectangular copies, but
   4219  1.1.1.3  mrg      that is unimplemented for now.  */
   4220  1.1.1.3  mrg 
   4221  1.1.1.3  mrg   size_t bounded_size_host, first_elem_offset_host;
   4222  1.1.1.3  mrg   void *host_ptr;
   4223  1.1.1.3  mrg   if (dir == hsaHostToDevice)
   4224  1.1.1.3  mrg     {
   4225  1.1.1.3  mrg       size_t slice_bytes = src_dim2_size * src_dim1_len;
   4226  1.1.1.3  mrg       bounded_size_host = slice_bytes * (dim0_len - 1)
   4227  1.1.1.3  mrg 			  + src_dim2_size * (dim1_len - 1)
   4228  1.1.1.3  mrg 			  + dim2_size;
   4229  1.1.1.3  mrg       first_elem_offset_host = src_offset0_len * slice_bytes
   4230  1.1.1.3  mrg 			       + src_offset1_len * src_dim2_size
   4231  1.1.1.3  mrg 			       + src_offset2_size;
   4232  1.1.1.3  mrg       host_ptr = (void *) src;
   4233  1.1.1.3  mrg       struct agent_info *agent = get_agent_info (dst_ord);
   4234  1.1.1.3  mrg       copy_agent = agent->id;
   4235  1.1.1.3  mrg     }
   4236  1.1.1.3  mrg   else
   4237  1.1.1.3  mrg     {
   4238  1.1.1.3  mrg       size_t slice_bytes = dst_dim2_size * dst_dim1_len;
   4239  1.1.1.3  mrg       bounded_size_host = slice_bytes * (dim0_len - 1)
   4240  1.1.1.3  mrg 			  + dst_dim2_size * (dim1_len - 1)
   4241  1.1.1.3  mrg 			  + dim2_size;
   4242  1.1.1.3  mrg       first_elem_offset_host = dst_offset0_len * slice_bytes
   4243  1.1.1.3  mrg 			       + dst_offset1_len * dst_dim2_size
   4244  1.1.1.3  mrg 			       + dst_offset2_size;
   4245  1.1.1.3  mrg       host_ptr = dst;
   4246  1.1.1.3  mrg       struct agent_info *agent = get_agent_info (src_ord);
   4247  1.1.1.3  mrg       copy_agent = agent->id;
   4248  1.1.1.3  mrg     }
   4249  1.1.1.3  mrg 
   4250  1.1.1.3  mrg   void *agent_ptr;
   4251  1.1.1.3  mrg 
   4252  1.1.1.3  mrg   hsa_status_t status
   4253  1.1.1.3  mrg     = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
   4254  1.1.1.3  mrg 				      bounded_size_host, NULL, 0, &agent_ptr);
   4255  1.1.1.3  mrg   /* We can't lock the host memory: don't give up though, we might still be
   4256  1.1.1.3  mrg      able to use the slow path in our caller (maybe even with iterated memcpy2d
   4257  1.1.1.3  mrg      calls).  So, don't make this an error.  */
   4258  1.1.1.3  mrg   if (status != HSA_STATUS_SUCCESS)
   4259  1.1.1.3  mrg     return -1;
   4260  1.1.1.3  mrg 
   4261  1.1.1.3  mrg   hsa_pitched_ptr_t dstpp, srcpp;
   4262  1.1.1.3  mrg   hsa_dim3_t dst_offsets, src_offsets, ranges;
   4263  1.1.1.3  mrg 
   4264  1.1.1.3  mrg   int retval = 1;
   4265  1.1.1.3  mrg 
   4266  1.1.1.3  mrg   hsa_signal_t completion_signal;
   4267  1.1.1.3  mrg   status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
   4268  1.1.1.3  mrg   if (status != HSA_STATUS_SUCCESS)
   4269  1.1.1.3  mrg     {
   4270  1.1.1.3  mrg       retval = -1;
   4271  1.1.1.3  mrg       goto unlock;
   4272  1.1.1.3  mrg     }
   4273  1.1.1.3  mrg 
   4274  1.1.1.3  mrg   if (dir == hsaHostToDevice)
   4275  1.1.1.3  mrg     {
   4276  1.1.1.3  mrg       srcpp.base = agent_ptr - first_elem_offset_host;
   4277  1.1.1.3  mrg       dstpp.base = dst;
   4278  1.1.1.3  mrg     }
   4279  1.1.1.3  mrg   else
   4280  1.1.1.3  mrg     {
   4281  1.1.1.3  mrg       srcpp.base = (void *) src;
   4282  1.1.1.3  mrg       dstpp.base = agent_ptr - first_elem_offset_host;
   4283  1.1.1.3  mrg     }
   4284  1.1.1.3  mrg 
   4285  1.1.1.3  mrg   /* Pitch is measured in bytes.  */
   4286  1.1.1.3  mrg   srcpp.pitch = src_dim2_size;
   4287  1.1.1.3  mrg   /* Slice is also measured in bytes (i.e. total per-slice).  */
   4288  1.1.1.3  mrg   srcpp.slice = src_dim2_size * src_dim1_len;
   4289  1.1.1.3  mrg 
   4290  1.1.1.3  mrg   src_offsets.x = src_offset2_size;
   4291  1.1.1.3  mrg   src_offsets.y = src_offset1_len;
   4292  1.1.1.3  mrg   src_offsets.z = src_offset0_len;
   4293  1.1.1.3  mrg 
   4294  1.1.1.3  mrg   /* As above.  */
   4295  1.1.1.3  mrg   dstpp.pitch = dst_dim2_size;
   4296  1.1.1.3  mrg   dstpp.slice = dst_dim2_size * dst_dim1_len;
   4297  1.1.1.3  mrg 
   4298  1.1.1.3  mrg   dst_offsets.x = dst_offset2_size;
   4299  1.1.1.3  mrg   dst_offsets.y = dst_offset1_len;
   4300  1.1.1.3  mrg   dst_offsets.z = dst_offset0_len;
   4301  1.1.1.3  mrg 
   4302  1.1.1.3  mrg   ranges.x = dim2_size;
   4303  1.1.1.3  mrg   ranges.y = dim1_len;
   4304  1.1.1.3  mrg   ranges.z = dim0_len;
   4305  1.1.1.3  mrg 
   4306  1.1.1.3  mrg   status
   4307  1.1.1.3  mrg     = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
   4308  1.1.1.3  mrg 						 &src_offsets, &ranges,
   4309  1.1.1.3  mrg 						 copy_agent, dir, 0, NULL,
   4310  1.1.1.3  mrg 						 completion_signal);
   4311  1.1.1.3  mrg   /* If the rectangular copy fails, we might still be able to use the slow
   4312  1.1.1.3  mrg      path.  We need to unlock the host memory though, so don't return
   4313  1.1.1.3  mrg      immediately.  */
   4314  1.1.1.3  mrg   if (status != HSA_STATUS_SUCCESS)
   4315  1.1.1.3  mrg     retval = -1;
   4316  1.1.1.3  mrg   else
   4317  1.1.1.3  mrg     {
   4318  1.1.1.3  mrg       hsa_signal_value_t sv
   4319  1.1.1.3  mrg 	= hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
   4320  1.1.1.3  mrg 					      HSA_SIGNAL_CONDITION_LT, 1,
   4321  1.1.1.3  mrg 					      UINT64_MAX,
   4322  1.1.1.3  mrg 					      HSA_WAIT_STATE_ACTIVE);
   4323  1.1.1.3  mrg       if (sv < 0)
   4324  1.1.1.3  mrg 	{
   4325  1.1.1.3  mrg 	  GCN_WARNING ("async copy rect failure");
   4326  1.1.1.3  mrg 	  retval = -1;
   4327  1.1.1.3  mrg 	}
   4328  1.1.1.3  mrg     }
   4329  1.1.1.3  mrg 
   4330  1.1.1.3  mrg   hsa_fns.hsa_signal_destroy_fn (completion_signal);
   4331  1.1.1.3  mrg 
   4332  1.1.1.3  mrg unlock:
   4333  1.1.1.3  mrg   status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
   4334  1.1.1.3  mrg   if (status != HSA_STATUS_SUCCESS)
   4335  1.1.1.3  mrg     hsa_fatal ("Could not unlock host memory", status);
   4336  1.1.1.3  mrg 
   4337  1.1.1.3  mrg   return retval;
   4338  1.1.1.3  mrg }
   4339  1.1.1.3  mrg 
   4340      1.1  mrg /* }}}  */
   4341      1.1  mrg /* {{{ OpenMP Plugin API  */
   4342      1.1  mrg 
   4343      1.1  mrg /* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
   4344      1.1  mrg    in VARS as a parameter.  The kernel is identified by FN_PTR which must point
   4345      1.1  mrg    to a kernel_info structure, and must have previously been loaded to the
   4346      1.1  mrg    specified device.  */
   4347      1.1  mrg 
   4348      1.1  mrg void
   4349      1.1  mrg GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
   4350      1.1  mrg {
   4351      1.1  mrg   struct agent_info *agent = get_agent_info (device);
   4352      1.1  mrg   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
   4353      1.1  mrg   struct GOMP_kernel_launch_attributes def;
   4354      1.1  mrg   struct GOMP_kernel_launch_attributes *kla;
   4355      1.1  mrg   assert (agent == kernel->agent);
   4356      1.1  mrg 
   4357      1.1  mrg   /* If we get here then the kernel must be OpenMP.  */
   4358      1.1  mrg   kernel->kind = KIND_OPENMP;
   4359      1.1  mrg 
   4360      1.1  mrg   if (!parse_target_attributes (args, &def, &kla, agent))
   4361      1.1  mrg     {
   4362      1.1  mrg       GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
   4363      1.1  mrg       return;
   4364      1.1  mrg     }
   4365      1.1  mrg   run_kernel (kernel, vars, kla, NULL, false);
   4366      1.1  mrg }
   4367      1.1  mrg 
   4368      1.1  mrg /* Run an asynchronous OpenMP kernel on DEVICE.  This is similar to
   4369      1.1  mrg    GOMP_OFFLOAD_run except that the launch is queued and there is a call to
   4370      1.1  mrg    GOMP_PLUGIN_target_task_completion when it has finished.  */
   4371      1.1  mrg 
   4372      1.1  mrg void
   4373      1.1  mrg GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
   4374      1.1  mrg 			void **args, void *async_data)
   4375      1.1  mrg {
   4376      1.1  mrg   GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
   4377      1.1  mrg   struct agent_info *agent = get_agent_info (device);
   4378      1.1  mrg   struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
   4379      1.1  mrg   struct GOMP_kernel_launch_attributes def;
   4380      1.1  mrg   struct GOMP_kernel_launch_attributes *kla;
   4381      1.1  mrg   assert (agent == kernel->agent);
   4382      1.1  mrg 
   4383      1.1  mrg   /* If we get here then the kernel must be OpenMP.  */
   4384      1.1  mrg   kernel->kind = KIND_OPENMP;
   4385      1.1  mrg 
   4386      1.1  mrg   if (!parse_target_attributes (args, &def, &kla, agent))
   4387      1.1  mrg     {
   4388      1.1  mrg       GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
   4389      1.1  mrg       return;
   4390      1.1  mrg     }
   4391      1.1  mrg 
   4392      1.1  mrg   maybe_init_omp_async (agent);
   4393      1.1  mrg   queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
   4394      1.1  mrg   queue_push_callback (agent->omp_async_queue,
   4395      1.1  mrg 		       GOMP_PLUGIN_target_task_completion, async_data);
   4396      1.1  mrg }
   4397      1.1  mrg 
   4398      1.1  mrg /* }}} */
   4399      1.1  mrg /* {{{ OpenACC Plugin API  */
   4400      1.1  mrg 
   4401      1.1  mrg /* Run a synchronous OpenACC kernel.  The device number is inferred from the
   4402      1.1  mrg    already-loaded KERNEL.  */
   4403      1.1  mrg 
   4404      1.1  mrg void
   4405  1.1.1.3  mrg GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *),
   4406  1.1.1.3  mrg 			   size_t mapnum __attribute__((unused)),
   4407  1.1.1.3  mrg 			   void **hostaddrs __attribute__((unused)),
   4408  1.1.1.3  mrg 			   void **devaddrs, unsigned *dims,
   4409      1.1  mrg 			   void *targ_mem_desc)
   4410      1.1  mrg {
   4411      1.1  mrg   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
   4412      1.1  mrg 
   4413  1.1.1.3  mrg   gcn_exec (kernel, devaddrs, dims, targ_mem_desc, false, NULL);
   4414      1.1  mrg }
   4415      1.1  mrg 
   4416      1.1  mrg /* Run an asynchronous OpenACC kernel on the specified queue.  */
   4417      1.1  mrg 
   4418      1.1  mrg void
   4419  1.1.1.3  mrg GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *),
   4420  1.1.1.3  mrg 				 size_t mapnum __attribute__((unused)),
   4421  1.1.1.3  mrg 				 void **hostaddrs __attribute__((unused)),
   4422  1.1.1.3  mrg 				 void **devaddrs,
   4423      1.1  mrg 				 unsigned *dims, void *targ_mem_desc,
   4424      1.1  mrg 				 struct goacc_asyncqueue *aq)
   4425      1.1  mrg {
   4426      1.1  mrg   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
   4427      1.1  mrg 
   4428  1.1.1.3  mrg   gcn_exec (kernel, devaddrs, dims, targ_mem_desc, true, aq);
   4429      1.1  mrg }
   4430      1.1  mrg 
   4431      1.1  mrg /* Create a new asynchronous thread and queue for running future kernels.  */
   4432      1.1  mrg 
   4433      1.1  mrg struct goacc_asyncqueue *
   4434      1.1  mrg GOMP_OFFLOAD_openacc_async_construct (int device)
   4435      1.1  mrg {
   4436      1.1  mrg   struct agent_info *agent = get_agent_info (device);
   4437      1.1  mrg 
   4438      1.1  mrg   pthread_mutex_lock (&agent->async_queues_mutex);
   4439      1.1  mrg 
   4440      1.1  mrg   struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
   4441      1.1  mrg   aq->agent = get_agent_info (device);
   4442      1.1  mrg   aq->prev = NULL;
   4443      1.1  mrg   aq->next = agent->async_queues;
   4444      1.1  mrg   if (aq->next)
   4445      1.1  mrg     {
   4446      1.1  mrg       aq->next->prev = aq;
   4447      1.1  mrg       aq->id = aq->next->id + 1;
   4448      1.1  mrg     }
   4449      1.1  mrg   else
   4450      1.1  mrg     aq->id = 1;
   4451      1.1  mrg   agent->async_queues = aq;
   4452      1.1  mrg 
   4453      1.1  mrg   aq->queue_first = 0;
   4454      1.1  mrg   aq->queue_n = 0;
   4455      1.1  mrg   aq->drain_queue_stop = 0;
   4456      1.1  mrg 
   4457      1.1  mrg   if (pthread_mutex_init (&aq->mutex, NULL))
   4458      1.1  mrg     {
   4459      1.1  mrg       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
   4460      1.1  mrg       return false;
   4461      1.1  mrg     }
   4462      1.1  mrg   if (pthread_cond_init (&aq->queue_cond_in, NULL))
   4463      1.1  mrg     {
   4464      1.1  mrg       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
   4465      1.1  mrg       return false;
   4466      1.1  mrg     }
   4467      1.1  mrg   if (pthread_cond_init (&aq->queue_cond_out, NULL))
   4468      1.1  mrg     {
   4469      1.1  mrg       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
   4470      1.1  mrg       return false;
   4471      1.1  mrg     }
   4472      1.1  mrg 
   4473      1.1  mrg   hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
   4474      1.1  mrg 						     ASYNC_QUEUE_SIZE,
   4475      1.1  mrg 						     HSA_QUEUE_TYPE_MULTI,
   4476      1.1  mrg 						     hsa_queue_callback, NULL,
   4477      1.1  mrg 						     UINT32_MAX, UINT32_MAX,
   4478      1.1  mrg 						     &aq->hsa_queue);
   4479      1.1  mrg   if (status != HSA_STATUS_SUCCESS)
   4480      1.1  mrg     hsa_fatal ("Error creating command queue", status);
   4481      1.1  mrg 
   4482      1.1  mrg   int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
   4483      1.1  mrg   if (err != 0)
   4484      1.1  mrg     GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
   4485      1.1  mrg 		       strerror (err));
   4486      1.1  mrg   GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
   4487      1.1  mrg 	     aq->id);
   4488      1.1  mrg 
   4489      1.1  mrg   pthread_mutex_unlock (&agent->async_queues_mutex);
   4490      1.1  mrg 
   4491      1.1  mrg   return aq;
   4492      1.1  mrg }
   4493      1.1  mrg 
/* Destroy an existing asynchronous thread and queue.  Waits for any
   currently-running task to complete, but cancels any queued tasks.
   Returns true on success, false if any teardown step fails -- in which
   case AQ is left on the agent's list with some primitives already
   destroyed.  */

bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = aq->agent;

  /* Stop and join the drain thread first; nothing may still be consuming
     the queue while we tear it down.  */
  finalize_async_thread (aq);

  /* The agent's queue list is modified below; hold its mutex throughout.  */
  pthread_mutex_lock (&agent->async_queues_mutex);

  int err;
  if ((err = pthread_mutex_destroy (&aq->mutex)))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_in))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_out))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Error destroying command queue", status);
      goto fail;
    }

  /* Unlink AQ from the agent's doubly-linked list of queues, fixing the
     list head if AQ was the first element.  */
  if (aq->prev)
    aq->prev->next = aq->next;
  if (aq->next)
    aq->next->prev = aq->prev;
  if (agent->async_queues == aq)
    agent->async_queues = aq->next;

  GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);

  free (aq);
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return true;

fail:
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return false;
}
   4546      1.1  mrg 
/* Return non-zero if the specified async queue is currently empty, i.e.
   all previously queued tasks have been consumed.  */

int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  return queue_empty (aq);
}
   4554      1.1  mrg 
/* Block until the specified queue has executed all its tasks and the
   queue is empty.  Always reports success.  */

bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  wait_queue (aq);
  return true;
}
   4564      1.1  mrg 
   4565      1.1  mrg /* Add a serialization point across two async queues. Any new tasks added to
   4566      1.1  mrg    AQ2, after this call, will not run until all tasks on AQ1, at the time
   4567      1.1  mrg    of this call, have completed.  */
   4568      1.1  mrg 
   4569      1.1  mrg bool
   4570      1.1  mrg GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
   4571      1.1  mrg 				      struct goacc_asyncqueue *aq2)
   4572      1.1  mrg {
   4573      1.1  mrg   /* For serialize, stream aq2 waits for aq1 to complete work that has been
   4574      1.1  mrg      scheduled to run on it up to this point.  */
   4575      1.1  mrg   if (aq1 != aq2)
   4576      1.1  mrg     {
   4577      1.1  mrg       struct placeholder *placeholderp = queue_push_placeholder (aq1);
   4578      1.1  mrg       queue_push_asyncwait (aq2, placeholderp);
   4579      1.1  mrg     }
   4580      1.1  mrg   return true;
   4581      1.1  mrg }
   4582      1.1  mrg 
/* Add an opaque callback to the given async queue: FN is pushed onto AQ
   and invoked with DATA when the queue's drain thread reaches it.  */

void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
					   void (*fn) (void *), void *data)
{
  queue_push_callback (aq, fn, data);
}
   4591      1.1  mrg 
/* Queue up an asynchronous data copy of N bytes from host SRC to DEVICE
   address DST on queue AQ.  Always reports success (the copy itself runs
   later on the queue's drain thread).  */

bool
GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  /* NOTE(review): the direction is not passed to queue_push_copy --
     presumably it works for either direction on this shared-address-space
     target; confirm against queue_push_copy's definition.  */
  queue_push_copy (aq, dst, src, n);
  return true;
}
   4603      1.1  mrg 
/* Queue up an asynchronous data copy of N bytes from DEVICE address SRC to
   host DST on queue AQ.  Mirrors GOMP_OFFLOAD_openacc_async_host2dev;
   always reports success.  */

bool
GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  queue_push_copy (aq, dst, src, n);
  return true;
}
   4615      1.1  mrg 
   4616      1.1  mrg union goacc_property_value
   4617      1.1  mrg GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
   4618      1.1  mrg {
   4619      1.1  mrg   struct agent_info *agent = get_agent_info (device);
   4620      1.1  mrg 
   4621      1.1  mrg   union goacc_property_value propval = { .val = 0 };
   4622      1.1  mrg 
   4623      1.1  mrg   switch (prop)
   4624      1.1  mrg     {
   4625      1.1  mrg     case GOACC_PROPERTY_FREE_MEMORY:
   4626      1.1  mrg       /* Not supported. */
   4627      1.1  mrg       break;
   4628      1.1  mrg     case GOACC_PROPERTY_MEMORY:
   4629      1.1  mrg       {
   4630      1.1  mrg 	size_t size;
   4631      1.1  mrg 	hsa_region_t region = agent->data_region;
   4632      1.1  mrg 	hsa_status_t status =
   4633      1.1  mrg 	  hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
   4634      1.1  mrg 	if (status == HSA_STATUS_SUCCESS)
   4635      1.1  mrg 	  propval.val = size;
   4636      1.1  mrg 	break;
   4637      1.1  mrg       }
   4638      1.1  mrg     case GOACC_PROPERTY_NAME:
   4639      1.1  mrg       propval.ptr = agent->name;
   4640      1.1  mrg       break;
   4641      1.1  mrg     case GOACC_PROPERTY_VENDOR:
   4642      1.1  mrg       propval.ptr = agent->vendor_name;
   4643      1.1  mrg       break;
   4644      1.1  mrg     case GOACC_PROPERTY_DRIVER:
   4645      1.1  mrg       propval.ptr = hsa_context.driver_version_s;
   4646      1.1  mrg       break;
   4647      1.1  mrg     }
   4648      1.1  mrg 
   4649      1.1  mrg   return propval;
   4650      1.1  mrg }
   4651      1.1  mrg 
   4652      1.1  mrg /* Set up plugin-specific thread-local-data (host-side).  */
   4653      1.1  mrg 
   4654      1.1  mrg void *
   4655      1.1  mrg GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
   4656      1.1  mrg {
   4657      1.1  mrg   struct gcn_thread *thread_data
   4658      1.1  mrg     = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));
   4659      1.1  mrg 
   4660      1.1  mrg   thread_data->async = GOMP_ASYNC_SYNC;
   4661      1.1  mrg 
   4662      1.1  mrg   return (void *) thread_data;
   4663      1.1  mrg }
   4664      1.1  mrg 
/* Clean up plugin-specific thread-local-data.  DATA was allocated by
   GOMP_OFFLOAD_openacc_create_thread_data; a plain free suffices since
   struct gcn_thread holds no other resources here.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}
   4672      1.1  mrg 
   4673      1.1  mrg /* }}} */
   4674