1 1.1 mrg /* Plugin for AMD GCN execution. 2 1.1 mrg 3 1.1.1.3 mrg Copyright (C) 2013-2024 Free Software Foundation, Inc. 4 1.1 mrg 5 1.1 mrg Contributed by Mentor Embedded 6 1.1 mrg 7 1.1 mrg This file is part of the GNU Offloading and Multi Processing Library 8 1.1 mrg (libgomp). 9 1.1 mrg 10 1.1 mrg Libgomp is free software; you can redistribute it and/or modify it 11 1.1 mrg under the terms of the GNU General Public License as published by 12 1.1 mrg the Free Software Foundation; either version 3, or (at your option) 13 1.1 mrg any later version. 14 1.1 mrg 15 1.1 mrg Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY 16 1.1 mrg WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 17 1.1 mrg FOR A PARTICULAR PURPOSE. See the GNU General Public License for 18 1.1 mrg more details. 19 1.1 mrg 20 1.1 mrg Under Section 7 of GPL version 3, you are granted additional 21 1.1 mrg permissions described in the GCC Runtime Library Exception, version 22 1.1 mrg 3.1, as published by the Free Software Foundation. 23 1.1 mrg 24 1.1 mrg You should have received a copy of the GNU General Public License and 25 1.1 mrg a copy of the GCC Runtime Library Exception along with this program; 26 1.1 mrg see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 27 1.1 mrg <http://www.gnu.org/licenses/>. */ 28 1.1 mrg 29 1.1 mrg /* {{{ Includes and defines */ 30 1.1 mrg 31 1.1 mrg #include "config.h" 32 1.1.1.2 mrg #include "symcat.h" 33 1.1 mrg #include <stdio.h> 34 1.1 mrg #include <stdlib.h> 35 1.1 mrg #include <string.h> 36 1.1 mrg #include <pthread.h> 37 1.1 mrg #include <inttypes.h> 38 1.1 mrg #include <stdbool.h> 39 1.1 mrg #include <limits.h> 40 1.1 mrg #include <hsa.h> 41 1.1.1.2 mrg #include <hsa_ext_amd.h> 42 1.1 mrg #include <dlfcn.h> 43 1.1 mrg #include <signal.h> 44 1.1 mrg #include "libgomp-plugin.h" 45 1.1.1.3 mrg #include "config/gcn/libgomp-gcn.h" /* For struct output. 
*/
#include "gomp-constants.h"
#include <elf.h>
#include "oacc-plugin.h"
#include "oacc-int.h"
#include <assert.h>

/* AMDGPU relocation types.  These probably won't be in elf.h for a while.
   Note: value 12 is intentionally absent; R_AMDGPU_RELATIVE64 is 13.  */
#ifndef R_AMDGPU_NONE
#define R_AMDGPU_NONE		0
#define R_AMDGPU_ABS32_LO	1	/* (S + A) & 0xFFFFFFFF  */
#define R_AMDGPU_ABS32_HI	2	/* (S + A) >> 32  */
#define R_AMDGPU_ABS64		3	/* S + A  */
#define R_AMDGPU_REL32		4	/* S + A - P  */
#define R_AMDGPU_REL64		5	/* S + A - P  */
#define R_AMDGPU_ABS32		6	/* S + A  */
#define R_AMDGPU_GOTPCREL	7	/* G + GOT + A - P  */
#define R_AMDGPU_GOTPCREL32_LO	8	/* (G + GOT + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_GOTPCREL32_HI	9	/* (G + GOT + A - P) >> 32  */
#define R_AMDGPU_REL32_LO	10	/* (S + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_REL32_HI	11	/* (S + A - P) >> 32  */
#define R_AMDGPU_RELATIVE64	13	/* B + A  */
#endif

/* GCN specific definitions for asynchronous queues.  */

#define ASYNC_QUEUE_SIZE 64
#define DRAIN_QUEUE_SYNCHRONOUS_P false
#define DEBUG_QUEUES 0
#define DEBUG_THREAD_SLEEP 0
#define DEBUG_THREAD_SIGNAL 0

/* Defaults.  */
#define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100MB.  */

/* Secure getenv() which returns NULL if running as SUID/SGID.
*/
#ifndef HAVE_SECURE_GETENV
#ifdef HAVE___SECURE_GETENV
#define secure_getenv __secure_getenv
#elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
  && defined(HAVE_GETGID) && defined(HAVE_GETEGID)

#include <unistd.h>

/* Implementation of secure_getenv() for targets where it is not provided but
   we have at least means to test real and effective IDs.  */

static char *
secure_getenv (const char *name)
{
  /* Only consult the environment when real and effective IDs match, i.e.
     the process is not running SUID/SGID.  */
  if ((getuid () == geteuid ()) && (getgid () == getegid ()))
    return getenv (name);
  else
    return NULL;
}

#else
#define secure_getenv getenv
#endif
#endif

/* }}}  */
/* {{{ Types  */

/* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.  */

struct gcn_thread
{
  /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
  int async;
};

/* As an HSA runtime is dlopened, following structure defines function
   pointers utilized by the HSA plug-in.  */

struct hsa_runtime_fn_info
{
  /* HSA runtime.  */
  hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
					const char **status_string);
  hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
					 hsa_agent_info_t attribute,
					 void *value);
  hsa_status_t (*hsa_isa_get_info_fn)(hsa_isa_t isa,
				      hsa_isa_info_t attribute,
				      uint32_t index,
				      void *value);
  hsa_status_t (*hsa_init_fn) (void);
  hsa_status_t (*hsa_iterate_agents_fn)
    (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
  hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
					  hsa_region_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_queue_create_fn)
    (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
     void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
     void *data, uint32_t private_segment_size,
     uint32_t group_segment_size, hsa_queue_t **queue);
  hsa_status_t (*hsa_agent_iterate_regions_fn)
    (hsa_agent_t agent,
     hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
  hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
  hsa_status_t (*hsa_executable_create_fn)
    (hsa_profile_t profile, hsa_executable_state_t executable_state,
     const char *options, hsa_executable_t *executable);
  hsa_status_t (*hsa_executable_global_variable_define_fn)
    (hsa_executable_t executable, const char *variable_name, void *address);
  hsa_status_t (*hsa_executable_load_code_object_fn)
    (hsa_executable_t executable, hsa_agent_t agent,
     hsa_code_object_t code_object, const char *options);
  hsa_status_t (*hsa_executable_freeze_fn)(hsa_executable_t executable,
					   const char *options);
  hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
					uint32_t num_consumers,
					const hsa_agent_t *consumers,
					hsa_signal_t *signal);
  hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
					  void **ptr);
  hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
					      hsa_access_permission_t access);
  hsa_status_t (*hsa_memory_copy_fn)(void *dst, const void *src, size_t size);
  hsa_status_t (*hsa_memory_free_fn) (void *ptr);
  hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_executable_get_symbol_fn)
    (hsa_executable_t executable, const char *module_name,
     const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
     hsa_executable_symbol_t *symbol);
  hsa_status_t (*hsa_executable_symbol_get_info_fn)
    (hsa_executable_symbol_t executable_symbol,
     hsa_executable_symbol_info_t attribute, void *value);
  hsa_status_t (*hsa_executable_iterate_symbols_fn)
    (hsa_executable_t executable,
     hsa_status_t (*callback)(hsa_executable_t executable,
			      hsa_executable_symbol_t symbol, void *data),
     void *data);
  uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
						    uint64_t value);
  uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
  void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
    (hsa_signal_t signal, hsa_signal_condition_t condition,
     hsa_signal_value_t compare_value, uint64_t timeout_hint,
     hsa_wait_state_t wait_state_hint);
  hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);

  hsa_status_t (*hsa_code_object_deserialize_fn)
    (void *serialized_code_object, size_t serialized_code_object_size,
     const char *options, hsa_code_object_t *code_object);
  /* AMD vendor extensions (hsa_ext_amd.h).  */
  hsa_status_t (*hsa_amd_memory_lock_fn)
    (void *host_ptr, size_t size, hsa_agent_t *agents, int num_agent,
     void **agent_ptr);
  hsa_status_t (*hsa_amd_memory_unlock_fn) (void *host_ptr);
  hsa_status_t (*hsa_amd_memory_async_copy_rect_fn)
    (const hsa_pitched_ptr_t *dst, const hsa_dim3_t *dst_offset,
     const hsa_pitched_ptr_t *src, const hsa_dim3_t *src_offset,
     const hsa_dim3_t *range, hsa_agent_t copy_agent,
     hsa_amd_copy_direction_t dir, uint32_t num_dep_signals,
     const hsa_signal_t *dep_signals, hsa_signal_t completion_signal);
};

/* Structure describing the run-time and grid properties of an HSA kernel
   launch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */

struct GOMP_kernel_launch_attributes
{
  /* Number of dimensions the workload has.  Maximum number is 3.  */
  uint32_t ndim;
  /* Size of the grid in the three respective dimensions.  */
  uint32_t gdims[3];
  /* Size of work-groups in the respective dimensions.  */
  uint32_t wdims[3];
};

/* Collection of information needed for a dispatch of a kernel from a
   kernel.
*/

struct kernel_dispatch
{
  /* The agent on which the kernel is dispatched.  */
  struct agent_info *agent;
  /* Pointer to a command queue associated with a kernel dispatch agent.  */
  void *queue;
  /* Pointer to a memory space used for kernel arguments passing.  */
  void *kernarg_address;
  /* Kernel object.  */
  uint64_t object;
  /* Synchronization signal used for dispatch synchronization.  */
  uint64_t signal;
  /* Private segment size.  */
  uint32_t private_segment_size;
  /* Group segment size.  */
  uint32_t group_segment_size;
};

/* Structure of the kernargs segment, supporting console output.

   This needs to match the definitions in Newlib, and the expectations
   in libgomp target code.  */

struct kernargs {
  struct kernargs_abi abi;

  /* Output data.  */
  struct output output_data;
};

/* A queue entry for a future asynchronous launch.  */

struct kernel_launch
{
  struct kernel_info *kernel;
  void *vars;
  struct GOMP_kernel_launch_attributes kla;
};

/* A queue entry for a future callback.  */

struct callback
{
  void (*fn)(void *);
  void *data;
};

/* A data struct for the copy_data callback.  */

struct copy_data
{
  void *dst;
  const void *src;
  size_t len;
  struct goacc_asyncqueue *aq;
};

/* A queue entry for a placeholder.  These correspond to a wait event.  */

struct placeholder
{
  int executed;
  pthread_cond_t cond;
  pthread_mutex_t mutex;
};

/* A queue entry for a wait directive.  */

struct asyncwait_info
{
  struct placeholder *placeholderp;
};

/* Encode the type of an entry in an async queue.  */

enum entry_type
{
  KERNEL_LAUNCH,
  CALLBACK,
  ASYNC_WAIT,
  ASYNC_PLACEHOLDER
};

/* An entry in an async queue.  */

struct queue_entry
{
  enum entry_type type;
  union {
    struct kernel_launch launch;
    struct callback callback;
    struct asyncwait_info asyncwait;
    struct placeholder placeholder;
  } u;
};

/* An async queue header.

   OpenMP may create one of these.
   OpenACC may create many.  */

struct goacc_asyncqueue
{
  struct agent_info *agent;
  hsa_queue_t *hsa_queue;

  pthread_t thread_drain_queue;
  pthread_mutex_t mutex;
  pthread_cond_t queue_cond_in;
  pthread_cond_t queue_cond_out;
  /* Circular buffer of pending entries; queue_first indexes the oldest,
     queue_n counts how many are live.  */
  struct queue_entry queue[ASYNC_QUEUE_SIZE];
  int queue_first;
  int queue_n;
  int drain_queue_stop;

  int id;
  struct goacc_asyncqueue *prev;
  struct goacc_asyncqueue *next;
};

/* Mkoffload uses this structure to describe a kernel.

   OpenMP kernel dimensions are passed at runtime.
   OpenACC kernel dimensions are passed at compile time, here.
*/

struct hsa_kernel_description
{
  const char *name;
  int oacc_dims[3];  /* Only present for GCN kernels.  */
  int sgpr_count;
  int vpgr_count;
};

/* Mkoffload uses this structure to describe an offload variable.  */

struct global_var_info
{
  const char *name;
  void *address;
};

/* Mkoffload uses this structure to describe all the kernels in a
   loadable module.  These are passed the libgomp via static constructors.  */

struct gcn_image_desc
{
  struct gcn_image {
    size_t size;
    void *image;
  } *gcn_image;
  const unsigned kernel_count;
  struct hsa_kernel_description *kernel_infos;
  const unsigned ind_func_count;
  const unsigned global_variable_count;
};

/* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
   support.
   See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table */

typedef enum {
  EF_AMDGPU_MACH_UNSUPPORTED = -1,
  EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
  EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
  EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
  EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030,
  EF_AMDGPU_MACH_AMDGCN_GFX90a = 0x03f,
  EF_AMDGPU_MACH_AMDGCN_GFX90c = 0x032,
  EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036,
  EF_AMDGPU_MACH_AMDGCN_GFX1036 = 0x045,
  EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041,
  EF_AMDGPU_MACH_AMDGCN_GFX1103 = 0x044
} EF_AMDGPU_MACH;

/* Mask extracting the EF_AMDGPU_MACH field from ELF e_flags.  */
const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
typedef EF_AMDGPU_MACH gcn_isa;

/* Description of an HSA GPU agent (device) and the program associated with
   it.  */

struct agent_info
{
  /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
  hsa_agent_t id;
  /* The user-visible device number.  */
  int device_id;
  /* Whether the agent has been initialized.  The fields below are usable only
     if it has been.  */
  bool initialized;

  /* The instruction set architecture of the device.  */
  gcn_isa device_isa;
  /* Name of the agent.  */
  char name[64];
  /* Name of the vendor of the agent.  */
  char vendor_name[64];
  /* Command queues of the agent.  */
  hsa_queue_t *sync_queue;
  struct goacc_asyncqueue *async_queues, *omp_async_queue;
  pthread_mutex_t async_queues_mutex;

  /* The HSA memory region from which to allocate kernel arguments.  */
  hsa_region_t kernarg_region;

  /* The HSA memory region from which to allocate device data.  */
  hsa_region_t data_region;

  /* Allocated ephemeral memories (team arena and stack space).  */
  struct ephemeral_memories_list *ephemeral_memories_list;
  pthread_mutex_t ephemeral_memories_write_lock;

  /* Read-write lock that protects kernels which are running or about to be run
     from interference with loading and unloading of images.  Needs to be
     locked for reading while a kernel is being run, and for writing if the
     list of modules is manipulated (and thus the HSA program invalidated).  */
  pthread_rwlock_t module_rwlock;

  /* The module associated with this kernel.  */
  struct module_info *module;

  /* Mutex enforcing that only one thread will finalize the HSA program.  A
     thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t prog_mutex;
  /* Flag whether the HSA program that consists of all the modules has been
     finalized.  */
  bool prog_finalized;
  /* HSA executable - the finalized program that is used to locate kernels.  */
  hsa_executable_t executable;
};

/* Information required to identify, finalize and run any given kernel.  */

enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};

struct kernel_info
{
  /* Name of the kernel, required to locate it within the GCN object-code
     module.  */
  const char *name;
  /* The specific agent the kernel has been or will be finalized for and run
     on.  */
  struct agent_info *agent;
  /* The specific module where the kernel takes place.  */
  struct module_info *module;
  /* Information provided by mkoffload associated with the kernel.  */
  struct hsa_kernel_description *description;
  /* Mutex enforcing that at most once thread ever initializes a kernel for
     use.  A thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t init_mutex;
  /* Flag indicating whether the kernel has been initialized and all fields
     below it contain valid data.  */
  bool initialized;
  /* Flag indicating that the kernel has a problem that blocks an execution.  */
  bool initialization_failed;
  /* The object to be put into the dispatch queue.  */
  uint64_t object;
  /* Required size of kernel arguments.  */
  uint32_t kernarg_segment_size;
  /* Required size of group segment.  */
  uint32_t group_segment_size;
  /* Required size of private segment.  */
  uint32_t private_segment_size;
  /* Set up for OpenMP or OpenACC?  */
  enum offload_kind kind;
};

/* Information about a particular GCN module, its image and kernels.  */

struct module_info
{
  /* The description with which the program has registered the image.  */
  struct gcn_image_desc *image_desc;
  /* GCN heap allocation.  */
  struct heap *heap;
  /* Physical boundaries of the loaded module.  */
  Elf64_Addr phys_address_start;
  Elf64_Addr phys_address_end;

  bool constructors_run_p;
  struct kernel_info *init_array_func, *fini_array_func;

  /* Number of kernels in this module.  */
  int kernel_count;
  /* An array of kernel_info structures describing each kernel in this
     module.  */
  struct kernel_info kernels[];
};

/* A linked list of memory arenas allocated on the device.
   These are used by OpenMP, as a means to optimize per-team malloc,
   and for host-accessible stack space.  */

struct ephemeral_memories_list
{
  struct ephemeral_memories_list *next;

  /* The size is determined by the number of teams and threads.  */
  size_t size;
  /* The device address allocated memory.  */
  void *address;
  /* A flag to prevent two asynchronous kernels trying to use the same memory.
     The mutex is locked until the kernel exits.  */
  pthread_mutex_t in_use;
};

/* Information about the whole HSA environment and all of its agents.  */

struct hsa_context_info
{
  /* Whether the structure has been initialized.  */
  bool initialized;
  /* Number of usable GPU HSA agents in the system.  */
  int agent_count;
  /* Array of agent_info structures describing the individual HSA agents.  */
  struct agent_info *agents;
  /* Driver version string.  */
  char driver_version_s[30];
};

/* }}}  */
/* {{{ Global variables  */

/* Information about the whole HSA environment and all of its agents.  */

static struct hsa_context_info hsa_context;

/* HSA runtime functions that are initialized in init_hsa_context.
*/ 555 1.1 mrg 556 1.1 mrg static struct hsa_runtime_fn_info hsa_fns; 557 1.1 mrg 558 1.1 mrg /* Heap space, allocated target-side, provided for use of newlib malloc. 559 1.1 mrg Each module should have it's own heap allocated. 560 1.1 mrg Beware that heap usage increases with OpenMP teams. See also arenas. */ 561 1.1 mrg 562 1.1 mrg static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE; 563 1.1 mrg 564 1.1.1.3 mrg /* Ephemeral memory sizes for each kernel launch. */ 565 1.1.1.3 mrg 566 1.1.1.3 mrg static int team_arena_size = DEFAULT_TEAM_ARENA_SIZE; 567 1.1.1.3 mrg static int stack_size = DEFAULT_GCN_STACK_SIZE; 568 1.1.1.3 mrg static int lowlat_size = -1; 569 1.1.1.3 mrg 570 1.1 mrg /* Flag to decide whether print to stderr information about what is going on. 571 1.1 mrg Set in init_debug depending on environment variables. */ 572 1.1 mrg 573 1.1 mrg static bool debug; 574 1.1 mrg 575 1.1 mrg /* Flag to decide if the runtime should suppress a possible fallback to host 576 1.1 mrg execution. */ 577 1.1 mrg 578 1.1 mrg static bool suppress_host_fallback; 579 1.1 mrg 580 1.1 mrg /* Flag to locate HSA runtime shared library that is dlopened 581 1.1 mrg by this plug-in. */ 582 1.1 mrg 583 1.1 mrg static const char *hsa_runtime_lib; 584 1.1 mrg 585 1.1 mrg /* Flag to decide if the runtime should support also CPU devices (can be 586 1.1 mrg a simulator). */ 587 1.1 mrg 588 1.1 mrg static bool support_cpu_devices; 589 1.1 mrg 590 1.1 mrg /* Runtime dimension overrides. Zero indicates default. */ 591 1.1 mrg 592 1.1 mrg static int override_x_dim = 0; 593 1.1 mrg static int override_z_dim = 0; 594 1.1 mrg 595 1.1 mrg /* }}} */ 596 1.1 mrg /* {{{ Debug & Diagnostic */ 597 1.1 mrg 598 1.1 mrg /* Print a message to stderr if GCN_DEBUG value is set to true. */ 599 1.1 mrg 600 1.1 mrg #define DEBUG_PRINT(...) 
\ 601 1.1 mrg do \ 602 1.1 mrg { \ 603 1.1 mrg if (debug) \ 604 1.1 mrg { \ 605 1.1 mrg fprintf (stderr, __VA_ARGS__); \ 606 1.1 mrg } \ 607 1.1 mrg } \ 608 1.1 mrg while (false); 609 1.1 mrg 610 1.1 mrg /* Flush stderr if GCN_DEBUG value is set to true. */ 611 1.1 mrg 612 1.1 mrg #define DEBUG_FLUSH() \ 613 1.1 mrg do { \ 614 1.1 mrg if (debug) \ 615 1.1 mrg fflush (stderr); \ 616 1.1 mrg } while (false) 617 1.1 mrg 618 1.1 mrg /* Print a logging message with PREFIX to stderr if GCN_DEBUG value 619 1.1 mrg is set to true. */ 620 1.1 mrg 621 1.1 mrg #define DEBUG_LOG(prefix, ...) \ 622 1.1 mrg do \ 623 1.1 mrg { \ 624 1.1 mrg DEBUG_PRINT (prefix); \ 625 1.1 mrg DEBUG_PRINT (__VA_ARGS__); \ 626 1.1 mrg DEBUG_FLUSH (); \ 627 1.1 mrg } while (false) 628 1.1 mrg 629 1.1 mrg /* Print a debugging message to stderr. */ 630 1.1 mrg 631 1.1 mrg #define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__) 632 1.1 mrg 633 1.1 mrg /* Print a warning message to stderr. */ 634 1.1 mrg 635 1.1 mrg #define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__) 636 1.1 mrg 637 1.1 mrg /* Print HSA warning STR with an HSA STATUS code. */ 638 1.1 mrg 639 1.1 mrg static void 640 1.1 mrg hsa_warn (const char *str, hsa_status_t status) 641 1.1 mrg { 642 1.1 mrg if (!debug) 643 1.1 mrg return; 644 1.1 mrg 645 1.1 mrg const char *hsa_error_msg = "[unknown]"; 646 1.1 mrg hsa_fns.hsa_status_string_fn (status, &hsa_error_msg); 647 1.1 mrg 648 1.1 mrg fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str, 649 1.1 mrg hsa_error_msg); 650 1.1 mrg } 651 1.1 mrg 652 1.1 mrg /* Report a fatal error STR together with the HSA error corresponding to STATUS 653 1.1 mrg and terminate execution of the current process. 
*/ 654 1.1 mrg 655 1.1 mrg static void 656 1.1 mrg hsa_fatal (const char *str, hsa_status_t status) 657 1.1 mrg { 658 1.1 mrg const char *hsa_error_msg = "[unknown]"; 659 1.1 mrg hsa_fns.hsa_status_string_fn (status, &hsa_error_msg); 660 1.1 mrg GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str, 661 1.1 mrg hsa_error_msg); 662 1.1 mrg } 663 1.1 mrg 664 1.1 mrg /* Like hsa_fatal, except only report error message, and return FALSE 665 1.1 mrg for propagating error processing to outside of plugin. */ 666 1.1 mrg 667 1.1 mrg static bool 668 1.1 mrg hsa_error (const char *str, hsa_status_t status) 669 1.1 mrg { 670 1.1 mrg const char *hsa_error_msg = "[unknown]"; 671 1.1 mrg hsa_fns.hsa_status_string_fn (status, &hsa_error_msg); 672 1.1 mrg GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str, 673 1.1 mrg hsa_error_msg); 674 1.1 mrg return false; 675 1.1 mrg } 676 1.1 mrg 677 1.1 mrg /* Dump information about the available hardware. */ 678 1.1 mrg 679 1.1 mrg static void 680 1.1 mrg dump_hsa_system_info (void) 681 1.1 mrg { 682 1.1 mrg hsa_status_t status; 683 1.1 mrg 684 1.1 mrg hsa_endianness_t endianness; 685 1.1 mrg status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS, 686 1.1 mrg &endianness); 687 1.1 mrg if (status == HSA_STATUS_SUCCESS) 688 1.1 mrg switch (endianness) 689 1.1 mrg { 690 1.1 mrg case HSA_ENDIANNESS_LITTLE: 691 1.1 mrg GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n"); 692 1.1 mrg break; 693 1.1 mrg case HSA_ENDIANNESS_BIG: 694 1.1 mrg GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n"); 695 1.1 mrg break; 696 1.1 mrg default: 697 1.1 mrg GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n"); 698 1.1 mrg } 699 1.1 mrg else 700 1.1 mrg GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n"); 701 1.1 mrg 702 1.1 mrg uint8_t extensions[128]; 703 1.1 mrg status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS, 704 1.1 mrg &extensions); 705 1.1 mrg if (status == HSA_STATUS_SUCCESS) 706 1.1 mrg { 707 
1.1 mrg if (extensions[0] & (1 << HSA_EXTENSION_IMAGES)) 708 1.1 mrg GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n"); 709 1.1 mrg } 710 1.1 mrg else 711 1.1 mrg GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n"); 712 1.1 mrg } 713 1.1 mrg 714 1.1 mrg /* Dump information about the available hardware. */ 715 1.1 mrg 716 1.1 mrg static void 717 1.1 mrg dump_machine_model (hsa_machine_model_t machine_model, const char *s) 718 1.1 mrg { 719 1.1 mrg switch (machine_model) 720 1.1 mrg { 721 1.1 mrg case HSA_MACHINE_MODEL_SMALL: 722 1.1 mrg GCN_DEBUG ("%s: SMALL\n", s); 723 1.1 mrg break; 724 1.1 mrg case HSA_MACHINE_MODEL_LARGE: 725 1.1 mrg GCN_DEBUG ("%s: LARGE\n", s); 726 1.1 mrg break; 727 1.1 mrg default: 728 1.1 mrg GCN_WARNING ("%s: UNKNOWN\n", s); 729 1.1 mrg break; 730 1.1 mrg } 731 1.1 mrg } 732 1.1 mrg 733 1.1 mrg /* Dump information about the available hardware. */ 734 1.1 mrg 735 1.1 mrg static void 736 1.1 mrg dump_profile (hsa_profile_t profile, const char *s) 737 1.1 mrg { 738 1.1 mrg switch (profile) 739 1.1 mrg { 740 1.1 mrg case HSA_PROFILE_FULL: 741 1.1 mrg GCN_DEBUG ("%s: FULL\n", s); 742 1.1 mrg break; 743 1.1 mrg case HSA_PROFILE_BASE: 744 1.1 mrg GCN_DEBUG ("%s: BASE\n", s); 745 1.1 mrg break; 746 1.1 mrg default: 747 1.1 mrg GCN_WARNING ("%s: UNKNOWN\n", s); 748 1.1 mrg break; 749 1.1 mrg } 750 1.1 mrg } 751 1.1 mrg 752 1.1 mrg /* Dump information about a device memory region. 
*/

static hsa_status_t
dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
{
  hsa_status_t status;

  /* Track whether the segment query succeeded; previously SEGMENT was
     compared below even when the query failed, reading an uninitialized
     variable (undefined behavior).  */
  hsa_region_segment_t segment;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  bool have_segment = (status == HSA_STATUS_SUCCESS);
  if (have_segment)
    {
      if (segment == HSA_REGION_SEGMENT_GLOBAL)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
      else if (segment == HSA_REGION_SEGMENT_READONLY)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
      else if (segment == HSA_REGION_SEGMENT_PRIVATE)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
      else if (segment == HSA_REGION_SEGMENT_GROUP)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
      else
	GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
    }
  else
    GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");

  if (have_segment && segment == HSA_REGION_SEGMENT_GLOBAL)
    {
      uint32_t flags;
      status
	= hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					  &flags);
      if (status == HSA_STATUS_SUCCESS)
	{
	  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
	}
      else
	GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
    }

  size_t size;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");

  status
    = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");

  bool alloc_allowed;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
				      &alloc_allowed);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");

  /* ALLOC_ALLOWED is only valid when the query above succeeded; the
     short-circuit keeps it from being read otherwise.  */
  if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
    return HSA_STATUS_SUCCESS;

  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");

  size_t align;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
				      &align);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");

  return HSA_STATUS_SUCCESS;
}

/* Dump information about all the device memory regions.
*/ 848 1.1 mrg 849 1.1 mrg static void 850 1.1 mrg dump_hsa_regions (hsa_agent_t agent) 851 1.1 mrg { 852 1.1 mrg hsa_status_t status; 853 1.1 mrg status = hsa_fns.hsa_agent_iterate_regions_fn (agent, 854 1.1 mrg dump_hsa_region, 855 1.1 mrg NULL); 856 1.1 mrg if (status != HSA_STATUS_SUCCESS) 857 1.1 mrg hsa_error ("Dumping hsa regions failed", status); 858 1.1 mrg } 859 1.1 mrg 860 1.1 mrg /* Dump information about the available devices. */ 861 1.1 mrg 862 1.1 mrg static hsa_status_t 863 1.1 mrg dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused))) 864 1.1 mrg { 865 1.1 mrg hsa_status_t status; 866 1.1 mrg 867 1.1 mrg char buf[64]; 868 1.1 mrg status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME, 869 1.1 mrg &buf); 870 1.1 mrg if (status == HSA_STATUS_SUCCESS) 871 1.1 mrg GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf); 872 1.1 mrg else 873 1.1 mrg GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n"); 874 1.1 mrg 875 1.1 mrg status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME, 876 1.1 mrg &buf); 877 1.1 mrg if (status == HSA_STATUS_SUCCESS) 878 1.1 mrg GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf); 879 1.1 mrg else 880 1.1 mrg GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n"); 881 1.1 mrg 882 1.1 mrg hsa_machine_model_t machine_model; 883 1.1 mrg status 884 1.1 mrg = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL, 885 1.1 mrg &machine_model); 886 1.1 mrg if (status == HSA_STATUS_SUCCESS) 887 1.1 mrg dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL"); 888 1.1 mrg else 889 1.1 mrg GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n"); 890 1.1 mrg 891 1.1 mrg hsa_profile_t profile; 892 1.1 mrg status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE, 893 1.1 mrg &profile); 894 1.1 mrg if (status == HSA_STATUS_SUCCESS) 895 1.1 mrg dump_profile (profile, "HSA_AGENT_INFO_PROFILE"); 896 1.1 mrg else 897 1.1 mrg GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n"); 898 
1.1 mrg 899 1.1 mrg hsa_device_type_t device_type; 900 1.1 mrg status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE, 901 1.1 mrg &device_type); 902 1.1 mrg if (status == HSA_STATUS_SUCCESS) 903 1.1 mrg { 904 1.1 mrg switch (device_type) 905 1.1 mrg { 906 1.1 mrg case HSA_DEVICE_TYPE_CPU: 907 1.1 mrg GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n"); 908 1.1 mrg break; 909 1.1 mrg case HSA_DEVICE_TYPE_GPU: 910 1.1 mrg GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n"); 911 1.1 mrg break; 912 1.1 mrg case HSA_DEVICE_TYPE_DSP: 913 1.1 mrg GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n"); 914 1.1 mrg break; 915 1.1 mrg default: 916 1.1 mrg GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n"); 917 1.1 mrg break; 918 1.1 mrg } 919 1.1 mrg } 920 1.1 mrg else 921 1.1 mrg GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n"); 922 1.1 mrg 923 1.1 mrg uint32_t cu_count; 924 1.1 mrg status = hsa_fns.hsa_agent_get_info_fn 925 1.1 mrg (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count); 926 1.1 mrg if (status == HSA_STATUS_SUCCESS) 927 1.1 mrg GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count); 928 1.1 mrg else 929 1.1 mrg GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n"); 930 1.1 mrg 931 1.1 mrg uint32_t size; 932 1.1 mrg status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, 933 1.1 mrg &size); 934 1.1 mrg if (status == HSA_STATUS_SUCCESS) 935 1.1 mrg GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size); 936 1.1 mrg else 937 1.1 mrg GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n"); 938 1.1 mrg 939 1.1 mrg uint32_t max_dim; 940 1.1 mrg status = hsa_fns.hsa_agent_get_info_fn (agent, 941 1.1 mrg HSA_AGENT_INFO_WORKGROUP_MAX_DIM, 942 1.1 mrg &max_dim); 943 1.1 mrg if (status == HSA_STATUS_SUCCESS) 944 1.1 mrg GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim); 945 1.1 mrg else 946 1.1 mrg GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n"); 947 1.1 mrg 948 1.1 mrg uint32_t max_size; 949 1.1 mrg status = 
hsa_fns.hsa_agent_get_info_fn (agent, 950 1.1 mrg HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, 951 1.1 mrg &max_size); 952 1.1 mrg if (status == HSA_STATUS_SUCCESS) 953 1.1 mrg GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size); 954 1.1 mrg else 955 1.1 mrg GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n"); 956 1.1 mrg 957 1.1 mrg uint32_t grid_max_dim; 958 1.1 mrg status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM, 959 1.1 mrg &grid_max_dim); 960 1.1 mrg if (status == HSA_STATUS_SUCCESS) 961 1.1 mrg GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim); 962 1.1 mrg else 963 1.1 mrg GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n"); 964 1.1 mrg 965 1.1 mrg uint32_t grid_max_size; 966 1.1 mrg status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE, 967 1.1 mrg &grid_max_size); 968 1.1 mrg if (status == HSA_STATUS_SUCCESS) 969 1.1 mrg GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size); 970 1.1 mrg else 971 1.1 mrg GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n"); 972 1.1 mrg 973 1.1 mrg dump_hsa_regions (agent); 974 1.1 mrg 975 1.1 mrg return HSA_STATUS_SUCCESS; 976 1.1 mrg } 977 1.1 mrg 978 1.1 mrg /* Forward reference. */ 979 1.1 mrg 980 1.1 mrg static char *get_executable_symbol_name (hsa_executable_symbol_t symbol); 981 1.1 mrg 982 1.1 mrg /* Helper function for dump_executable_symbols. */ 983 1.1 mrg 984 1.1 mrg static hsa_status_t 985 1.1 mrg dump_executable_symbol (hsa_executable_t executable, 986 1.1 mrg hsa_executable_symbol_t symbol, 987 1.1 mrg void *data __attribute__((unused))) 988 1.1 mrg { 989 1.1 mrg char *name = get_executable_symbol_name (symbol); 990 1.1 mrg 991 1.1 mrg if (name) 992 1.1 mrg { 993 1.1 mrg GCN_DEBUG ("executable symbol: %s\n", name); 994 1.1 mrg free (name); 995 1.1 mrg } 996 1.1 mrg 997 1.1 mrg return HSA_STATUS_SUCCESS; 998 1.1 mrg } 999 1.1 mrg 1000 1.1 mrg /* Dump all global symbol in an executable. 
*/ 1001 1.1 mrg 1002 1.1 mrg static void 1003 1.1 mrg dump_executable_symbols (hsa_executable_t executable) 1004 1.1 mrg { 1005 1.1 mrg hsa_status_t status; 1006 1.1 mrg status 1007 1.1 mrg = hsa_fns.hsa_executable_iterate_symbols_fn (executable, 1008 1.1 mrg dump_executable_symbol, 1009 1.1 mrg NULL); 1010 1.1 mrg if (status != HSA_STATUS_SUCCESS) 1011 1.1 mrg hsa_fatal ("Could not dump HSA executable symbols", status); 1012 1.1 mrg } 1013 1.1 mrg 1014 1.1 mrg /* Dump kernel DISPATCH data structure and indent it by INDENT spaces. */ 1015 1.1 mrg 1016 1.1 mrg static void 1017 1.1 mrg print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent) 1018 1.1 mrg { 1019 1.1 mrg struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address; 1020 1.1 mrg 1021 1.1 mrg fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch); 1022 1.1 mrg fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue); 1023 1.1 mrg fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs); 1024 1.1 mrg fprintf (stderr, "%*sheap address: %p\n", indent, "", 1025 1.1.1.3 mrg (void*)kernargs->abi.heap_ptr); 1026 1.1.1.3 mrg fprintf (stderr, "%*sarena address: %p (%d bytes per workgroup)\n", indent, 1027 1.1.1.3 mrg "", (void*)kernargs->abi.arena_ptr, 1028 1.1.1.3 mrg kernargs->abi.arena_size_per_team); 1029 1.1.1.3 mrg fprintf (stderr, "%*sstack address: %p (%d bytes per wavefront)\n", indent, 1030 1.1.1.3 mrg "", (void*)kernargs->abi.stack_ptr, 1031 1.1.1.3 mrg kernargs->abi.stack_size_per_thread); 1032 1.1 mrg fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object); 1033 1.1 mrg fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "", 1034 1.1 mrg dispatch->private_segment_size); 1035 1.1.1.3 mrg fprintf (stderr, "%*sgroup_segment_size: %u (low-latency pool)\n", indent, 1036 1.1.1.3 mrg "", dispatch->group_segment_size); 1037 1.1 mrg fprintf (stderr, "\n"); 1038 1.1 mrg } 1039 1.1 mrg 1040 1.1 mrg /* }}} */ 1041 1.1 mrg /* {{{ Utility functions */ 
1042 1.1 mrg 1043 1.1 mrg /* Cast the thread local storage to gcn_thread. */ 1044 1.1 mrg 1045 1.1 mrg static inline struct gcn_thread * 1046 1.1 mrg gcn_thread (void) 1047 1.1 mrg { 1048 1.1 mrg return (struct gcn_thread *) GOMP_PLUGIN_acc_thread (); 1049 1.1 mrg } 1050 1.1 mrg 1051 1.1 mrg /* Initialize debug and suppress_host_fallback according to the environment. */ 1052 1.1 mrg 1053 1.1 mrg static void 1054 1.1 mrg init_environment_variables (void) 1055 1.1 mrg { 1056 1.1 mrg if (secure_getenv ("GCN_DEBUG")) 1057 1.1 mrg debug = true; 1058 1.1 mrg else 1059 1.1 mrg debug = false; 1060 1.1 mrg 1061 1.1 mrg if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK")) 1062 1.1 mrg suppress_host_fallback = true; 1063 1.1 mrg else 1064 1.1 mrg suppress_host_fallback = false; 1065 1.1 mrg 1066 1.1 mrg hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB"); 1067 1.1 mrg if (hsa_runtime_lib == NULL) 1068 1.1.1.2 mrg hsa_runtime_lib = "libhsa-runtime64.so.1"; 1069 1.1 mrg 1070 1.1 mrg support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES"); 1071 1.1 mrg 1072 1.1 mrg const char *x = secure_getenv ("GCN_NUM_TEAMS"); 1073 1.1 mrg if (!x) 1074 1.1 mrg x = secure_getenv ("GCN_NUM_GANGS"); 1075 1.1 mrg if (x) 1076 1.1 mrg override_x_dim = atoi (x); 1077 1.1 mrg 1078 1.1 mrg const char *z = secure_getenv ("GCN_NUM_THREADS"); 1079 1.1 mrg if (!z) 1080 1.1 mrg z = secure_getenv ("GCN_NUM_WORKERS"); 1081 1.1 mrg if (z) 1082 1.1 mrg override_z_dim = atoi (z); 1083 1.1 mrg 1084 1.1 mrg const char *heap = secure_getenv ("GCN_HEAP_SIZE"); 1085 1.1 mrg if (heap) 1086 1.1 mrg { 1087 1.1 mrg size_t tmp = atol (heap); 1088 1.1 mrg if (tmp) 1089 1.1 mrg gcn_kernel_heap_size = tmp; 1090 1.1 mrg } 1091 1.1.1.3 mrg 1092 1.1.1.3 mrg const char *arena = secure_getenv ("GCN_TEAM_ARENA_SIZE"); 1093 1.1.1.3 mrg if (arena) 1094 1.1.1.3 mrg { 1095 1.1.1.3 mrg int tmp = atoi (arena); 1096 1.1.1.3 mrg if (tmp) 1097 1.1.1.3 mrg team_arena_size = tmp;; 1098 1.1.1.3 mrg } 1099 1.1.1.3 mrg 1100 1.1.1.3 mrg 
const char *stack = secure_getenv ("GCN_STACK_SIZE"); 1101 1.1.1.3 mrg if (stack) 1102 1.1.1.3 mrg { 1103 1.1.1.3 mrg int tmp = atoi (stack); 1104 1.1.1.3 mrg if (tmp) 1105 1.1.1.3 mrg stack_size = tmp;; 1106 1.1.1.3 mrg } 1107 1.1.1.3 mrg 1108 1.1.1.3 mrg const char *lowlat = secure_getenv ("GOMP_GCN_LOWLAT_POOL"); 1109 1.1.1.3 mrg if (lowlat) 1110 1.1.1.3 mrg lowlat_size = atoi (lowlat); 1111 1.1 mrg } 1112 1.1 mrg 1113 1.1 mrg /* Return malloc'd string with name of SYMBOL. */ 1114 1.1 mrg 1115 1.1 mrg static char * 1116 1.1 mrg get_executable_symbol_name (hsa_executable_symbol_t symbol) 1117 1.1 mrg { 1118 1.1 mrg hsa_status_t status; 1119 1.1 mrg char *res; 1120 1.1 mrg uint32_t len; 1121 1.1 mrg const hsa_executable_symbol_info_t info_name_length 1122 1.1 mrg = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH; 1123 1.1 mrg 1124 1.1 mrg status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length, 1125 1.1 mrg &len); 1126 1.1 mrg if (status != HSA_STATUS_SUCCESS) 1127 1.1 mrg { 1128 1.1 mrg hsa_error ("Could not get length of symbol name", status); 1129 1.1 mrg return NULL; 1130 1.1 mrg } 1131 1.1 mrg 1132 1.1 mrg res = GOMP_PLUGIN_malloc (len + 1); 1133 1.1 mrg 1134 1.1 mrg const hsa_executable_symbol_info_t info_name 1135 1.1 mrg = HSA_EXECUTABLE_SYMBOL_INFO_NAME; 1136 1.1 mrg 1137 1.1 mrg status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res); 1138 1.1 mrg 1139 1.1 mrg if (status != HSA_STATUS_SUCCESS) 1140 1.1 mrg { 1141 1.1 mrg hsa_error ("Could not get symbol name", status); 1142 1.1 mrg free (res); 1143 1.1 mrg return NULL; 1144 1.1 mrg } 1145 1.1 mrg 1146 1.1 mrg res[len] = '\0'; 1147 1.1 mrg 1148 1.1 mrg return res; 1149 1.1 mrg } 1150 1.1 mrg 1151 1.1 mrg /* Get the number of GPU Compute Units. 
*/ 1152 1.1 mrg 1153 1.1 mrg static int 1154 1.1 mrg get_cu_count (struct agent_info *agent) 1155 1.1 mrg { 1156 1.1 mrg uint32_t cu_count; 1157 1.1 mrg hsa_status_t status = hsa_fns.hsa_agent_get_info_fn 1158 1.1 mrg (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count); 1159 1.1 mrg if (status == HSA_STATUS_SUCCESS) 1160 1.1 mrg return cu_count; 1161 1.1 mrg else 1162 1.1 mrg return 64; /* The usual number for older devices. */ 1163 1.1 mrg } 1164 1.1 mrg 1165 1.1 mrg /* Calculate the maximum grid size for OMP threads / OACC workers. 1166 1.1 mrg This depends on the kernel's resource usage levels. */ 1167 1.1 mrg 1168 1.1 mrg static int 1169 1.1 mrg limit_worker_threads (int threads) 1170 1.1 mrg { 1171 1.1 mrg /* FIXME Do something more inteligent here. 1172 1.1 mrg GCN can always run 4 threads within a Compute Unit, but 1173 1.1 mrg more than that depends on register usage. */ 1174 1.1 mrg if (threads > 16) 1175 1.1 mrg threads = 16; 1176 1.1 mrg return threads; 1177 1.1 mrg } 1178 1.1 mrg 1179 1.1.1.3 mrg /* This sets the maximum number of teams to twice the number of GPU Compute 1180 1.1.1.3 mrg Units to avoid memory waste and corresponding memory access faults. */ 1181 1.1.1.3 mrg 1182 1.1.1.3 mrg static int 1183 1.1.1.3 mrg limit_teams (int teams, struct agent_info *agent) 1184 1.1.1.3 mrg { 1185 1.1.1.3 mrg int max_teams = 2 * get_cu_count (agent); 1186 1.1.1.3 mrg if (teams > max_teams) 1187 1.1.1.3 mrg teams = max_teams; 1188 1.1.1.3 mrg return teams; 1189 1.1.1.3 mrg } 1190 1.1.1.3 mrg 1191 1.1 mrg /* Parse the target attributes INPUT provided by the compiler and return true 1192 1.1 mrg if we should run anything all. If INPUT is NULL, fill DEF with default 1193 1.1 mrg values, then store INPUT or DEF into *RESULT. 1194 1.1 mrg 1195 1.1 mrg This is used for OpenMP only. 
   */

static bool
parse_target_attributes (void **input,
			 struct GOMP_kernel_launch_attributes *def,
			 struct GOMP_kernel_launch_attributes **result,
			 struct agent_info *agent)
{
  if (!input)
    GOMP_PLUGIN_fatal ("No target arguments provided");

  bool grid_attrs_found = false;
  bool gcn_dims_found = false;
  int gcn_teams = 0;
  int gcn_threads = 0;
  /* Walk the NULL-terminated argument vector.  Each entry encodes a
     device mask and an id; the value is either packed into the same word
     or supplied in the following entry.  */
  while (*input)
    {
      intptr_t id = (intptr_t) *input++, val;

      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *input++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;

      /* Clamp to int range; the launch attributes are 32-bit.  */
      val = (val > INT_MAX) ? INT_MAX : val;

      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
	  && ((id & GOMP_TARGET_ARG_ID_MASK)
	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
	{
	  /* An explicit attributes struct follows; *input now points at
	     it (used after the loop).  */
	  grid_attrs_found = true;
	  break;
	}
      else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
	       == GOMP_TARGET_ARG_DEVICE_ALL)
	{
	  gcn_dims_found = true;
	  switch (id & GOMP_TARGET_ARG_ID_MASK)
	    {
	    case GOMP_TARGET_ARG_NUM_TEAMS:
	      gcn_teams = limit_teams (val, agent);
	      break;
	    case GOMP_TARGET_ARG_THREAD_LIMIT:
	      gcn_threads = limit_worker_threads (val);
	      break;
	    default:
	      ;
	    }
	}
    }

  if (gcn_dims_found)
    {
      bool gfx900_workaround_p = false;

      /* Only apply the Vega 10 workaround when neither the compiler nor
	 the environment specified a thread count.  */
      if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
	  && gcn_threads == 0 && override_z_dim == 0)
	{
	  gfx900_workaround_p = true;
	  GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
		       "threads to at most 4 per team.\n");
	  GCN_WARNING (" - If this is not a Vega 10 device, please use "
		       "GCN_NUM_THREADS=16\n");
	}

      /* Ideally, when a dimension isn't explicitly specified, we should
	 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
	 In practice, we tune for peak performance on BabelStream, which
	 for OpenACC is currently 32 threads per CU.  */
      def->ndim = 3;
      if (gcn_teams <= 0 && gcn_threads <= 0)
	{
	  /* Set up a reasonable number of teams and threads.  */
	  gcn_threads = gfx900_workaround_p ? 4 : 16; // 8;
	  def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
	  def->gdims[2] = gcn_threads;
	}
      else if (gcn_teams <= 0 && gcn_threads > 0)
	{
	  /* Auto-scale the number of teams with the number of threads.  */
	  def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
	  def->gdims[2] = gcn_threads;
	}
      else if (gcn_teams > 0 && gcn_threads <= 0)
	{
	  int max_threads = gfx900_workaround_p ? 4 : 16;

	  /* Auto-scale the number of threads with the number of teams.  */
	  def->gdims[0] = gcn_teams;
	  def->gdims[2] = 16; // get_cu_count (agent) * 40 / gcn_teams;
	  if (def->gdims[2] == 0)
	    def->gdims[2] = 1;
	  else if (def->gdims[2] > max_threads)
	    def->gdims[2] = max_threads;
	}
      else
	{
	  def->gdims[0] = gcn_teams;
	  def->gdims[2] = gcn_threads;
	}
      def->gdims[1] = 64; /* Each thread is 64 work items wide.  */
      def->wdims[0] = 1;  /* Single team per work-group.  */
      def->wdims[1] = 64;
      def->wdims[2] = 16;
      *result = def;
      return true;
    }
  else if (!grid_attrs_found)
    {
      /* No launch information at all: run a single work item.  */
      def->ndim = 1;
      def->gdims[0] = 1;
      def->gdims[1] = 1;
      def->gdims[2] = 1;
      def->wdims[0] = 1;
      def->wdims[1] = 1;
      def->wdims[2] = 1;
      *result = def;
      GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
      return true;
    }

  /* Explicit kernel launch attributes were supplied by the compiler.  */
  struct GOMP_kernel_launch_attributes *kla;
  kla = (struct GOMP_kernel_launch_attributes *) *input;
  *result = kla;
  if (kla->ndim == 0 || kla->ndim > 3)
    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);

  GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
  unsigned i;
  for (i = 0; i < kla->ndim; i++)
    {
      GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i,
		 kla->gdims[i], kla->wdims[i]);
      /* A zero-sized grid means there is nothing to run.  */
      if (kla->gdims[i] == 0)
	return false;
    }
  return true;
}

/* Return the group size given the requested GROUP size, GRID size
and number 1335 1.1 mrg of grid dimensions NDIM. */ 1336 1.1 mrg 1337 1.1 mrg static uint32_t 1338 1.1 mrg get_group_size (uint32_t ndim, uint32_t grid, uint32_t group) 1339 1.1 mrg { 1340 1.1 mrg if (group == 0) 1341 1.1 mrg { 1342 1.1 mrg /* TODO: Provide a default via environment or device characteristics. */ 1343 1.1 mrg if (ndim == 1) 1344 1.1 mrg group = 64; 1345 1.1 mrg else if (ndim == 2) 1346 1.1 mrg group = 8; 1347 1.1 mrg else 1348 1.1 mrg group = 4; 1349 1.1 mrg } 1350 1.1 mrg 1351 1.1 mrg if (group > grid) 1352 1.1 mrg group = grid; 1353 1.1 mrg return group; 1354 1.1 mrg } 1355 1.1 mrg 1356 1.1 mrg /* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET. */ 1357 1.1 mrg 1358 1.1 mrg static void 1359 1.1 mrg packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest) 1360 1.1 mrg { 1361 1.1 mrg __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE); 1362 1.1 mrg } 1363 1.1 mrg 1364 1.1 mrg /* A never-called callback for the HSA command queues. These signal events 1365 1.1 mrg that we don't use, so we trigger an error. 1366 1.1 mrg 1367 1.1 mrg This "queue" is not to be confused with the async queues, below. */ 1368 1.1 mrg 1369 1.1 mrg static void 1370 1.1 mrg hsa_queue_callback (hsa_status_t status, 1371 1.1 mrg hsa_queue_t *queue __attribute__ ((unused)), 1372 1.1 mrg void *data __attribute__ ((unused))) 1373 1.1 mrg { 1374 1.1 mrg hsa_fatal ("Asynchronous queue error", status); 1375 1.1 mrg } 1376 1.1 mrg 1377 1.1 mrg /* }}} */ 1378 1.1 mrg /* {{{ HSA initialization */ 1379 1.1 mrg 1380 1.1 mrg /* Populate hsa_fns with the function addresses from libhsa-runtime64.so. 
   */

static bool
init_hsa_runtime_functions (void)
{
  /* Resolve a mandatory entry point; a missing symbol is fatal.  */
#define DLSYM_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function); \
  if (hsa_fns.function##_fn == NULL) \
    GOMP_PLUGIN_fatal ("'%s' is missing '%s'", hsa_runtime_lib, #function);
  /* Resolve an optional entry point; may be NULL on older runtimes, so
     callers must check before use.  */
#define DLSYM_OPT_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function);

  void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
  if (handle == NULL)
    return false;

  DLSYM_FN (hsa_status_string)
  DLSYM_FN (hsa_system_get_info)
  DLSYM_FN (hsa_agent_get_info)
  DLSYM_FN (hsa_init)
  DLSYM_FN (hsa_iterate_agents)
  DLSYM_FN (hsa_region_get_info)
  DLSYM_FN (hsa_queue_create)
  DLSYM_FN (hsa_agent_iterate_regions)
  DLSYM_FN (hsa_executable_destroy)
  DLSYM_FN (hsa_executable_create)
  DLSYM_FN (hsa_executable_global_variable_define)
  DLSYM_FN (hsa_executable_load_code_object)
  DLSYM_FN (hsa_executable_freeze)
  DLSYM_FN (hsa_signal_create)
  DLSYM_FN (hsa_memory_allocate)
  DLSYM_FN (hsa_memory_assign_agent)
  DLSYM_FN (hsa_memory_copy)
  DLSYM_FN (hsa_memory_free)
  DLSYM_FN (hsa_signal_destroy)
  DLSYM_FN (hsa_executable_get_symbol)
  DLSYM_FN (hsa_executable_symbol_get_info)
  DLSYM_FN (hsa_executable_iterate_symbols)
  DLSYM_FN (hsa_queue_add_write_index_release)
  DLSYM_FN (hsa_queue_load_read_index_acquire)
  DLSYM_FN (hsa_signal_wait_acquire)
  DLSYM_FN (hsa_signal_store_relaxed)
  DLSYM_FN (hsa_signal_store_release)
  DLSYM_FN (hsa_signal_load_acquire)
  DLSYM_FN (hsa_queue_destroy)
  DLSYM_FN (hsa_code_object_deserialize)
  DLSYM_OPT_FN (hsa_amd_memory_lock)
  DLSYM_OPT_FN (hsa_amd_memory_unlock)
  DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect)
  return true;
#undef DLSYM_OPT_FN
#undef DLSYM_FN
}

static gcn_isa isa_code (const char *isa);

/* Return true if the agent is a GPU and can accept concurrent submissions
   from different threads.  */

static bool
suitable_hsa_agent_p (hsa_agent_t agent)
{
  hsa_device_type_t device_type;
  hsa_status_t status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
				     &device_type);
  if (status != HSA_STATUS_SUCCESS)
    return false;

  switch (device_type)
    {
    case HSA_DEVICE_TYPE_GPU:
      {
	/* Reject GPUs whose ISA this plugin was not built to support.  */
	char name[64];
	hsa_status_t status
	  = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME, name);
	if (status != HSA_STATUS_SUCCESS
	    || isa_code (name) == EF_AMDGPU_MACH_UNSUPPORTED)
	  {
	    GCN_DEBUG ("Ignoring unsupported agent '%s'\n",
		       status == HSA_STATUS_SUCCESS ? name : "invalid");
	    return false;
	  }
      }
      break;
    case HSA_DEVICE_TYPE_CPU:
      /* CPU agents are only used when explicitly enabled via
	 GCN_SUPPORT_CPU_DEVICES.  */
      if (!support_cpu_devices)
	return false;
      break;
    default:
      return false;
    }

  /* The agent must support kernel dispatch...  */
  uint32_t features = 0;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
					  &features);
  if (status != HSA_STATUS_SUCCESS
      || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
    return false;
  /* ...and multi-producer queues (concurrent submission).  */
  hsa_queue_type_t queue_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
					  &queue_type);
  if (status != HSA_STATUS_SUCCESS
      || (queue_type != HSA_QUEUE_TYPE_MULTI))
    return false;

  return true;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
   agent_count in hsa_context.  */

static hsa_status_t
count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
{
  if (suitable_hsa_agent_p (agent))
    hsa_context.agent_count++;
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
   id to the describing structure in the hsa context.  The index of the
   structure is pointed to by DATA, increment it afterwards.
*/ 1503 1.1 mrg 1504 1.1 mrg static hsa_status_t 1505 1.1 mrg assign_agent_ids (hsa_agent_t agent, void *data) 1506 1.1 mrg { 1507 1.1 mrg if (suitable_hsa_agent_p (agent)) 1508 1.1 mrg { 1509 1.1 mrg int *agent_index = (int *) data; 1510 1.1 mrg hsa_context.agents[*agent_index].id = agent; 1511 1.1 mrg ++*agent_index; 1512 1.1 mrg } 1513 1.1 mrg return HSA_STATUS_SUCCESS; 1514 1.1 mrg } 1515 1.1 mrg 1516 1.1 mrg /* Initialize hsa_context if it has not already been done. 1517 1.1.1.3 mrg If !PROBE: returns TRUE on success. 1518 1.1.1.3 mrg If PROBE: returns TRUE on success or if the plugin/device shall be silently 1519 1.1.1.3 mrg ignored, and otherwise emits an error and returns FALSE. */ 1520 1.1 mrg 1521 1.1 mrg static bool 1522 1.1.1.3 mrg init_hsa_context (bool probe) 1523 1.1 mrg { 1524 1.1 mrg hsa_status_t status; 1525 1.1 mrg int agent_index = 0; 1526 1.1 mrg 1527 1.1 mrg if (hsa_context.initialized) 1528 1.1 mrg return true; 1529 1.1 mrg init_environment_variables (); 1530 1.1 mrg if (!init_hsa_runtime_functions ()) 1531 1.1 mrg { 1532 1.1.1.3 mrg const char *msg = "Run-time could not be dynamically opened"; 1533 1.1 mrg if (suppress_host_fallback) 1534 1.1.1.3 mrg GOMP_PLUGIN_fatal ("%s\n", msg); 1535 1.1.1.3 mrg else 1536 1.1.1.3 mrg GCN_WARNING ("%s\n", msg); 1537 1.1.1.3 mrg return probe ? 
true : false; 1538 1.1 mrg } 1539 1.1 mrg status = hsa_fns.hsa_init_fn (); 1540 1.1 mrg if (status != HSA_STATUS_SUCCESS) 1541 1.1 mrg return hsa_error ("Run-time could not be initialized", status); 1542 1.1 mrg GCN_DEBUG ("HSA run-time initialized for GCN\n"); 1543 1.1 mrg 1544 1.1 mrg if (debug) 1545 1.1 mrg dump_hsa_system_info (); 1546 1.1 mrg 1547 1.1 mrg status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL); 1548 1.1 mrg if (status != HSA_STATUS_SUCCESS) 1549 1.1 mrg return hsa_error ("GCN GPU devices could not be enumerated", status); 1550 1.1 mrg GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count); 1551 1.1 mrg 1552 1.1 mrg hsa_context.agents 1553 1.1 mrg = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count 1554 1.1 mrg * sizeof (struct agent_info)); 1555 1.1 mrg status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index); 1556 1.1 mrg if (status != HSA_STATUS_SUCCESS) 1557 1.1 mrg return hsa_error ("Scanning compute agents failed", status); 1558 1.1 mrg if (agent_index != hsa_context.agent_count) 1559 1.1 mrg { 1560 1.1 mrg GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents"); 1561 1.1 mrg return false; 1562 1.1 mrg } 1563 1.1 mrg 1564 1.1 mrg if (debug) 1565 1.1 mrg { 1566 1.1 mrg status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL); 1567 1.1 mrg if (status != HSA_STATUS_SUCCESS) 1568 1.1 mrg GOMP_PLUGIN_error ("Failed to list all HSA runtime agents"); 1569 1.1 mrg } 1570 1.1 mrg 1571 1.1 mrg uint16_t minor, major; 1572 1.1 mrg status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR, 1573 1.1 mrg &minor); 1574 1.1 mrg if (status != HSA_STATUS_SUCCESS) 1575 1.1 mrg GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version"); 1576 1.1 mrg status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR, 1577 1.1 mrg &major); 1578 1.1 mrg if (status != HSA_STATUS_SUCCESS) 1579 1.1 mrg GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version"); 1580 1.1 mrg 1581 
1.1 mrg size_t len = sizeof hsa_context.driver_version_s; 1582 1.1 mrg int printed = snprintf (hsa_context.driver_version_s, len, 1583 1.1 mrg "HSA Runtime %hu.%hu", (unsigned short int)major, 1584 1.1 mrg (unsigned short int)minor); 1585 1.1 mrg if (printed >= len) 1586 1.1 mrg GCN_WARNING ("HSA runtime version string was truncated." 1587 1.1 mrg "Version %hu.%hu is too long.", (unsigned short int)major, 1588 1.1 mrg (unsigned short int)minor); 1589 1.1 mrg 1590 1.1 mrg hsa_context.initialized = true; 1591 1.1 mrg return true; 1592 1.1 mrg } 1593 1.1 mrg 1594 1.1 mrg /* Verify that hsa_context has already been initialized and return the 1595 1.1 mrg agent_info structure describing device number N. Return NULL on error. */ 1596 1.1 mrg 1597 1.1 mrg static struct agent_info * 1598 1.1 mrg get_agent_info (int n) 1599 1.1 mrg { 1600 1.1 mrg if (!hsa_context.initialized) 1601 1.1 mrg { 1602 1.1 mrg GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context."); 1603 1.1 mrg return NULL; 1604 1.1 mrg } 1605 1.1 mrg if (n >= hsa_context.agent_count) 1606 1.1 mrg { 1607 1.1 mrg GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n); 1608 1.1 mrg return NULL; 1609 1.1 mrg } 1610 1.1 mrg if (!hsa_context.agents[n].initialized) 1611 1.1 mrg { 1612 1.1 mrg GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent."); 1613 1.1 mrg return NULL; 1614 1.1 mrg } 1615 1.1 mrg return &hsa_context.agents[n]; 1616 1.1 mrg } 1617 1.1 mrg 1618 1.1 mrg /* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions. 1619 1.1 mrg 1620 1.1 mrg Selects (breaks at) a suitable region of type KIND. 
*/ 1621 1.1 mrg 1622 1.1 mrg static hsa_status_t 1623 1.1 mrg get_memory_region (hsa_region_t region, hsa_region_t *retval, 1624 1.1 mrg hsa_region_global_flag_t kind) 1625 1.1 mrg { 1626 1.1 mrg hsa_status_t status; 1627 1.1 mrg hsa_region_segment_t segment; 1628 1.1 mrg 1629 1.1 mrg status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT, 1630 1.1 mrg &segment); 1631 1.1 mrg if (status != HSA_STATUS_SUCCESS) 1632 1.1 mrg return status; 1633 1.1 mrg if (segment != HSA_REGION_SEGMENT_GLOBAL) 1634 1.1 mrg return HSA_STATUS_SUCCESS; 1635 1.1 mrg 1636 1.1 mrg uint32_t flags; 1637 1.1 mrg status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS, 1638 1.1 mrg &flags); 1639 1.1 mrg if (status != HSA_STATUS_SUCCESS) 1640 1.1 mrg return status; 1641 1.1 mrg if (flags & kind) 1642 1.1 mrg { 1643 1.1 mrg *retval = region; 1644 1.1 mrg return HSA_STATUS_INFO_BREAK; 1645 1.1 mrg } 1646 1.1 mrg return HSA_STATUS_SUCCESS; 1647 1.1 mrg } 1648 1.1 mrg 1649 1.1 mrg /* Callback of hsa_agent_iterate_regions. 1650 1.1 mrg 1651 1.1 mrg Selects a kernargs memory region. */ 1652 1.1 mrg 1653 1.1 mrg static hsa_status_t 1654 1.1 mrg get_kernarg_memory_region (hsa_region_t region, void *data) 1655 1.1 mrg { 1656 1.1 mrg return get_memory_region (region, (hsa_region_t *)data, 1657 1.1 mrg HSA_REGION_GLOBAL_FLAG_KERNARG); 1658 1.1 mrg } 1659 1.1 mrg 1660 1.1 mrg /* Callback of hsa_agent_iterate_regions. 1661 1.1 mrg 1662 1.1 mrg Selects a coarse-grained memory region suitable for the heap and 1663 1.1 mrg offload data. 
*/ 1664 1.1 mrg 1665 1.1 mrg static hsa_status_t 1666 1.1 mrg get_data_memory_region (hsa_region_t region, void *data) 1667 1.1 mrg { 1668 1.1 mrg return get_memory_region (region, (hsa_region_t *)data, 1669 1.1 mrg HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED); 1670 1.1 mrg } 1671 1.1 mrg 1672 1.1 mrg static int 1673 1.1 mrg elf_gcn_isa_field (Elf64_Ehdr *image) 1674 1.1 mrg { 1675 1.1 mrg return image->e_flags & EF_AMDGPU_MACH_MASK; 1676 1.1 mrg } 1677 1.1 mrg 1678 1.1 mrg const static char *gcn_gfx803_s = "gfx803"; 1679 1.1 mrg const static char *gcn_gfx900_s = "gfx900"; 1680 1.1 mrg const static char *gcn_gfx906_s = "gfx906"; 1681 1.1.1.2 mrg const static char *gcn_gfx908_s = "gfx908"; 1682 1.1.1.3 mrg const static char *gcn_gfx90a_s = "gfx90a"; 1683 1.1.1.3 mrg const static char *gcn_gfx90c_s = "gfx90c"; 1684 1.1.1.3 mrg const static char *gcn_gfx1030_s = "gfx1030"; 1685 1.1.1.3 mrg const static char *gcn_gfx1036_s = "gfx1036"; 1686 1.1.1.3 mrg const static char *gcn_gfx1100_s = "gfx1100"; 1687 1.1.1.3 mrg const static char *gcn_gfx1103_s = "gfx1103"; 1688 1.1.1.3 mrg const static int gcn_isa_name_len = 7; 1689 1.1 mrg 1690 1.1 mrg /* Returns the name that the HSA runtime uses for the ISA or NULL if we do not 1691 1.1 mrg support the ISA. 
*/ 1692 1.1 mrg 1693 1.1 mrg static const char* 1694 1.1 mrg isa_hsa_name (int isa) { 1695 1.1 mrg switch(isa) 1696 1.1 mrg { 1697 1.1 mrg case EF_AMDGPU_MACH_AMDGCN_GFX803: 1698 1.1 mrg return gcn_gfx803_s; 1699 1.1 mrg case EF_AMDGPU_MACH_AMDGCN_GFX900: 1700 1.1 mrg return gcn_gfx900_s; 1701 1.1 mrg case EF_AMDGPU_MACH_AMDGCN_GFX906: 1702 1.1 mrg return gcn_gfx906_s; 1703 1.1.1.2 mrg case EF_AMDGPU_MACH_AMDGCN_GFX908: 1704 1.1.1.2 mrg return gcn_gfx908_s; 1705 1.1.1.3 mrg case EF_AMDGPU_MACH_AMDGCN_GFX90a: 1706 1.1.1.3 mrg return gcn_gfx90a_s; 1707 1.1.1.3 mrg case EF_AMDGPU_MACH_AMDGCN_GFX90c: 1708 1.1.1.3 mrg return gcn_gfx90c_s; 1709 1.1.1.3 mrg case EF_AMDGPU_MACH_AMDGCN_GFX1030: 1710 1.1.1.3 mrg return gcn_gfx1030_s; 1711 1.1.1.3 mrg case EF_AMDGPU_MACH_AMDGCN_GFX1036: 1712 1.1.1.3 mrg return gcn_gfx1036_s; 1713 1.1.1.3 mrg case EF_AMDGPU_MACH_AMDGCN_GFX1100: 1714 1.1.1.3 mrg return gcn_gfx1100_s; 1715 1.1.1.3 mrg case EF_AMDGPU_MACH_AMDGCN_GFX1103: 1716 1.1.1.3 mrg return gcn_gfx1103_s; 1717 1.1 mrg } 1718 1.1 mrg return NULL; 1719 1.1 mrg } 1720 1.1 mrg 1721 1.1 mrg /* Returns the user-facing name that GCC uses to identify the architecture (e.g. 1722 1.1 mrg with -march) or NULL if we do not support the ISA. 1723 1.1 mrg Keep in sync with /gcc/config/gcn/gcn.{c,opt}. */ 1724 1.1 mrg 1725 1.1 mrg static const char* 1726 1.1 mrg isa_gcc_name (int isa) { 1727 1.1 mrg switch(isa) 1728 1.1 mrg { 1729 1.1 mrg case EF_AMDGPU_MACH_AMDGCN_GFX803: 1730 1.1 mrg return "fiji"; 1731 1.1 mrg default: 1732 1.1 mrg return isa_hsa_name (isa); 1733 1.1 mrg } 1734 1.1 mrg } 1735 1.1 mrg 1736 1.1 mrg /* Returns the code which is used in the GCN object code to identify the ISA with 1737 1.1 mrg the given name (as used by the HSA runtime). 
*/ 1738 1.1 mrg 1739 1.1 mrg static gcn_isa 1740 1.1 mrg isa_code(const char *isa) { 1741 1.1 mrg if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len)) 1742 1.1 mrg return EF_AMDGPU_MACH_AMDGCN_GFX803; 1743 1.1 mrg 1744 1.1 mrg if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len)) 1745 1.1 mrg return EF_AMDGPU_MACH_AMDGCN_GFX900; 1746 1.1 mrg 1747 1.1 mrg if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len)) 1748 1.1 mrg return EF_AMDGPU_MACH_AMDGCN_GFX906; 1749 1.1 mrg 1750 1.1.1.2 mrg if (!strncmp (isa, gcn_gfx908_s, gcn_isa_name_len)) 1751 1.1.1.2 mrg return EF_AMDGPU_MACH_AMDGCN_GFX908; 1752 1.1.1.2 mrg 1753 1.1.1.3 mrg if (!strncmp (isa, gcn_gfx90a_s, gcn_isa_name_len)) 1754 1.1.1.3 mrg return EF_AMDGPU_MACH_AMDGCN_GFX90a; 1755 1.1.1.3 mrg 1756 1.1.1.3 mrg if (!strncmp (isa, gcn_gfx90c_s, gcn_isa_name_len)) 1757 1.1.1.3 mrg return EF_AMDGPU_MACH_AMDGCN_GFX90c; 1758 1.1.1.3 mrg 1759 1.1.1.3 mrg if (!strncmp (isa, gcn_gfx1030_s, gcn_isa_name_len)) 1760 1.1.1.3 mrg return EF_AMDGPU_MACH_AMDGCN_GFX1030; 1761 1.1.1.3 mrg 1762 1.1.1.3 mrg if (!strncmp (isa, gcn_gfx1036_s, gcn_isa_name_len)) 1763 1.1.1.3 mrg return EF_AMDGPU_MACH_AMDGCN_GFX1036; 1764 1.1.1.3 mrg 1765 1.1.1.3 mrg if (!strncmp (isa, gcn_gfx1100_s, gcn_isa_name_len)) 1766 1.1.1.3 mrg return EF_AMDGPU_MACH_AMDGCN_GFX1100; 1767 1.1.1.3 mrg 1768 1.1.1.3 mrg if (!strncmp (isa, gcn_gfx1103_s, gcn_isa_name_len)) 1769 1.1.1.3 mrg return EF_AMDGPU_MACH_AMDGCN_GFX1103; 1770 1.1.1.3 mrg 1771 1.1.1.3 mrg return EF_AMDGPU_MACH_UNSUPPORTED; 1772 1.1.1.3 mrg } 1773 1.1.1.3 mrg 1774 1.1.1.3 mrg /* CDNA2 devices have twice as many VGPRs compared to older devices. 
*/ 1775 1.1.1.3 mrg 1776 1.1.1.3 mrg static int 1777 1.1.1.3 mrg max_isa_vgprs (int isa) 1778 1.1.1.3 mrg { 1779 1.1.1.3 mrg switch (isa) 1780 1.1.1.3 mrg { 1781 1.1.1.3 mrg case EF_AMDGPU_MACH_AMDGCN_GFX803: 1782 1.1.1.3 mrg case EF_AMDGPU_MACH_AMDGCN_GFX900: 1783 1.1.1.3 mrg case EF_AMDGPU_MACH_AMDGCN_GFX906: 1784 1.1.1.3 mrg case EF_AMDGPU_MACH_AMDGCN_GFX908: 1785 1.1.1.3 mrg return 256; 1786 1.1.1.3 mrg case EF_AMDGPU_MACH_AMDGCN_GFX90a: 1787 1.1.1.3 mrg return 512; 1788 1.1.1.3 mrg case EF_AMDGPU_MACH_AMDGCN_GFX90c: 1789 1.1.1.3 mrg return 256; 1790 1.1.1.3 mrg case EF_AMDGPU_MACH_AMDGCN_GFX1030: 1791 1.1.1.3 mrg case EF_AMDGPU_MACH_AMDGCN_GFX1036: 1792 1.1.1.3 mrg return 512; /* 512 SIMD32 = 256 wavefrontsize64. */ 1793 1.1.1.3 mrg case EF_AMDGPU_MACH_AMDGCN_GFX1100: 1794 1.1.1.3 mrg case EF_AMDGPU_MACH_AMDGCN_GFX1103: 1795 1.1.1.3 mrg return 1536; /* 1536 SIMD32 = 768 wavefrontsize64. */ 1796 1.1.1.3 mrg } 1797 1.1.1.3 mrg GOMP_PLUGIN_fatal ("unhandled ISA in max_isa_vgprs"); 1798 1.1 mrg } 1799 1.1 mrg 1800 1.1 mrg /* }}} */ 1801 1.1 mrg /* {{{ Run */ 1802 1.1 mrg 1803 1.1.1.3 mrg /* Create or reuse a team arena and stack space. 1804 1.1 mrg 1805 1.1 mrg Team arenas are used by OpenMP to avoid calling malloc multiple times 1806 1.1 mrg while setting up each team. This is purely a performance optimization. 1807 1.1 mrg 1808 1.1.1.3 mrg The stack space is used by all kernels. We must allocate it in such a 1809 1.1.1.3 mrg way that the reverse offload implmentation can access the data. 1810 1.1 mrg 1811 1.1.1.3 mrg Allocating this memory costs performance, so this function will reuse an 1812 1.1.1.3 mrg existing allocation if a large enough one is idle. 1813 1.1.1.3 mrg The memory lock is released, but not deallocated, when the kernel exits. 
*/

static void
configure_ephemeral_memories (struct kernel_info *kernel,
			      struct kernargs_abi *kernargs, int num_teams,
			      int num_threads)
{
  struct agent_info *agent = kernel->agent;
  struct ephemeral_memories_list **next_ptr = &agent->ephemeral_memories_list;
  struct ephemeral_memories_list *item;

  /* Only OpenMP kernels use a team arena; OpenACC gets stacks only.  */
  int actual_arena_size = (kernel->kind == KIND_OPENMP
			   ? team_arena_size : 0);
  int actual_arena_total_size = actual_arena_size * num_teams;
  /* One allocation holds all the arenas followed by all the stacks.  */
  size_t size = (actual_arena_total_size
		 + num_teams * num_threads * stack_size);

  /* Look for an idle allocation that is large enough; trylock succeeds
     exactly when the entry is not in use by another kernel launch.
     NEXT_PTR ends up pointing at the list tail if nothing matches.  */
  for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
    {
      if (item->size < size)
	continue;

      if (pthread_mutex_trylock (&item->in_use) == 0)
	break;
    }

  if (!item)
    {
      GCN_DEBUG ("Creating a new %sstack for %d teams with %d threads"
		 " (%zd bytes)\n", (actual_arena_size ? "arena and " : ""),
		 num_teams, num_threads, size);

      /* Appending to the list is serialized by the write lock; the new
	 entry's in_use mutex is initialized and locked before the write
	 lock is dropped.  */
      if (pthread_mutex_lock (&agent->ephemeral_memories_write_lock))
	{
	  GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
	  return;
	}
      /* NOTE(review): malloc result is not checked before use -- a failed
	 allocation would dereference NULL here.  */
      item = malloc (sizeof (*item));
      item->size = size;
      item->next = NULL;
      *next_ptr = item;

      if (pthread_mutex_init (&item->in_use, NULL))
	{
	  GOMP_PLUGIN_error ("Failed to initialize a GCN memory write mutex");
	  return;
	}
      if (pthread_mutex_lock (&item->in_use))
	{
	  GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
	  return;
	}
      if (pthread_mutex_unlock (&agent->ephemeral_memories_write_lock))
	{
	  GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
	  return;
	}

      hsa_status_t status;
      status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, size,
					       &item->address);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
      status = hsa_fns.hsa_memory_assign_agent_fn (item->address, agent->id,
						   HSA_ACCESS_PERMISSION_RW);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not assign arena & stack memory to device", status);
    }

  /* Publish the layout to the kernel via the kernargs ABI block: arenas
     at the start of the allocation, stacks immediately after.  */
  kernargs->arena_ptr = (actual_arena_total_size
			 ? (uint64_t)item->address
			 : 0);
  kernargs->stack_ptr = (uint64_t)item->address + actual_arena_total_size;
  kernargs->arena_size_per_team = actual_arena_size;
  kernargs->stack_size_per_thread = stack_size;
}

/* Mark an ephemeral memory space available for reuse.  */

static void
release_ephemeral_memories (struct agent_info* agent, void *address)
{
  struct ephemeral_memories_list *item;

  /* Find the entry by its device address and unlock it; the memory itself
     stays allocated for reuse by a later launch.  */
  for (item = agent->ephemeral_memories_list; item; item = item->next)
    {
      if (item->address == address)
	{
	  if (pthread_mutex_unlock (&item->in_use))
	    GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
	  return;
	}
    }
  GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
}

/* Clean up all the allocated team arenas.  */

static bool
destroy_ephemeral_memories (struct agent_info *agent)
{
  struct ephemeral_memories_list *item, *next;

  for (item = agent->ephemeral_memories_list; item; item = next)
    {
      next = item->next;
      hsa_fns.hsa_memory_free_fn (item->address);
      if (pthread_mutex_destroy (&item->in_use))
	{
	  GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
	  return false;
	}
      free (item);
    }
  agent->ephemeral_memories_list = NULL;

  return true;
}

/* Allocate memory on a specified device.
*/ 1933 1.1 mrg 1934 1.1 mrg static void * 1935 1.1 mrg alloc_by_agent (struct agent_info *agent, size_t size) 1936 1.1 mrg { 1937 1.1 mrg GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id); 1938 1.1 mrg 1939 1.1 mrg void *ptr; 1940 1.1 mrg hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, 1941 1.1 mrg size, &ptr); 1942 1.1 mrg if (status != HSA_STATUS_SUCCESS) 1943 1.1 mrg { 1944 1.1 mrg hsa_error ("Could not allocate device memory", status); 1945 1.1 mrg return NULL; 1946 1.1 mrg } 1947 1.1 mrg 1948 1.1 mrg status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id, 1949 1.1 mrg HSA_ACCESS_PERMISSION_RW); 1950 1.1 mrg if (status != HSA_STATUS_SUCCESS) 1951 1.1 mrg { 1952 1.1 mrg hsa_error ("Could not assign data memory to device", status); 1953 1.1 mrg return NULL; 1954 1.1 mrg } 1955 1.1 mrg 1956 1.1 mrg struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread (); 1957 1.1 mrg bool profiling_dispatch_p 1958 1.1 mrg = __builtin_expect (thr != NULL && thr->prof_info != NULL, false); 1959 1.1 mrg if (profiling_dispatch_p) 1960 1.1 mrg { 1961 1.1 mrg acc_prof_info *prof_info = thr->prof_info; 1962 1.1 mrg acc_event_info data_event_info; 1963 1.1 mrg acc_api_info *api_info = thr->api_info; 1964 1.1 mrg 1965 1.1 mrg prof_info->event_type = acc_ev_alloc; 1966 1.1 mrg 1967 1.1 mrg data_event_info.data_event.event_type = prof_info->event_type; 1968 1.1 mrg data_event_info.data_event.valid_bytes 1969 1.1 mrg = _ACC_DATA_EVENT_INFO_VALID_BYTES; 1970 1.1 mrg data_event_info.data_event.parent_construct 1971 1.1 mrg = acc_construct_parallel; 1972 1.1 mrg data_event_info.data_event.implicit = 1; 1973 1.1 mrg data_event_info.data_event.tool_info = NULL; 1974 1.1 mrg data_event_info.data_event.var_name = NULL; 1975 1.1 mrg data_event_info.data_event.bytes = size; 1976 1.1 mrg data_event_info.data_event.host_ptr = NULL; 1977 1.1 mrg data_event_info.data_event.device_ptr = (void *) ptr; 1978 1.1 mrg 1979 1.1 mrg api_info->device_api = 
acc_device_api_other; 1980 1.1 mrg 1981 1.1 mrg GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, 1982 1.1 mrg api_info); 1983 1.1 mrg } 1984 1.1 mrg 1985 1.1 mrg return ptr; 1986 1.1 mrg } 1987 1.1 mrg 1988 1.1 mrg /* Create kernel dispatch data structure for given KERNEL, along with 1989 1.1 mrg the necessary device signals and memory allocations. */ 1990 1.1 mrg 1991 1.1 mrg static struct kernel_dispatch * 1992 1.1.1.3 mrg create_kernel_dispatch (struct kernel_info *kernel, int num_teams, 1993 1.1.1.3 mrg int num_threads) 1994 1.1 mrg { 1995 1.1 mrg struct agent_info *agent = kernel->agent; 1996 1.1 mrg struct kernel_dispatch *shadow 1997 1.1 mrg = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch)); 1998 1.1 mrg 1999 1.1 mrg shadow->agent = kernel->agent; 2000 1.1 mrg shadow->object = kernel->object; 2001 1.1 mrg 2002 1.1 mrg hsa_signal_t sync_signal; 2003 1.1 mrg hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal); 2004 1.1 mrg if (status != HSA_STATUS_SUCCESS) 2005 1.1 mrg hsa_fatal ("Error creating the GCN sync signal", status); 2006 1.1 mrg 2007 1.1 mrg shadow->signal = sync_signal.handle; 2008 1.1 mrg shadow->private_segment_size = kernel->private_segment_size; 2009 1.1.1.3 mrg 2010 1.1.1.3 mrg if (lowlat_size < 0) 2011 1.1.1.3 mrg { 2012 1.1.1.3 mrg /* Divide the LDS between the number of running teams. 2013 1.1.1.3 mrg Allocate not less than is defined in the kernel metadata. */ 2014 1.1.1.3 mrg int teams_per_cu = num_teams / get_cu_count (agent); 2015 1.1.1.3 mrg int LDS_per_team = (teams_per_cu ? 65536 / teams_per_cu : 65536); 2016 1.1.1.3 mrg shadow->group_segment_size 2017 1.1.1.3 mrg = (kernel->group_segment_size > LDS_per_team 2018 1.1.1.3 mrg ? kernel->group_segment_size 2019 1.1.1.3 mrg : LDS_per_team);; 2020 1.1.1.3 mrg } 2021 1.1.1.3 mrg else if (lowlat_size < GCN_LOWLAT_HEAP+8) 2022 1.1.1.3 mrg /* Ensure that there's space for the OpenMP libgomp data. 
*/ 2023 1.1.1.3 mrg shadow->group_segment_size = GCN_LOWLAT_HEAP+8; 2024 1.1.1.3 mrg else 2025 1.1.1.3 mrg shadow->group_segment_size = (lowlat_size > 65536 2026 1.1.1.3 mrg ? 65536 2027 1.1.1.3 mrg : lowlat_size); 2028 1.1 mrg 2029 1.1 mrg /* We expect kernels to request a single pointer, explicitly, and the 2030 1.1 mrg rest of struct kernargs, implicitly. If they request anything else 2031 1.1 mrg then something is wrong. */ 2032 1.1 mrg if (kernel->kernarg_segment_size > 8) 2033 1.1 mrg { 2034 1.1 mrg GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested"); 2035 1.1 mrg return NULL; 2036 1.1 mrg } 2037 1.1 mrg 2038 1.1 mrg status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region, 2039 1.1 mrg sizeof (struct kernargs), 2040 1.1 mrg &shadow->kernarg_address); 2041 1.1 mrg if (status != HSA_STATUS_SUCCESS) 2042 1.1 mrg hsa_fatal ("Could not allocate memory for GCN kernel arguments", status); 2043 1.1 mrg struct kernargs *kernargs = shadow->kernarg_address; 2044 1.1 mrg 2045 1.1 mrg /* Zero-initialize the output_data (minimum needed). */ 2046 1.1.1.3 mrg kernargs->abi.out_ptr = (int64_t)&kernargs->output_data; 2047 1.1 mrg kernargs->output_data.next_output = 0; 2048 1.1 mrg for (unsigned i = 0; 2049 1.1 mrg i < (sizeof (kernargs->output_data.queue) 2050 1.1 mrg / sizeof (kernargs->output_data.queue[0])); 2051 1.1 mrg i++) 2052 1.1 mrg kernargs->output_data.queue[i].written = 0; 2053 1.1 mrg kernargs->output_data.consumed = 0; 2054 1.1 mrg 2055 1.1 mrg /* Pass in the heap location. */ 2056 1.1.1.3 mrg kernargs->abi.heap_ptr = (int64_t)kernel->module->heap; 2057 1.1 mrg 2058 1.1.1.3 mrg /* Create the ephemeral memory spaces. */ 2059 1.1.1.3 mrg configure_ephemeral_memories (kernel, &kernargs->abi, num_teams, num_threads); 2060 1.1 mrg 2061 1.1 mrg /* Ensure we can recognize unset return values. 
*/ 2062 1.1 mrg kernargs->output_data.return_value = 0xcafe0000; 2063 1.1 mrg 2064 1.1 mrg return shadow; 2065 1.1 mrg } 2066 1.1 mrg 2067 1.1.1.3 mrg static void 2068 1.1.1.3 mrg process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs, 2069 1.1.1.3 mrg uint64_t sizes, uint64_t kinds, uint64_t dev_num64) 2070 1.1.1.3 mrg { 2071 1.1.1.3 mrg int dev_num = dev_num64; 2072 1.1.1.3 mrg GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num, 2073 1.1.1.3 mrg NULL); 2074 1.1.1.3 mrg } 2075 1.1.1.3 mrg 2076 1.1 mrg /* Output any data written to console output from the kernel. It is expected 2077 1.1 mrg that this function is polled during kernel execution. 2078 1.1 mrg 2079 1.1 mrg We print all entries from the last item printed to the next entry without 2080 1.1 mrg a "written" flag. If the "final" flag is set then it'll continue right to 2081 1.1 mrg the end. 2082 1.1 mrg 2083 1.1 mrg The print buffer is circular, but the from and to locations don't wrap when 2084 1.1 mrg the buffer does, so the output limit is UINT_MAX. The target blocks on 2085 1.1 mrg output when the buffer is full. */ 2086 1.1 mrg 2087 1.1 mrg static void 2088 1.1 mrg console_output (struct kernel_info *kernel, struct kernargs *kernargs, 2089 1.1 mrg bool final) 2090 1.1 mrg { 2091 1.1 mrg unsigned int limit = (sizeof (kernargs->output_data.queue) 2092 1.1 mrg / sizeof (kernargs->output_data.queue[0])); 2093 1.1 mrg 2094 1.1 mrg unsigned int from = __atomic_load_n (&kernargs->output_data.consumed, 2095 1.1 mrg __ATOMIC_ACQUIRE); 2096 1.1 mrg unsigned int to = kernargs->output_data.next_output; 2097 1.1 mrg 2098 1.1 mrg if (from > to) 2099 1.1 mrg { 2100 1.1 mrg /* Overflow. 
*/ 2101 1.1 mrg if (final) 2102 1.1 mrg printf ("GCN print buffer overflowed.\n"); 2103 1.1 mrg return; 2104 1.1 mrg } 2105 1.1 mrg 2106 1.1 mrg unsigned int i; 2107 1.1 mrg for (i = from; i < to; i++) 2108 1.1 mrg { 2109 1.1 mrg struct printf_data *data = &kernargs->output_data.queue[i%limit]; 2110 1.1 mrg 2111 1.1 mrg if (!data->written && !final) 2112 1.1 mrg break; 2113 1.1 mrg 2114 1.1 mrg switch (data->type) 2115 1.1 mrg { 2116 1.1 mrg case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break; 2117 1.1 mrg case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break; 2118 1.1 mrg case 2: printf ("%.128s%.128s\n", data->msg, data->text); break; 2119 1.1 mrg case 3: printf ("%.128s%.128s", data->msg, data->text); break; 2120 1.1.1.3 mrg case 4: 2121 1.1.1.3 mrg process_reverse_offload (data->value_u64[0], data->value_u64[1], 2122 1.1.1.3 mrg data->value_u64[2], data->value_u64[3], 2123 1.1.1.3 mrg data->value_u64[4], data->value_u64[5]); 2124 1.1.1.3 mrg break; 2125 1.1 mrg default: printf ("GCN print buffer error!\n"); break; 2126 1.1 mrg } 2127 1.1 mrg data->written = 0; 2128 1.1 mrg __atomic_store_n (&kernargs->output_data.consumed, i+1, 2129 1.1 mrg __ATOMIC_RELEASE); 2130 1.1 mrg } 2131 1.1 mrg fflush (stdout); 2132 1.1 mrg } 2133 1.1 mrg 2134 1.1 mrg /* Release data structure created for a kernel dispatch in SHADOW argument, 2135 1.1 mrg and clean up the signal and memory allocations. 
*/ 2136 1.1 mrg 2137 1.1 mrg static void 2138 1.1 mrg release_kernel_dispatch (struct kernel_dispatch *shadow) 2139 1.1 mrg { 2140 1.1 mrg GCN_DEBUG ("Released kernel dispatch: %p\n", shadow); 2141 1.1 mrg 2142 1.1 mrg struct kernargs *kernargs = shadow->kernarg_address; 2143 1.1.1.3 mrg void *addr = (void *)kernargs->abi.arena_ptr; 2144 1.1.1.3 mrg if (!addr) 2145 1.1.1.3 mrg addr = (void *)kernargs->abi.stack_ptr; 2146 1.1.1.3 mrg release_ephemeral_memories (shadow->agent, addr); 2147 1.1 mrg 2148 1.1 mrg hsa_fns.hsa_memory_free_fn (shadow->kernarg_address); 2149 1.1 mrg 2150 1.1 mrg hsa_signal_t s; 2151 1.1 mrg s.handle = shadow->signal; 2152 1.1 mrg hsa_fns.hsa_signal_destroy_fn (s); 2153 1.1 mrg 2154 1.1 mrg free (shadow); 2155 1.1 mrg } 2156 1.1 mrg 2157 1.1 mrg /* Extract the properties from a kernel binary. */ 2158 1.1 mrg 2159 1.1 mrg static void 2160 1.1 mrg init_kernel_properties (struct kernel_info *kernel) 2161 1.1 mrg { 2162 1.1 mrg hsa_status_t status; 2163 1.1 mrg struct agent_info *agent = kernel->agent; 2164 1.1 mrg hsa_executable_symbol_t kernel_symbol; 2165 1.1.1.2 mrg char *buf = alloca (strlen (kernel->name) + 4); 2166 1.1.1.2 mrg sprintf (buf, "%s.kd", kernel->name); 2167 1.1 mrg status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL, 2168 1.1.1.2 mrg buf, agent->id, 2169 1.1 mrg 0, &kernel_symbol); 2170 1.1 mrg if (status != HSA_STATUS_SUCCESS) 2171 1.1 mrg { 2172 1.1 mrg hsa_warn ("Could not find symbol for kernel in the code object", status); 2173 1.1.1.2 mrg fprintf (stderr, "not found name: '%s'\n", buf); 2174 1.1 mrg dump_executable_symbols (agent->executable); 2175 1.1 mrg goto failure; 2176 1.1 mrg } 2177 1.1 mrg GCN_DEBUG ("Located kernel %s\n", kernel->name); 2178 1.1 mrg status = hsa_fns.hsa_executable_symbol_get_info_fn 2179 1.1 mrg (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object); 2180 1.1 mrg if (status != HSA_STATUS_SUCCESS) 2181 1.1 mrg hsa_fatal ("Could not extract a kernel object 
from its symbol", status); 2182 1.1 mrg status = hsa_fns.hsa_executable_symbol_get_info_fn 2183 1.1 mrg (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, 2184 1.1 mrg &kernel->kernarg_segment_size); 2185 1.1 mrg if (status != HSA_STATUS_SUCCESS) 2186 1.1 mrg hsa_fatal ("Could not get info about kernel argument size", status); 2187 1.1 mrg status = hsa_fns.hsa_executable_symbol_get_info_fn 2188 1.1 mrg (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, 2189 1.1 mrg &kernel->group_segment_size); 2190 1.1 mrg if (status != HSA_STATUS_SUCCESS) 2191 1.1 mrg hsa_fatal ("Could not get info about kernel group segment size", status); 2192 1.1 mrg status = hsa_fns.hsa_executable_symbol_get_info_fn 2193 1.1 mrg (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, 2194 1.1 mrg &kernel->private_segment_size); 2195 1.1 mrg if (status != HSA_STATUS_SUCCESS) 2196 1.1 mrg hsa_fatal ("Could not get info about kernel private segment size", 2197 1.1 mrg status); 2198 1.1 mrg 2199 1.1 mrg /* The kernel type is not known until something tries to launch it. */ 2200 1.1 mrg kernel->kind = KIND_UNKNOWN; 2201 1.1 mrg 2202 1.1 mrg GCN_DEBUG ("Kernel structure for %s fully initialized with " 2203 1.1 mrg "following segment sizes: \n", kernel->name); 2204 1.1 mrg GCN_DEBUG (" group_segment_size: %u\n", 2205 1.1 mrg (unsigned) kernel->group_segment_size); 2206 1.1 mrg GCN_DEBUG (" private_segment_size: %u\n", 2207 1.1 mrg (unsigned) kernel->private_segment_size); 2208 1.1 mrg GCN_DEBUG (" kernarg_segment_size: %u\n", 2209 1.1 mrg (unsigned) kernel->kernarg_segment_size); 2210 1.1 mrg return; 2211 1.1 mrg 2212 1.1 mrg failure: 2213 1.1 mrg kernel->initialization_failed = true; 2214 1.1 mrg } 2215 1.1 mrg 2216 1.1 mrg /* Do all the work that is necessary before running KERNEL for the first time. 2217 1.1 mrg The function assumes the program has been created, finalized and frozen by 2218 1.1 mrg create_and_finalize_hsa_program. 
*/

static void
init_kernel (struct kernel_info *kernel)
{
  /* Serialize first-time initialization; later callers return quickly.  */
  if (pthread_mutex_lock (&kernel->init_mutex))
    GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
  if (kernel->initialized)
    {
      if (pthread_mutex_unlock (&kernel->init_mutex))
	GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
			   "mutex");

      return;
    }

  init_kernel_properties (kernel);

  if (!kernel->initialization_failed)
    {
      GCN_DEBUG ("\n");

      kernel->initialized = true;
    }
  if (pthread_mutex_unlock (&kernel->init_mutex))
    GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
		       "mutex");
}

/* Run KERNEL on its agent, pass VARS to it as arguments and take
   launch attributes from KLA.

   MODULE_LOCKED indicates that the caller already holds the lock and
   run_kernel need not lock it again.
   If AQ is NULL then agent->sync_queue will be used.  */

static void
run_kernel (struct kernel_info *kernel, void *vars,
	    struct GOMP_kernel_launch_attributes *kla,
	    struct goacc_asyncqueue *aq, bool module_locked)
{
  struct agent_info *agent = kernel->agent;
  GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
	     kernel->description->vpgr_count);

  /* Reduce the number of threads/workers if there are insufficient
     VGPRs available to run the kernels together.  */
  if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
    {
      int max_vgprs = max_isa_vgprs (agent->device_isa);
      /* VGPRs are allocated in groups of four.  */
      int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
      int max_threads = (max_vgprs / granulated_vgprs) * 4;
      if (kla->gdims[2] > max_threads)
	{
	  GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
		       " per team/gang - reducing to %d threads/workers.\n",
		       kla->gdims[2], max_threads);
	  kla->gdims[2] = max_threads;
	}
    }

  GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
	     (aq ? aq->id : 0));
  GCN_DEBUG ("GCN launch attribs: gdims:[");
  int i;
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->gdims[i]);
    }
  DEBUG_PRINT ("], normalized gdims:[");
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
    }
  DEBUG_PRINT ("], wdims:[");
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->wdims[i]);
    }
  DEBUG_PRINT ("]\n");
  DEBUG_FLUSH ();

  if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
    GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");

  if (!agent->initialized)
    GOMP_PLUGIN_fatal ("Agent must be initialized");

  if (!kernel->initialized)
    GOMP_PLUGIN_fatal ("Called kernel must be initialized");

  hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);

  /* Reserve a slot in the AQL queue.  */
  uint64_t index
    = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
  GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);

  /* Wait until the queue is not full before writing the packet.   */
  while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
	 >= command_q->size)
    ;

  /* Do not allow the dimensions to be overridden when running
     constructors or destructors.  */
  int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
  int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;

  hsa_kernel_dispatch_packet_t *packet;
  packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
	   + index % command_q->size;

  /* Skip the first 4 bytes: the header word is written last, with release
     semantics, to hand the packet to the device (see below).  */
  memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
  /* "x ? : y" is the GNU elvis operator: use the override if non-zero.  */
  packet->grid_size_x = override_x ? : kla->gdims[0];
  packet->workgroup_size_x = get_group_size (kla->ndim,
					     packet->grid_size_x,
					     kla->wdims[0]);

  if (kla->ndim >= 2)
    {
      packet->grid_size_y = kla->gdims[1];
      packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
						 kla->wdims[1]);
    }
  else
    {
      packet->grid_size_y = 1;
      packet->workgroup_size_y = 1;
    }

  if (kla->ndim == 3)
    {
      packet->grid_size_z = limit_worker_threads (override_z
						  ? : kla->gdims[2]);
      packet->workgroup_size_z = get_group_size (kla->ndim,
						 packet->grid_size_z,
						 kla->wdims[2]);
    }
  else
    {
      packet->grid_size_z = 1;
      packet->workgroup_size_z = 1;
    }

  GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
	     " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
	     packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
	     packet->grid_size_x / packet->workgroup_size_x,
	     packet->grid_size_y / packet->workgroup_size_y,
	     packet->grid_size_z / packet->workgroup_size_z,
	     packet->workgroup_size_x, packet->workgroup_size_y,
	     packet->workgroup_size_z);

  struct kernel_dispatch *shadow
    = create_kernel_dispatch (kernel, packet->grid_size_x,
			      packet->grid_size_z);
  shadow->queue = command_q;

  if (debug)
    {
      fprintf (stderr, "\nKernel has following dependencies:\n");
      print_kernel_dispatch (shadow, 2);
    }

  packet->private_segment_size = shadow->private_segment_size;
  packet->group_segment_size = shadow->group_segment_size;
  packet->kernel_object = shadow->object;
  packet->kernarg_address = shadow->kernarg_address;
  hsa_signal_t s;
  s.handle = shadow->signal;
  packet->completion_signal = s;
  hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
  memcpy (shadow->kernarg_address, &vars, sizeof (vars));

  GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");

  uint16_t header;
  header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;

  GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
	     agent->device_id);

  /* Writing the header publishes the packet to the device.  */
  packet_store_release ((uint32_t *) packet, header,
			(uint16_t) kla->ndim
			<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);

  /* Ring the doorbell to tell the device about the new packet.  */
  hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
				       index);

  GCN_DEBUG ("Kernel dispatched, waiting for completion\n");

  /* Root signal waits with 1ms timeout.  */
  while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
					     1000 * 1000,
					     HSA_WAIT_STATE_BLOCKED) != 0)
    {
      /* Drain the device's print buffer while the kernel runs; entries
	 may also carry reverse-offload requests that must be serviced
	 for the kernel to make progress.  */
      console_output (kernel, shadow->kernarg_address, false);
    }
  console_output (kernel, shadow->kernarg_address, true);

  struct kernargs *kernargs = shadow->kernarg_address;
  unsigned int return_value = (unsigned int)kernargs->output_data.return_value;

  release_kernel_dispatch (shadow);

  if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
    GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");

  /* Decode the exit protocol: the value was seeded with 0xcafe0000 in
     create_kernel_dispatch; the device overwrites the top half with
     0xffff if exit() was called.  */
  unsigned int upper = (return_value & ~0xffff) >> 16;
  if (upper == 0xcafe)
    ; // exit not called, normal termination.
  else if (upper == 0xffff)
    ; // exit called.
  else
    {
      GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
			 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
			 return_value);
      abort ();
    }

  if (upper == 0xffff)
    {
      /* Byte 1 carries a signal number, byte 0 the exit status.  */
      unsigned int signal = (return_value >> 8) & 0xff;

      if (signal == SIGABRT)
	{
	  GCN_WARNING ("GCN Kernel aborted\n");
	  abort ();
	}
      else if (signal != 0)
	{
	  GCN_WARNING ("GCN Kernel received unknown signal\n");
	  abort ();
	}

      GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
      exit (return_value & 0xff);
    }
}

/* }}}  */
/* {{{ Load/Unload  */

/* Initialize KERNEL from D and other parameters.  Return true on success.  */

static bool
init_basic_kernel_info (struct kernel_info *kernel,
			struct hsa_kernel_description *d,
			struct agent_info *agent,
			struct module_info *module)
{
  kernel->agent = agent;
  kernel->module = module;
  kernel->name = d->name;
  kernel->description = d;
  if (pthread_mutex_init (&kernel->init_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
      return false;
    }
  return true;
}

/* Check that the GCN ISA of the given image matches the ISA of the agent.
*/

static bool
isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
{
  /* ISA recorded in the ELF header flags of the offloaded code object.  */
  int code_isa = elf_gcn_isa_field (image);
  const char *code_isa_name = isa_hsa_name (code_isa);
  if (code_isa_name == NULL)
    {
      hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
      return false;
    }

  if (code_isa == agent->device_isa)
    return true;

  /* Mismatch: report both ISAs and the -march value that would produce a
     matching binary.  */
  char msg[120];
  const char *agent_isa_s = isa_hsa_name (agent->device_isa);
  const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
  assert (agent_isa_s);
  assert (agent_isa_gcc_s);

  snprintf (msg, sizeof msg,
	    "GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
	    "Try to recompile with '-foffload-options=-march=%s'.\n",
	    code_isa_name, agent_isa_s, agent_isa_gcc_s);

  hsa_error (msg, HSA_STATUS_ERROR);
  return false;
}

/* Create and finalize the program consisting of all loaded modules.
*/

static bool
create_and_finalize_hsa_program (struct agent_info *agent)
{
  bool ok = true;
  hsa_status_t status;

  if (pthread_mutex_lock (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
      return false;
    }

  /* Another thread may have finalized the program already.  */
  if (agent->prog_finalized)
    goto final;

  status
    = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
					HSA_EXECUTABLE_STATE_UNFROZEN,
					"", &agent->executable);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not create GCN executable", status);
      goto fail;
    }

  /* Load any GCN modules.  */
  struct module_info *module = agent->module;
  if (module != NULL)
    {
      Elf64_Ehdr *image = (Elf64_Ehdr *) module->image_desc->gcn_image->image;

      if (!isa_matches_agent (agent, image))
	goto fail;

      hsa_code_object_t co = { 0 };
      status = hsa_fns.hsa_code_object_deserialize_fn
	(module->image_desc->gcn_image->image,
	 module->image_desc->gcn_image->size,
	 NULL, &co);
      if (status != HSA_STATUS_SUCCESS)
	{
	  hsa_error ("Could not deserialize GCN code object", status);
	  goto fail;
	}

      status = hsa_fns.hsa_executable_load_code_object_fn
	(agent->executable, agent->id, co, "");
      if (status != HSA_STATUS_SUCCESS)
	{
	  hsa_error ("Could not load GCN code object", status);
	  goto fail;
	}

      if (!module->heap)
	{
	  /* Allocate the device heap once per module, make it writable by
	     the device, and publish its size into the heap header so the
	     device-side allocator knows how much it has.  */
	  status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
						   gcn_kernel_heap_size,
						   (void **) &module->heap);
	  if (status != HSA_STATUS_SUCCESS)
	    {
	      hsa_error ("Could not allocate memory for GCN heap", status);
	      goto fail;
	    }

	  status = hsa_fns.hsa_memory_assign_agent_fn
	    (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
	  if (status != HSA_STATUS_SUCCESS)
	    {
	      hsa_error ("Could not assign GCN heap memory to device", status);
	      goto fail;
	    }

	  hsa_fns.hsa_memory_copy_fn (&module->heap->size,
				      &gcn_kernel_heap_size,
				      sizeof (gcn_kernel_heap_size));
	}
    }

  if (debug)
    dump_executable_symbols (agent->executable);

  status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not freeze the GCN executable", status);
      goto fail;
    }

final:
  /* Note: set even on the failure path, so we do not retry forever.  */
  agent->prog_finalized = true;

  if (pthread_mutex_unlock (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
      ok = false;
    }

  return ok;

fail:
  ok = false;
  goto final;
}

/* Free the HSA program in agent and everything associated with it and set
   agent->prog_finalized and the initialized flags of all kernels to false.
   Return TRUE on success.
*/ 2630 1.1 mrg 2631 1.1 mrg static bool 2632 1.1 mrg destroy_hsa_program (struct agent_info *agent) 2633 1.1 mrg { 2634 1.1 mrg if (!agent->prog_finalized) 2635 1.1 mrg return true; 2636 1.1 mrg 2637 1.1 mrg hsa_status_t status; 2638 1.1 mrg 2639 1.1 mrg GCN_DEBUG ("Destroying the current GCN program.\n"); 2640 1.1 mrg 2641 1.1 mrg status = hsa_fns.hsa_executable_destroy_fn (agent->executable); 2642 1.1 mrg if (status != HSA_STATUS_SUCCESS) 2643 1.1 mrg return hsa_error ("Could not destroy GCN executable", status); 2644 1.1 mrg 2645 1.1 mrg if (agent->module) 2646 1.1 mrg { 2647 1.1 mrg int i; 2648 1.1 mrg for (i = 0; i < agent->module->kernel_count; i++) 2649 1.1 mrg agent->module->kernels[i].initialized = false; 2650 1.1 mrg 2651 1.1 mrg if (agent->module->heap) 2652 1.1 mrg { 2653 1.1 mrg hsa_fns.hsa_memory_free_fn (agent->module->heap); 2654 1.1 mrg agent->module->heap = NULL; 2655 1.1 mrg } 2656 1.1 mrg } 2657 1.1 mrg agent->prog_finalized = false; 2658 1.1 mrg return true; 2659 1.1 mrg } 2660 1.1 mrg 2661 1.1 mrg /* Deinitialize all information associated with MODULE and kernels within 2662 1.1 mrg it. Return TRUE on success. */ 2663 1.1 mrg 2664 1.1 mrg static bool 2665 1.1 mrg destroy_module (struct module_info *module, bool locked) 2666 1.1 mrg { 2667 1.1 mrg /* Run destructors before destroying module. */ 2668 1.1 mrg struct GOMP_kernel_launch_attributes kla = 2669 1.1 mrg { 3, 2670 1.1 mrg /* Grid size. */ 2671 1.1 mrg { 1, 64, 1 }, 2672 1.1 mrg /* Work-group size. 
*/ 2673 1.1 mrg { 1, 64, 1 } 2674 1.1 mrg }; 2675 1.1 mrg 2676 1.1 mrg if (module->fini_array_func) 2677 1.1 mrg { 2678 1.1 mrg init_kernel (module->fini_array_func); 2679 1.1 mrg run_kernel (module->fini_array_func, NULL, &kla, NULL, locked); 2680 1.1 mrg } 2681 1.1 mrg module->constructors_run_p = false; 2682 1.1 mrg 2683 1.1 mrg int i; 2684 1.1 mrg for (i = 0; i < module->kernel_count; i++) 2685 1.1 mrg if (pthread_mutex_destroy (&module->kernels[i].init_mutex)) 2686 1.1 mrg { 2687 1.1 mrg GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization " 2688 1.1 mrg "mutex"); 2689 1.1 mrg return false; 2690 1.1 mrg } 2691 1.1 mrg 2692 1.1 mrg return true; 2693 1.1 mrg } 2694 1.1 mrg 2695 1.1 mrg /* }}} */ 2696 1.1 mrg /* {{{ Async */ 2697 1.1 mrg 2698 1.1 mrg /* Callback of dispatch queues to report errors. */ 2699 1.1 mrg 2700 1.1 mrg static void 2701 1.1 mrg execute_queue_entry (struct goacc_asyncqueue *aq, int index) 2702 1.1 mrg { 2703 1.1 mrg struct queue_entry *entry = &aq->queue[index]; 2704 1.1 mrg 2705 1.1 mrg switch (entry->type) 2706 1.1 mrg { 2707 1.1 mrg case KERNEL_LAUNCH: 2708 1.1 mrg if (DEBUG_QUEUES) 2709 1.1 mrg GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n", 2710 1.1 mrg aq->agent->device_id, aq->id, index); 2711 1.1 mrg run_kernel (entry->u.launch.kernel, 2712 1.1 mrg entry->u.launch.vars, 2713 1.1 mrg &entry->u.launch.kla, aq, false); 2714 1.1 mrg if (DEBUG_QUEUES) 2715 1.1 mrg GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n", 2716 1.1 mrg aq->agent->device_id, aq->id, index); 2717 1.1 mrg break; 2718 1.1 mrg 2719 1.1 mrg case CALLBACK: 2720 1.1 mrg if (DEBUG_QUEUES) 2721 1.1 mrg GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n", 2722 1.1 mrg aq->agent->device_id, aq->id, index); 2723 1.1 mrg entry->u.callback.fn (entry->u.callback.data); 2724 1.1 mrg if (DEBUG_QUEUES) 2725 1.1 mrg GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n", 2726 1.1 mrg aq->agent->device_id, 
aq->id, index); 2727 1.1 mrg break; 2728 1.1 mrg 2729 1.1 mrg case ASYNC_WAIT: 2730 1.1 mrg { 2731 1.1 mrg /* FIXME: is it safe to access a placeholder that may already have 2732 1.1 mrg been executed? */ 2733 1.1 mrg struct placeholder *placeholderp = entry->u.asyncwait.placeholderp; 2734 1.1 mrg 2735 1.1 mrg if (DEBUG_QUEUES) 2736 1.1 mrg GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n", 2737 1.1 mrg aq->agent->device_id, aq->id, index); 2738 1.1 mrg 2739 1.1 mrg pthread_mutex_lock (&placeholderp->mutex); 2740 1.1 mrg 2741 1.1 mrg while (!placeholderp->executed) 2742 1.1 mrg pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex); 2743 1.1 mrg 2744 1.1 mrg pthread_mutex_unlock (&placeholderp->mutex); 2745 1.1 mrg 2746 1.1 mrg if (pthread_cond_destroy (&placeholderp->cond)) 2747 1.1 mrg GOMP_PLUGIN_error ("Failed to destroy serialization cond"); 2748 1.1 mrg 2749 1.1 mrg if (pthread_mutex_destroy (&placeholderp->mutex)) 2750 1.1 mrg GOMP_PLUGIN_error ("Failed to destroy serialization mutex"); 2751 1.1 mrg 2752 1.1 mrg if (DEBUG_QUEUES) 2753 1.1 mrg GCN_DEBUG ("Async thread %d:%d: Executing async wait " 2754 1.1 mrg "entry (%d) done\n", aq->agent->device_id, aq->id, index); 2755 1.1 mrg } 2756 1.1 mrg break; 2757 1.1 mrg 2758 1.1 mrg case ASYNC_PLACEHOLDER: 2759 1.1 mrg pthread_mutex_lock (&entry->u.placeholder.mutex); 2760 1.1 mrg entry->u.placeholder.executed = 1; 2761 1.1 mrg pthread_cond_signal (&entry->u.placeholder.cond); 2762 1.1 mrg pthread_mutex_unlock (&entry->u.placeholder.mutex); 2763 1.1 mrg break; 2764 1.1 mrg 2765 1.1 mrg default: 2766 1.1 mrg GOMP_PLUGIN_fatal ("Unknown queue element"); 2767 1.1 mrg } 2768 1.1 mrg } 2769 1.1 mrg 2770 1.1 mrg /* This function is run as a thread to service an async queue in the 2771 1.1 mrg background. It runs continuously until the stop flag is set. 
*/ 2772 1.1 mrg 2773 1.1 mrg static void * 2774 1.1 mrg drain_queue (void *thread_arg) 2775 1.1 mrg { 2776 1.1 mrg struct goacc_asyncqueue *aq = thread_arg; 2777 1.1 mrg 2778 1.1 mrg if (DRAIN_QUEUE_SYNCHRONOUS_P) 2779 1.1 mrg { 2780 1.1 mrg aq->drain_queue_stop = 2; 2781 1.1 mrg return NULL; 2782 1.1 mrg } 2783 1.1 mrg 2784 1.1 mrg pthread_mutex_lock (&aq->mutex); 2785 1.1 mrg 2786 1.1 mrg while (true) 2787 1.1 mrg { 2788 1.1 mrg if (aq->drain_queue_stop) 2789 1.1 mrg break; 2790 1.1 mrg 2791 1.1 mrg if (aq->queue_n > 0) 2792 1.1 mrg { 2793 1.1 mrg pthread_mutex_unlock (&aq->mutex); 2794 1.1 mrg execute_queue_entry (aq, aq->queue_first); 2795 1.1 mrg 2796 1.1 mrg pthread_mutex_lock (&aq->mutex); 2797 1.1 mrg aq->queue_first = ((aq->queue_first + 1) 2798 1.1 mrg % ASYNC_QUEUE_SIZE); 2799 1.1 mrg aq->queue_n--; 2800 1.1 mrg 2801 1.1 mrg if (DEBUG_THREAD_SIGNAL) 2802 1.1 mrg GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n", 2803 1.1 mrg aq->agent->device_id, aq->id); 2804 1.1 mrg pthread_cond_broadcast (&aq->queue_cond_out); 2805 1.1 mrg pthread_mutex_unlock (&aq->mutex); 2806 1.1 mrg 2807 1.1 mrg if (DEBUG_QUEUES) 2808 1.1 mrg GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id, 2809 1.1 mrg aq->id); 2810 1.1 mrg pthread_mutex_lock (&aq->mutex); 2811 1.1 mrg } 2812 1.1 mrg else 2813 1.1 mrg { 2814 1.1 mrg if (DEBUG_THREAD_SLEEP) 2815 1.1 mrg GCN_DEBUG ("Async thread %d:%d: going to sleep\n", 2816 1.1 mrg aq->agent->device_id, aq->id); 2817 1.1 mrg pthread_cond_wait (&aq->queue_cond_in, &aq->mutex); 2818 1.1 mrg if (DEBUG_THREAD_SLEEP) 2819 1.1 mrg GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n", 2820 1.1 mrg aq->agent->device_id, aq->id); 2821 1.1 mrg } 2822 1.1 mrg } 2823 1.1 mrg 2824 1.1 mrg aq->drain_queue_stop = 2; 2825 1.1 mrg if (DEBUG_THREAD_SIGNAL) 2826 1.1 mrg GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n", 2827 1.1 mrg aq->agent->device_id, aq->id); 2828 1.1 mrg pthread_cond_broadcast 
(&aq->queue_cond_out); 2829 1.1 mrg pthread_mutex_unlock (&aq->mutex); 2830 1.1 mrg 2831 1.1 mrg GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id); 2832 1.1 mrg return NULL; 2833 1.1 mrg } 2834 1.1 mrg 2835 1.1 mrg /* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which 2836 1.1 mrg is not usually the case. This is just a debug tool. */ 2837 1.1 mrg 2838 1.1 mrg static void 2839 1.1 mrg drain_queue_synchronous (struct goacc_asyncqueue *aq) 2840 1.1 mrg { 2841 1.1 mrg pthread_mutex_lock (&aq->mutex); 2842 1.1 mrg 2843 1.1 mrg while (aq->queue_n > 0) 2844 1.1 mrg { 2845 1.1 mrg execute_queue_entry (aq, aq->queue_first); 2846 1.1 mrg 2847 1.1 mrg aq->queue_first = ((aq->queue_first + 1) 2848 1.1 mrg % ASYNC_QUEUE_SIZE); 2849 1.1 mrg aq->queue_n--; 2850 1.1 mrg } 2851 1.1 mrg 2852 1.1 mrg pthread_mutex_unlock (&aq->mutex); 2853 1.1 mrg } 2854 1.1 mrg 2855 1.1 mrg /* Block the current thread until an async queue is writable. The aq->mutex 2856 1.1 mrg lock should be held on entry, and remains locked on exit. */ 2857 1.1 mrg 2858 1.1 mrg static void 2859 1.1 mrg wait_for_queue_nonfull (struct goacc_asyncqueue *aq) 2860 1.1 mrg { 2861 1.1 mrg if (aq->queue_n == ASYNC_QUEUE_SIZE) 2862 1.1 mrg { 2863 1.1 mrg /* Queue is full. Wait for it to not be full. */ 2864 1.1 mrg while (aq->queue_n == ASYNC_QUEUE_SIZE) 2865 1.1 mrg pthread_cond_wait (&aq->queue_cond_out, &aq->mutex); 2866 1.1 mrg } 2867 1.1 mrg } 2868 1.1 mrg 2869 1.1 mrg /* Request an asynchronous kernel launch on the specified queue. This 2870 1.1 mrg may block if the queue is full, but returns without waiting for the 2871 1.1 mrg kernel to run. 
*/

static void
queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
		   void *vars, struct GOMP_kernel_launch_attributes *kla)
{
  assert (aq->agent == kernel->agent);

  pthread_mutex_lock (&aq->mutex);

  wait_for_queue_nonfull (aq);

  /* Fill the next free slot (circular buffer).  */
  int slot = ((aq->queue_first + aq->queue_n)
	      % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
	       aq->id, slot);

  struct queue_entry *e = &aq->queue[slot];
  e->type = KERNEL_LAUNCH;
  e->u.launch.kernel = kernel;
  e->u.launch.vars = vars;
  e->u.launch.kla = *kla;

  aq->queue_n++;

  /* Wake the servicing thread if it is asleep.  */
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);
}

/* Request an asynchronous callback on the specified queue.  The callback
   function will be called, with the given opaque data, from the appropriate
   async thread, when all previous items on that queue are complete.
*/

static void
queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
		     void *data)
{
  pthread_mutex_lock (&aq->mutex);

  wait_for_queue_nonfull (aq);

  /* Fill the next free slot (circular buffer).  */
  int slot = ((aq->queue_first + aq->queue_n)
	      % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
	       aq->id, slot);

  struct queue_entry *e = &aq->queue[slot];
  e->type = CALLBACK;
  e->u.callback.fn = fn;
  e->u.callback.data = data;

  aq->queue_n++;

  /* Wake the servicing thread if it is asleep.  */
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);
}

/* Request that a given async thread wait for another thread (unspecified) to
   reach the given placeholder.  The wait will occur when all previous entries
   on the queue are complete.  A placeholder is effectively a kind of signal
   which simply sets a flag when encountered in a queue.
*/

static void
queue_push_asyncwait (struct goacc_asyncqueue *aq,
		      struct placeholder *placeholderp)
{
  pthread_mutex_lock (&aq->mutex);

  wait_for_queue_nonfull (aq);

  /* Fill the next free slot (circular buffer).  */
  int slot = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
	       aq->id, slot);

  aq->queue[slot].type = ASYNC_WAIT;
  aq->queue[slot].u.asyncwait.placeholderp = placeholderp;

  aq->queue_n++;

  /* Wake the servicing thread if it is asleep.  */
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);
}

/* Add a placeholder into an async queue.  When the async thread reaches the
   placeholder it will set the "executed" flag to true and continue.
   Another thread may be waiting on this thread reaching the placeholder.
*/

static struct placeholder *
queue_push_placeholder (struct goacc_asyncqueue *aq)
{
  struct placeholder *placeholderp;

  pthread_mutex_lock (&aq->mutex);

  wait_for_queue_nonfull (aq);

  /* Fill the next free slot (circular buffer).  */
  int slot = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
	       aq->id, slot);

  aq->queue[slot].type = ASYNC_PLACEHOLDER;
  placeholderp = &aq->queue[slot].u.placeholder;

  /* The placeholder owns a private mutex/cond pair; ASYNC_WAIT entries on
     other queues block on these until "executed" is set.  */
  if (pthread_mutex_init (&placeholderp->mutex, NULL))
    {
      pthread_mutex_unlock (&aq->mutex);
      GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
    }

  if (pthread_cond_init (&placeholderp->cond, NULL))
    {
      pthread_mutex_unlock (&aq->mutex);
      GOMP_PLUGIN_error ("Failed to initialize serialization cond");
    }

  placeholderp->executed = 0;

  aq->queue_n++;

  /* Wake the servicing thread if it is asleep.  */
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);

  return placeholderp;
}

/* Signal an asynchronous thread to terminate, and wait for it to do so.
*/

static void
finalize_async_thread (struct goacc_asyncqueue *aq)
{
  pthread_mutex_lock (&aq->mutex);
  if (aq->drain_queue_stop == 2)
    {
      /* The thread has already drained and flagged itself finished.  */
      pthread_mutex_unlock (&aq->mutex);
      return;
    }

  /* Ask the drain thread to stop, and wake it if it is asleep.  */
  aq->drain_queue_stop = 1;

  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  /* Wait until it acknowledges by setting drain_queue_stop to 2.  */
  while (aq->drain_queue_stop != 2)
    {
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
		   " to sleep\n", aq->agent->device_id, aq->id);
      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
		   aq->agent->device_id, aq->id);
    }

  GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
	     aq->id);
  pthread_mutex_unlock (&aq->mutex);

  int err = pthread_join (aq->thread_drain_queue, NULL);
  if (err != 0)
    GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
		       aq->agent->device_id, aq->id, strerror (err));
  GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
}

/* Set up an async queue for OpenMP.  There will be only one.  The
   implementation simply uses an OpenACC async queue.
   FIXME: is this thread-safe if two threads call this function?
*/ 3058 1.1 mrg 3059 1.1 mrg static void 3060 1.1 mrg maybe_init_omp_async (struct agent_info *agent) 3061 1.1 mrg { 3062 1.1 mrg if (!agent->omp_async_queue) 3063 1.1 mrg agent->omp_async_queue 3064 1.1 mrg = GOMP_OFFLOAD_openacc_async_construct (agent->device_id); 3065 1.1 mrg } 3066 1.1 mrg 3067 1.1 mrg /* A wrapper that works around an issue in the HSA runtime with host-to-device 3068 1.1 mrg copies from read-only pages. */ 3069 1.1 mrg 3070 1.1 mrg static void 3071 1.1 mrg hsa_memory_copy_wrapper (void *dst, const void *src, size_t len) 3072 1.1 mrg { 3073 1.1 mrg hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len); 3074 1.1 mrg 3075 1.1 mrg if (status == HSA_STATUS_SUCCESS) 3076 1.1 mrg return; 3077 1.1 mrg 3078 1.1 mrg /* It appears that the copy fails if the source data is in a read-only page. 3079 1.1 mrg We can't detect that easily, so try copying the data to a temporary buffer 3080 1.1 mrg and doing the copy again if we got an error above. */ 3081 1.1 mrg 3082 1.1 mrg GCN_WARNING ("Read-only data transfer bug workaround triggered for " 3083 1.1 mrg "[%p:+%d]\n", (void *) src, (int) len); 3084 1.1 mrg 3085 1.1 mrg void *src_copy = malloc (len); 3086 1.1 mrg memcpy (src_copy, src, len); 3087 1.1 mrg status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len); 3088 1.1 mrg free (src_copy); 3089 1.1 mrg if (status != HSA_STATUS_SUCCESS) 3090 1.1 mrg GOMP_PLUGIN_error ("memory copy failed"); 3091 1.1 mrg } 3092 1.1 mrg 3093 1.1 mrg /* Copy data to or from a device. This is intended for use as an async 3094 1.1 mrg callback event. 
*/ 3095 1.1 mrg 3096 1.1 mrg static void 3097 1.1 mrg copy_data (void *data_) 3098 1.1 mrg { 3099 1.1 mrg struct copy_data *data = (struct copy_data *)data_; 3100 1.1 mrg GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n", 3101 1.1 mrg data->aq->agent->device_id, data->aq->id, data->len, data->src, 3102 1.1 mrg data->dst); 3103 1.1 mrg hsa_memory_copy_wrapper (data->dst, data->src, data->len); 3104 1.1 mrg free (data); 3105 1.1 mrg } 3106 1.1 mrg 3107 1.1 mrg /* Request an asynchronous data copy, to or from a device, on a given queue. 3108 1.1.1.2 mrg The event will be registered as a callback. */ 3109 1.1 mrg 3110 1.1 mrg static void 3111 1.1 mrg queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src, 3112 1.1.1.2 mrg size_t len) 3113 1.1 mrg { 3114 1.1 mrg if (DEBUG_QUEUES) 3115 1.1 mrg GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n", 3116 1.1 mrg aq->agent->device_id, aq->id, len, src, dst); 3117 1.1 mrg struct copy_data *data 3118 1.1 mrg = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data)); 3119 1.1 mrg data->dst = dst; 3120 1.1 mrg data->src = src; 3121 1.1 mrg data->len = len; 3122 1.1 mrg data->aq = aq; 3123 1.1 mrg queue_push_callback (aq, copy_data, data); 3124 1.1 mrg } 3125 1.1 mrg 3126 1.1 mrg /* Return true if the given queue is currently empty. */ 3127 1.1 mrg 3128 1.1 mrg static int 3129 1.1 mrg queue_empty (struct goacc_asyncqueue *aq) 3130 1.1 mrg { 3131 1.1 mrg pthread_mutex_lock (&aq->mutex); 3132 1.1 mrg int res = aq->queue_n == 0 ? 1 : 0; 3133 1.1 mrg pthread_mutex_unlock (&aq->mutex); 3134 1.1 mrg 3135 1.1 mrg return res; 3136 1.1 mrg } 3137 1.1 mrg 3138 1.1 mrg /* Wait for a given queue to become empty. This implements an OpenACC wait 3139 1.1 mrg directive. 
*/ 3140 1.1 mrg 3141 1.1 mrg static void 3142 1.1 mrg wait_queue (struct goacc_asyncqueue *aq) 3143 1.1 mrg { 3144 1.1 mrg if (DRAIN_QUEUE_SYNCHRONOUS_P) 3145 1.1 mrg { 3146 1.1 mrg drain_queue_synchronous (aq); 3147 1.1 mrg return; 3148 1.1 mrg } 3149 1.1 mrg 3150 1.1 mrg pthread_mutex_lock (&aq->mutex); 3151 1.1 mrg 3152 1.1 mrg while (aq->queue_n > 0) 3153 1.1 mrg { 3154 1.1 mrg if (DEBUG_THREAD_SLEEP) 3155 1.1 mrg GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n", 3156 1.1 mrg aq->agent->device_id, aq->id); 3157 1.1 mrg pthread_cond_wait (&aq->queue_cond_out, &aq->mutex); 3158 1.1 mrg if (DEBUG_THREAD_SLEEP) 3159 1.1 mrg GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq->agent->device_id, 3160 1.1 mrg aq->id); 3161 1.1 mrg } 3162 1.1 mrg 3163 1.1 mrg pthread_mutex_unlock (&aq->mutex); 3164 1.1 mrg GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id); 3165 1.1 mrg } 3166 1.1 mrg 3167 1.1 mrg /* }}} */ 3168 1.1 mrg /* {{{ OpenACC support */ 3169 1.1 mrg 3170 1.1 mrg /* Execute an OpenACC kernel, synchronously or asynchronously. */ 3171 1.1 mrg 3172 1.1 mrg static void 3173 1.1.1.3 mrg gcn_exec (struct kernel_info *kernel, 3174 1.1 mrg void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async, 3175 1.1 mrg struct goacc_asyncqueue *aq) 3176 1.1 mrg { 3177 1.1 mrg if (!GOMP_OFFLOAD_can_run (kernel)) 3178 1.1 mrg GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented."); 3179 1.1 mrg 3180 1.1 mrg /* If we get here then this must be an OpenACC kernel. 
*/ 3181 1.1 mrg kernel->kind = KIND_OPENACC; 3182 1.1 mrg 3183 1.1 mrg struct hsa_kernel_description *hsa_kernel_desc = NULL; 3184 1.1 mrg for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++) 3185 1.1 mrg { 3186 1.1 mrg struct hsa_kernel_description *d 3187 1.1 mrg = &kernel->module->image_desc->kernel_infos[i]; 3188 1.1 mrg if (d->name == kernel->name) 3189 1.1 mrg { 3190 1.1 mrg hsa_kernel_desc = d; 3191 1.1 mrg break; 3192 1.1 mrg } 3193 1.1 mrg } 3194 1.1 mrg 3195 1.1 mrg /* We may have statically-determined dimensions in 3196 1.1 mrg hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel 3197 1.1 mrg invocation at runtime in dims[]. We allow static dimensions to take 3198 1.1 mrg priority over dynamic dimensions when present (non-zero). */ 3199 1.1 mrg if (hsa_kernel_desc->oacc_dims[0] > 0) 3200 1.1 mrg dims[0] = hsa_kernel_desc->oacc_dims[0]; 3201 1.1 mrg if (hsa_kernel_desc->oacc_dims[1] > 0) 3202 1.1 mrg dims[1] = hsa_kernel_desc->oacc_dims[1]; 3203 1.1 mrg if (hsa_kernel_desc->oacc_dims[2] > 0) 3204 1.1 mrg dims[2] = hsa_kernel_desc->oacc_dims[2]; 3205 1.1 mrg 3206 1.1.1.2 mrg /* Ideally, when a dimension isn't explicitly specified, we should 3207 1.1.1.2 mrg tune it to run 40 (or 32?) threads per CU with no threads getting queued. 3208 1.1.1.2 mrg In practice, we tune for peak performance on BabelStream, which 3209 1.1.1.2 mrg for OpenACC is currently 32 threads per CU. */ 3210 1.1.1.2 mrg if (dims[0] == 0 && dims[1] == 0) 3211 1.1.1.2 mrg { 3212 1.1.1.2 mrg /* If any of the OpenACC dimensions remain 0 then we get to pick a 3213 1.1.1.2 mrg number. There isn't really a correct answer for this without a clue 3214 1.1.1.2 mrg about the problem size, so let's do a reasonable number of workers 3215 1.1.1.2 mrg and gangs. */ 3216 1.1.1.2 mrg 3217 1.1.1.2 mrg dims[0] = get_cu_count (kernel->agent) * 4; /* Gangs. */ 3218 1.1.1.2 mrg dims[1] = 8; /* Workers. 
*/ 3219 1.1.1.2 mrg } 3220 1.1.1.2 mrg else if (dims[0] == 0 && dims[1] > 0) 3221 1.1.1.2 mrg { 3222 1.1.1.2 mrg /* Auto-scale the number of gangs with the requested number of workers. */ 3223 1.1.1.2 mrg dims[0] = get_cu_count (kernel->agent) * (32 / dims[1]); 3224 1.1.1.2 mrg } 3225 1.1.1.2 mrg else if (dims[0] > 0 && dims[1] == 0) 3226 1.1.1.2 mrg { 3227 1.1.1.2 mrg /* Auto-scale the number of workers with the requested number of gangs. */ 3228 1.1.1.2 mrg dims[1] = get_cu_count (kernel->agent) * 32 / dims[0]; 3229 1.1.1.2 mrg if (dims[1] == 0) 3230 1.1.1.2 mrg dims[1] = 1; 3231 1.1.1.2 mrg if (dims[1] > 16) 3232 1.1.1.2 mrg dims[1] = 16; 3233 1.1.1.2 mrg } 3234 1.1 mrg 3235 1.1 mrg /* The incoming dimensions are expressed in terms of gangs, workers, and 3236 1.1 mrg vectors. The HSA dimensions are expressed in terms of "work-items", 3237 1.1 mrg which means multiples of vector lanes. 3238 1.1 mrg 3239 1.1 mrg The "grid size" specifies the size of the problem space, and the 3240 1.1 mrg "work-group size" specifies how much of that we want a single compute 3241 1.1 mrg unit to chew on at once. 3242 1.1 mrg 3243 1.1 mrg The three dimensions do not really correspond to hardware, but the 3244 1.1 mrg important thing is that the HSA runtime will launch as many 3245 1.1 mrg work-groups as it takes to process the entire grid, and each 3246 1.1 mrg work-group will contain as many wave-fronts as it takes to process 3247 1.1 mrg the work-items in that group. 3248 1.1 mrg 3249 1.1 mrg Essentially, as long as we set the Y dimension to 64 (the number of 3250 1.1 mrg vector lanes in hardware), and the Z group size to the maximum (16), 3251 1.1 mrg then we will get the gangs (X) and workers (Z) launched as we expect. 3252 1.1 mrg 3253 1.1 mrg The reason for the apparent reversal of vector and worker dimension 3254 1.1 mrg order is to do with the way the run-time distributes work-items across 3255 1.1 mrg v1 and v2. 
*/ 3256 1.1 mrg struct GOMP_kernel_launch_attributes kla = 3257 1.1 mrg {3, 3258 1.1 mrg /* Grid size. */ 3259 1.1 mrg {dims[0], 64, dims[1]}, 3260 1.1 mrg /* Work-group size. */ 3261 1.1 mrg {1, 64, 16} 3262 1.1 mrg }; 3263 1.1 mrg 3264 1.1 mrg struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread (); 3265 1.1 mrg acc_prof_info *prof_info = thr->prof_info; 3266 1.1 mrg acc_event_info enqueue_launch_event_info; 3267 1.1 mrg acc_api_info *api_info = thr->api_info; 3268 1.1 mrg bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false); 3269 1.1 mrg if (profiling_dispatch_p) 3270 1.1 mrg { 3271 1.1 mrg prof_info->event_type = acc_ev_enqueue_launch_start; 3272 1.1 mrg 3273 1.1 mrg enqueue_launch_event_info.launch_event.event_type 3274 1.1 mrg = prof_info->event_type; 3275 1.1 mrg enqueue_launch_event_info.launch_event.valid_bytes 3276 1.1 mrg = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES; 3277 1.1 mrg enqueue_launch_event_info.launch_event.parent_construct 3278 1.1 mrg = acc_construct_parallel; 3279 1.1 mrg enqueue_launch_event_info.launch_event.implicit = 1; 3280 1.1 mrg enqueue_launch_event_info.launch_event.tool_info = NULL; 3281 1.1 mrg enqueue_launch_event_info.launch_event.kernel_name 3282 1.1 mrg = (char *) kernel->name; 3283 1.1 mrg enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0]; 3284 1.1 mrg enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2]; 3285 1.1 mrg enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1]; 3286 1.1 mrg 3287 1.1 mrg api_info->device_api = acc_device_api_other; 3288 1.1 mrg 3289 1.1 mrg GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, 3290 1.1 mrg &enqueue_launch_event_info, api_info); 3291 1.1 mrg } 3292 1.1 mrg 3293 1.1 mrg if (!async) 3294 1.1.1.3 mrg run_kernel (kernel, devaddrs, &kla, NULL, false); 3295 1.1 mrg else 3296 1.1.1.3 mrg queue_push_launch (aq, kernel, devaddrs, &kla); 3297 1.1 mrg 3298 1.1 mrg if (profiling_dispatch_p) 3299 1.1 mrg { 3300 1.1 mrg prof_info->event_type = 
      acc_ev_enqueue_launch_end;
      enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
					    &enqueue_launch_event_info,
					    api_info);
    }
}

/* }}}  */
/* {{{ Generic Plugin API  */

/* Return the name of the accelerator, which is "gcn".  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "gcn";
}

/* Return the specific capabilities the HSA accelerator have.  */

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  /* FIXME: Enable shared memory for APU, but not discrete GPU.  */
  return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
	    | GOMP_OFFLOAD_CAP_OPENACC_200;
}

/* Identify as GCN accelerator.  */

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_GCN;
}

/* Return the libgomp version number we're compatible with.  There is
   no requirement for cross-version compatibility.  */

unsigned
GOMP_OFFLOAD_version (void)
{
  return GOMP_VERSION;
}

/* Return the number of GCN devices on the system.

   OMP_REQUIRES_MASK carries the GOMP_REQUIRES_* flags collected from the
   program's "omp requires" directives.  */

int
GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
{
  /* Failure to bring up the HSA runtime is treated as fatal here: no GCN
     device can ever become usable in that case.  */
  if (!init_hsa_context (true))
    exit (EXIT_FAILURE);
  /* Return -1 if the omp_requires_mask cannot be fulfilled even though
     devices are present; only the unified-address and reverse-offload
     requirements are supported by this plugin.  */
  if (hsa_context.agent_count > 0
      && ((omp_requires_mask
	   & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
	       | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
    return -1;
  return hsa_context.agent_count;
}

/* Initialize device (agent) number N so that it can be used for computation.
   Return TRUE on success.

   This sets up the agent's locks, queries its properties (queue size, name,
   ISA, vendor), creates the synchronous command queue, and selects the
   memory regions used for kernel arguments and device data.  Idempotent:
   returns immediately if the agent was already initialized.  */

bool
GOMP_OFFLOAD_init_device (int n)
{
  if (!init_hsa_context (false))
    return false;
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
      return false;
    }
  struct agent_info *agent = &hsa_context.agents[n];

  if (agent->initialized)
    return true;

  agent->device_id = n;

  /* Create the locks protecting the agent's module, program, async queues
     and ephemeral-memory list.  */
  if (pthread_rwlock_init (&agent->module_rwlock, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
      return false;
    }
  if (pthread_mutex_init (&agent->prog_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
      return false;
    }
  if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
      return false;
    }
  if (pthread_mutex_init (&agent->ephemeral_memories_write_lock, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
      return false;
    }
  agent->async_queues = NULL;
  agent->omp_async_queue = NULL;
  agent->ephemeral_memories_list = NULL;

  /* Query the agent's properties needed to create the command queue.  */
  uint32_t queue_size;
  hsa_status_t status;
  status = hsa_fns.hsa_agent_get_info_fn (agent->id,
					  HSA_AGENT_INFO_QUEUE_MAX_SIZE,
					  &queue_size);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error requesting maximum queue size of the GCN agent",
		      status);

  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
					  &agent->name);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error querying the name of the agent", status);

  /* Map the agent name to an ELF ISA code; unknown devices are rejected.  */
  agent->device_isa = isa_code (agent->name);
  if (agent->device_isa == EF_AMDGPU_MACH_UNSUPPORTED)
    return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);

  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
					  &agent->vendor_name);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error querying the vendor name of the agent", status);

  /* The synchronous queue is used for all non-async kernel launches.  */
  status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
					HSA_QUEUE_TYPE_MULTI,
					hsa_queue_callback, NULL, UINT32_MAX,
					UINT32_MAX, &agent->sync_queue);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error creating command queue", status);

  /* Find a memory region suitable for kernel arguments.  The iterator
     callback fills in kernarg_region; the handle stays (uint64_t) -1 if
     nothing suitable was found.  */
  agent->kernarg_region.handle = (uint64_t) -1;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
						 get_kernarg_memory_region,
						 &agent->kernarg_region);
  if (status != HSA_STATUS_SUCCESS
      && status != HSA_STATUS_INFO_BREAK)
    hsa_error ("Scanning memory regions failed", status);
  if (agent->kernarg_region.handle == (uint64_t) -1)
    {
      GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
			 "arguments");
      return false;
    }
  GCN_DEBUG ("Selected kernel arguments memory region:\n");
  dump_hsa_region (agent->kernarg_region, NULL);

  /* Likewise, find a memory region suitable for device data.  */
  agent->data_region.handle = (uint64_t) -1;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
						 get_data_memory_region,
						 &agent->data_region);
  if (status != HSA_STATUS_SUCCESS
      && status != HSA_STATUS_INFO_BREAK)
    hsa_error ("Scanning memory regions failed", status);
  if (agent->data_region.handle == (uint64_t) -1)
    {
      GOMP_PLUGIN_error ("Could not find suitable memory region for device "
			 "data");
      return false;
    }
  GCN_DEBUG ("Selected device data memory region:\n");
  dump_hsa_region (agent->data_region, NULL);

  GCN_DEBUG ("GCN agent %d initialized\n", n);

  agent->initialized = true;
  return true;
}

/* Load GCN object-code module described by struct gcn_image_desc in
   TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
   If there are any constructors then run them.  If not NULL, REV_FN_TABLE will
   contain the on-device addresses of the functions for reverse offload.  To be
   freed by the caller.
*/

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
			 struct addr_pair **target_table,
			 uint64_t **rev_fn_table,
			 uint64_t *host_ind_fn_table)
{
  /* Reject images built for a different plugin ABI revision.  */
  if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
      return -1;
    }

  struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
  struct agent_info *agent;
  struct addr_pair *pair;
  struct module_info *module;
  struct kernel_info *kernel;
  int kernel_count = image_desc->kernel_count;
  /* Older images do not carry an indirect-function table at all.  */
  unsigned ind_func_count = GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version)
			      ? image_desc->ind_func_count : 0;
  unsigned var_count = image_desc->global_variable_count;
  /* Currently, "others" is a struct of ICVS.  */
  int other_count = 1;

  agent = get_agent_info (ord);
  if (!agent)
    return -1;

  if (pthread_rwlock_wrlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
      return -1;
    }
  /* Reloading an image replaces any previously finalized program.  */
  if (agent->prog_finalized
      && !destroy_hsa_program (agent))
    return -1;

  GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
  GCN_DEBUG ("Encountered %d indirect functions in an image\n", ind_func_count);
  GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
  GCN_DEBUG ("Expect %d other variables in an image\n", other_count);
  /* NOTE(review): the "- 2" reserves no table slots for the _init_array and
     _fini_array pseudo-kernels (skipped in the loop below).  This looks like
     it assumes both are always present in the image; if one were missing the
     table would be under-allocated -- confirm against the compiler side.  */
  pair = GOMP_PLUGIN_malloc ((kernel_count + var_count + other_count - 2)
			     * sizeof (struct addr_pair));
  *target_table = pair;
  module = (struct module_info *)
    GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
				+ kernel_count * sizeof (struct kernel_info));
  module->image_desc = image_desc;
  module->kernel_count = kernel_count;
  module->heap = NULL;
  module->constructors_run_p = false;

  kernel = &module->kernels[0];

  /* Allocate memory for kernel dependencies.
     Constructor/destructor pseudo-kernels are remembered on the module but
     not reported to libgomp; everything else gets an addr_pair entry.  */
  for (unsigned i = 0; i < kernel_count; i++)
    {
      struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
      if (!init_basic_kernel_info (kernel, d, agent, module))
	return -1;
      if (strcmp (d->name, "_init_array") == 0)
	module->init_array_func = kernel;
      else if (strcmp (d->name, "_fini_array") == 0)
	module->fini_array_func = kernel;
      else
	{
	  pair->start = (uintptr_t) kernel;
	  pair->end = (uintptr_t) (kernel + 1);
	  pair++;
	}
      kernel++;
    }

  agent->module = module;
  if (pthread_rwlock_unlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
      return -1;
    }

  if (!create_and_finalize_hsa_program (agent))
    return -1;

  if (var_count > 0)
    {
      /* Read the global-variable table out of the device image and report
	 each variable's device address range back to libgomp.  */
      hsa_status_t status;
      hsa_executable_symbol_t var_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						     ".offload_var_table",
						     agent->id,
						     0, &var_symbol);

      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not find symbol for variable in the code object",
		   status);

      uint64_t var_table_addr;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
	 &var_table_addr);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable from its symbol", status);

      /* Matches the on-device layout of .offload_var_table entries.  */
      struct {
	uint64_t addr;
	uint64_t size;
      } var_table[var_count];
      GOMP_OFFLOAD_dev2host (agent->device_id, var_table,
			     (void*)var_table_addr, sizeof (var_table));

      for (unsigned i = 0; i < var_count; i++)
	{
	  pair->start = var_table[i].addr;
	  pair->end = var_table[i].addr + var_table[i].size;
	  GCN_DEBUG ("Found variable at %p with size %lu\n",
		     (void *)var_table[i].addr, var_table[i].size);
	  pair++;
	}
    }

  if (ind_func_count > 0)
    {
      hsa_status_t status;

      /* Read indirect function table from image.  */
      hsa_executable_symbol_t ind_funcs_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						     ".offload_ind_func_table",
						     agent->id,
						     0, &ind_funcs_symbol);

      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not find .offload_ind_func_table symbol in the "
		   "code object", status);

      uint64_t ind_funcs_table_addr;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(ind_funcs_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
	 &ind_funcs_table_addr);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable from its symbol", status);

      uint64_t ind_funcs_table[ind_func_count];
      GOMP_OFFLOAD_dev2host (agent->device_id, ind_funcs_table,
			     (void*) ind_funcs_table_addr,
			     sizeof (ind_funcs_table));

      /* Build host->target address map for indirect functions.
	 Layout: pairs of (host address, device address), terminated by a
	 zero entry.  */
      uint64_t ind_fn_map[ind_func_count * 2 + 1];
      for (unsigned i = 0; i < ind_func_count; i++)
	{
	  ind_fn_map[i * 2] = host_ind_fn_table[i];
	  ind_fn_map[i * 2 + 1] = ind_funcs_table[i];
	  GCN_DEBUG ("Indirect function %d: %lx->%lx\n",
		     i, host_ind_fn_table[i], ind_funcs_table[i]);
	}
      ind_fn_map[ind_func_count * 2] = 0;

      /* Write the map onto the target.  */
      void *map_target_addr
	= GOMP_OFFLOAD_alloc (agent->device_id, sizeof (ind_fn_map));
      GCN_DEBUG ("Allocated indirect map at %p\n", map_target_addr);

      GOMP_OFFLOAD_host2dev (agent->device_id, map_target_addr,
			     (void*) ind_fn_map,
			     sizeof (ind_fn_map));

      /* Write address of the map onto the target.  */
      hsa_executable_symbol_t symbol;

      status
	= hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						XSTRING (GOMP_INDIRECT_ADDR_MAP),
						agent->id, 0, &symbol);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not find GOMP_INDIRECT_ADDR_MAP in code object",
		   status);

      uint64_t varptr;
      uint32_t varsize;

      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
	 &varptr);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable from its symbol", status);
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
	 &varsize);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable size from its symbol",
		   status);

      GCN_DEBUG ("Found GOMP_INDIRECT_ADDR_MAP at %lx with size %d\n",
		 varptr, varsize);

      GOMP_OFFLOAD_host2dev (agent->device_id, (void *) varptr,
			     &map_target_addr,
			     sizeof (map_target_addr));
    }

  /* The "other" entry: the device copy of the additional ICV struct, if the
     image provides one.  */
  GCN_DEBUG ("Looking for variable %s\n", XSTRING (GOMP_ADDITIONAL_ICVS));

  hsa_status_t status;
  hsa_executable_symbol_t var_symbol;
  status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						 XSTRING (GOMP_ADDITIONAL_ICVS),
						 agent->id, 0, &var_symbol);
  if (status == HSA_STATUS_SUCCESS)
    {
      uint64_t varptr;
      uint32_t varsize;

      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
	 &varptr);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable from its symbol", status);
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
	 &varsize);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable size from its symbol",
		   status);

      pair->start = varptr;
      pair->end = varptr + varsize;
    }
  else
    {
      /* The variable was not in this image.  */
      GCN_DEBUG ("Variable not found in image: %s\n",
		 XSTRING (GOMP_ADDITIONAL_ICVS));
      pair->start = pair->end = 0;
    }

  /* Ensure that constructors are run first.  */
  struct GOMP_kernel_launch_attributes kla =
    { 3,
      /* Grid size.  */
      { 1, 64, 1 },
      /* Work-group size.  */
      { 1, 64, 1 }
    };

  if (module->init_array_func)
    {
      init_kernel (module->init_array_func);
      run_kernel (module->init_array_func, NULL, &kla, NULL, false);
    }
  module->constructors_run_p = true;

  /* Don't report kernels that libgomp need not know about.  */
  if (module->init_array_func)
    kernel_count--;
  if (module->fini_array_func)
    kernel_count--;

  if (rev_fn_table != NULL && kernel_count == 0)
    *rev_fn_table = NULL;
  else if (rev_fn_table != NULL)
    {
      /* Copy the device-side function table back to the host so libgomp can
	 translate device addresses for reverse offload.  Freed by caller.  */
      hsa_status_t status;
      hsa_executable_symbol_t var_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						     ".offload_func_table",
						     agent->id, 0, &var_symbol);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not find symbol for variable in the code object",
		   status);
      uint64_t fn_table_addr;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
	 &fn_table_addr);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable from its symbol", status);
      *rev_fn_table = GOMP_PLUGIN_malloc (kernel_count * sizeof (uint64_t));
      GOMP_OFFLOAD_dev2host (agent->device_id, *rev_fn_table,
			     (void*) fn_table_addr,
			     kernel_count * sizeof (uint64_t));
    }

  return kernel_count + var_count + other_count;
}

/* Unload GCN object-code module described by struct gcn_image_desc in
   TARGET_DATA from agent number N.  Return TRUE on success.

   NOTE(review): the error paths below return false while still holding
   module_rwlock (acquired above) -- looks like a lock leak on failure;
   confirm whether callers treat a false return as unrecoverable.  */

bool
GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
{
  if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
      return false;
    }

  struct agent_info *agent;
  agent = get_agent_info (n);
  if (!agent)
    return false;

  if (pthread_rwlock_wrlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
      return false;
    }

  if (!agent->module || agent->module->image_desc != target_data)
    {
      GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
			 "loaded before");
      return false;
    }

  if (!destroy_module (agent->module, true))
    return false;
  free (agent->module);
  agent->module = NULL;
  if (!destroy_hsa_program (agent))
    return false;
  if (pthread_rwlock_unlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
      return false;
    }
  return true;
}
mrg 3814 1.1 mrg /* Deinitialize all information and status associated with agent number N. We 3815 1.1 mrg do not attempt any synchronization, assuming the user and libgomp will not 3816 1.1 mrg attempt deinitialization of a device that is in any way being used at the 3817 1.1 mrg same time. Return TRUE on success. */ 3818 1.1 mrg 3819 1.1 mrg bool 3820 1.1 mrg GOMP_OFFLOAD_fini_device (int n) 3821 1.1 mrg { 3822 1.1 mrg struct agent_info *agent = get_agent_info (n); 3823 1.1 mrg if (!agent) 3824 1.1 mrg return false; 3825 1.1 mrg 3826 1.1 mrg if (!agent->initialized) 3827 1.1 mrg return true; 3828 1.1 mrg 3829 1.1 mrg if (agent->omp_async_queue) 3830 1.1 mrg { 3831 1.1 mrg GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue); 3832 1.1 mrg agent->omp_async_queue = NULL; 3833 1.1 mrg } 3834 1.1 mrg 3835 1.1 mrg if (agent->module) 3836 1.1 mrg { 3837 1.1 mrg if (!destroy_module (agent->module, false)) 3838 1.1 mrg return false; 3839 1.1 mrg free (agent->module); 3840 1.1 mrg agent->module = NULL; 3841 1.1 mrg } 3842 1.1 mrg 3843 1.1.1.3 mrg if (!destroy_ephemeral_memories (agent)) 3844 1.1 mrg return false; 3845 1.1 mrg 3846 1.1 mrg if (!destroy_hsa_program (agent)) 3847 1.1 mrg return false; 3848 1.1 mrg 3849 1.1 mrg hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue); 3850 1.1 mrg if (status != HSA_STATUS_SUCCESS) 3851 1.1 mrg return hsa_error ("Error destroying command queue", status); 3852 1.1 mrg 3853 1.1 mrg if (pthread_mutex_destroy (&agent->prog_mutex)) 3854 1.1 mrg { 3855 1.1 mrg GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex"); 3856 1.1 mrg return false; 3857 1.1 mrg } 3858 1.1 mrg if (pthread_rwlock_destroy (&agent->module_rwlock)) 3859 1.1 mrg { 3860 1.1 mrg GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock"); 3861 1.1 mrg return false; 3862 1.1 mrg } 3863 1.1 mrg 3864 1.1 mrg if (pthread_mutex_destroy (&agent->async_queues_mutex)) 3865 1.1 mrg { 3866 1.1 mrg GOMP_PLUGIN_error ("Failed to destroy a GCN 
agent queue mutex"); 3867 1.1 mrg return false; 3868 1.1 mrg } 3869 1.1.1.3 mrg if (pthread_mutex_destroy (&agent->ephemeral_memories_write_lock)) 3870 1.1 mrg { 3871 1.1.1.3 mrg GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex"); 3872 1.1 mrg return false; 3873 1.1 mrg } 3874 1.1 mrg agent->initialized = false; 3875 1.1 mrg return true; 3876 1.1 mrg } 3877 1.1 mrg 3878 1.1 mrg /* Return true if the HSA runtime can run function FN_PTR. */ 3879 1.1 mrg 3880 1.1 mrg bool 3881 1.1 mrg GOMP_OFFLOAD_can_run (void *fn_ptr) 3882 1.1 mrg { 3883 1.1 mrg struct kernel_info *kernel = (struct kernel_info *) fn_ptr; 3884 1.1 mrg 3885 1.1 mrg init_kernel (kernel); 3886 1.1 mrg if (kernel->initialization_failed) 3887 1.1.1.3 mrg GOMP_PLUGIN_fatal ("kernel initialization failed"); 3888 1.1 mrg 3889 1.1 mrg return true; 3890 1.1 mrg } 3891 1.1 mrg 3892 1.1 mrg /* Allocate memory on device N. */ 3893 1.1 mrg 3894 1.1 mrg void * 3895 1.1 mrg GOMP_OFFLOAD_alloc (int n, size_t size) 3896 1.1 mrg { 3897 1.1 mrg struct agent_info *agent = get_agent_info (n); 3898 1.1 mrg return alloc_by_agent (agent, size); 3899 1.1 mrg } 3900 1.1 mrg 3901 1.1 mrg /* Free memory from device N. 
*/ 3902 1.1 mrg 3903 1.1 mrg bool 3904 1.1 mrg GOMP_OFFLOAD_free (int device, void *ptr) 3905 1.1 mrg { 3906 1.1 mrg GCN_DEBUG ("Freeing memory on device %d\n", device); 3907 1.1 mrg 3908 1.1 mrg hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr); 3909 1.1 mrg if (status != HSA_STATUS_SUCCESS) 3910 1.1 mrg { 3911 1.1 mrg hsa_error ("Could not free device memory", status); 3912 1.1 mrg return false; 3913 1.1 mrg } 3914 1.1 mrg 3915 1.1 mrg struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread (); 3916 1.1 mrg bool profiling_dispatch_p 3917 1.1 mrg = __builtin_expect (thr != NULL && thr->prof_info != NULL, false); 3918 1.1 mrg if (profiling_dispatch_p) 3919 1.1 mrg { 3920 1.1 mrg acc_prof_info *prof_info = thr->prof_info; 3921 1.1 mrg acc_event_info data_event_info; 3922 1.1 mrg acc_api_info *api_info = thr->api_info; 3923 1.1 mrg 3924 1.1 mrg prof_info->event_type = acc_ev_free; 3925 1.1 mrg 3926 1.1 mrg data_event_info.data_event.event_type = prof_info->event_type; 3927 1.1 mrg data_event_info.data_event.valid_bytes 3928 1.1 mrg = _ACC_DATA_EVENT_INFO_VALID_BYTES; 3929 1.1 mrg data_event_info.data_event.parent_construct 3930 1.1 mrg = acc_construct_parallel; 3931 1.1 mrg data_event_info.data_event.implicit = 1; 3932 1.1 mrg data_event_info.data_event.tool_info = NULL; 3933 1.1 mrg data_event_info.data_event.var_name = NULL; 3934 1.1 mrg data_event_info.data_event.bytes = 0; 3935 1.1 mrg data_event_info.data_event.host_ptr = NULL; 3936 1.1 mrg data_event_info.data_event.device_ptr = (void *) ptr; 3937 1.1 mrg 3938 1.1 mrg api_info->device_api = acc_device_api_other; 3939 1.1 mrg 3940 1.1 mrg GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, 3941 1.1 mrg api_info); 3942 1.1 mrg } 3943 1.1 mrg 3944 1.1 mrg return true; 3945 1.1 mrg } 3946 1.1 mrg 3947 1.1 mrg /* Copy data from DEVICE to host. 
*/ 3948 1.1 mrg 3949 1.1 mrg bool 3950 1.1 mrg GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n) 3951 1.1 mrg { 3952 1.1 mrg GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device, 3953 1.1 mrg src, dst); 3954 1.1 mrg hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n); 3955 1.1 mrg if (status != HSA_STATUS_SUCCESS) 3956 1.1 mrg GOMP_PLUGIN_error ("memory copy failed"); 3957 1.1 mrg return true; 3958 1.1 mrg } 3959 1.1 mrg 3960 1.1 mrg /* Copy data from host to DEVICE. */ 3961 1.1 mrg 3962 1.1 mrg bool 3963 1.1 mrg GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n) 3964 1.1 mrg { 3965 1.1 mrg GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src, 3966 1.1 mrg device, dst); 3967 1.1 mrg hsa_memory_copy_wrapper (dst, src, n); 3968 1.1 mrg return true; 3969 1.1 mrg } 3970 1.1 mrg 3971 1.1 mrg /* Copy data within DEVICE. Do the copy asynchronously, if appropriate. */ 3972 1.1 mrg 3973 1.1 mrg bool 3974 1.1 mrg GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n) 3975 1.1 mrg { 3976 1.1 mrg struct gcn_thread *thread_data = gcn_thread (); 3977 1.1 mrg 3978 1.1 mrg if (thread_data && !async_synchronous_p (thread_data->async)) 3979 1.1 mrg { 3980 1.1 mrg struct agent_info *agent = get_agent_info (device); 3981 1.1 mrg maybe_init_omp_async (agent); 3982 1.1.1.2 mrg queue_push_copy (agent->omp_async_queue, dst, src, n); 3983 1.1 mrg return true; 3984 1.1 mrg } 3985 1.1 mrg 3986 1.1 mrg GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n, 3987 1.1 mrg device, src, device, dst); 3988 1.1 mrg hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n); 3989 1.1 mrg if (status != HSA_STATUS_SUCCESS) 3990 1.1 mrg GOMP_PLUGIN_error ("memory copy failed"); 3991 1.1 mrg return true; 3992 1.1 mrg } 3993 1.1 mrg 3994 1.1.1.3 mrg /* Here <quantity>_size refers to <quantity> multiplied by size -- i.e. 3995 1.1.1.3 mrg measured in bytes. 
So we have: 3996 1.1.1.3 mrg 3997 1.1.1.3 mrg dim1_size: number of bytes to copy on innermost dimension ("row") 3998 1.1.1.3 mrg dim0_len: number of rows to copy 3999 1.1.1.3 mrg dst: base pointer for destination of copy 4000 1.1.1.3 mrg dst_offset1_size: innermost row offset (for dest), in bytes 4001 1.1.1.3 mrg dst_offset0_len: offset, number of rows (for dest) 4002 1.1.1.3 mrg dst_dim1_size: whole-array dest row length, in bytes (pitch) 4003 1.1.1.3 mrg src: base pointer for source of copy 4004 1.1.1.3 mrg src_offset1_size: innermost row offset (for source), in bytes 4005 1.1.1.3 mrg src_offset0_len: offset, number of rows (for source) 4006 1.1.1.3 mrg src_dim1_size: whole-array source row length, in bytes (pitch) 4007 1.1.1.3 mrg */ 4008 1.1.1.3 mrg 4009 1.1.1.3 mrg int 4010 1.1.1.3 mrg GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size, 4011 1.1.1.3 mrg size_t dim0_len, void *dst, size_t dst_offset1_size, 4012 1.1.1.3 mrg size_t dst_offset0_len, size_t dst_dim1_size, 4013 1.1.1.3 mrg const void *src, size_t src_offset1_size, 4014 1.1.1.3 mrg size_t src_offset0_len, size_t src_dim1_size) 4015 1.1.1.3 mrg { 4016 1.1.1.3 mrg if (!hsa_fns.hsa_amd_memory_lock_fn 4017 1.1.1.3 mrg || !hsa_fns.hsa_amd_memory_unlock_fn 4018 1.1.1.3 mrg || !hsa_fns.hsa_amd_memory_async_copy_rect_fn) 4019 1.1.1.3 mrg return -1; 4020 1.1.1.3 mrg 4021 1.1.1.3 mrg /* GCN hardware requires 4-byte alignment for base addresses & pitches. Bail 4022 1.1.1.3 mrg out quietly if we have anything oddly-aligned rather than letting the 4023 1.1.1.3 mrg driver raise an error. */ 4024 1.1.1.3 mrg if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0) 4025 1.1.1.3 mrg return -1; 4026 1.1.1.3 mrg 4027 1.1.1.3 mrg if ((dst_dim1_size & 3) != 0 || (src_dim1_size & 3) != 0) 4028 1.1.1.3 mrg return -1; 4029 1.1.1.3 mrg 4030 1.1.1.3 mrg /* Only handle host to device or device to host transfers here. 
*/ 4031 1.1.1.3 mrg if ((dst_ord == -1 && src_ord == -1) 4032 1.1.1.3 mrg || (dst_ord != -1 && src_ord != -1)) 4033 1.1.1.3 mrg return -1; 4034 1.1.1.3 mrg 4035 1.1.1.3 mrg hsa_amd_copy_direction_t dir 4036 1.1.1.3 mrg = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost; 4037 1.1.1.3 mrg hsa_agent_t copy_agent; 4038 1.1.1.3 mrg 4039 1.1.1.3 mrg /* We need to pin (lock) host memory before we start the transfer. Try to 4040 1.1.1.3 mrg lock the minimum size necessary, i.e. using partial first/last rows of the 4041 1.1.1.3 mrg whole array. Something like this: 4042 1.1.1.3 mrg 4043 1.1.1.3 mrg rows --> 4044 1.1.1.3 mrg .............. 4045 1.1.1.3 mrg c | ..#######+++++ <- first row apart from {src,dst}_offset1_size 4046 1.1.1.3 mrg o | ++#######+++++ <- whole row 4047 1.1.1.3 mrg l | ++#######+++++ <- " 4048 1.1.1.3 mrg s v ++#######..... <- last row apart from trailing remainder 4049 1.1.1.3 mrg .............. 4050 1.1.1.3 mrg 4051 1.1.1.3 mrg We could split very large transfers into several rectangular copies, but 4052 1.1.1.3 mrg that is unimplemented for now. 
*/ 4053 1.1.1.3 mrg 4054 1.1.1.3 mrg size_t bounded_size_host, first_elem_offset_host; 4055 1.1.1.3 mrg void *host_ptr; 4056 1.1.1.3 mrg if (dir == hsaHostToDevice) 4057 1.1.1.3 mrg { 4058 1.1.1.3 mrg bounded_size_host = src_dim1_size * (dim0_len - 1) + dim1_size; 4059 1.1.1.3 mrg first_elem_offset_host = src_offset0_len * src_dim1_size 4060 1.1.1.3 mrg + src_offset1_size; 4061 1.1.1.3 mrg host_ptr = (void *) src; 4062 1.1.1.3 mrg struct agent_info *agent = get_agent_info (dst_ord); 4063 1.1.1.3 mrg copy_agent = agent->id; 4064 1.1.1.3 mrg } 4065 1.1.1.3 mrg else 4066 1.1.1.3 mrg { 4067 1.1.1.3 mrg bounded_size_host = dst_dim1_size * (dim0_len - 1) + dim1_size; 4068 1.1.1.3 mrg first_elem_offset_host = dst_offset0_len * dst_dim1_size 4069 1.1.1.3 mrg + dst_offset1_size; 4070 1.1.1.3 mrg host_ptr = dst; 4071 1.1.1.3 mrg struct agent_info *agent = get_agent_info (src_ord); 4072 1.1.1.3 mrg copy_agent = agent->id; 4073 1.1.1.3 mrg } 4074 1.1.1.3 mrg 4075 1.1.1.3 mrg void *agent_ptr; 4076 1.1.1.3 mrg 4077 1.1.1.3 mrg hsa_status_t status 4078 1.1.1.3 mrg = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host, 4079 1.1.1.3 mrg bounded_size_host, NULL, 0, &agent_ptr); 4080 1.1.1.3 mrg /* We can't lock the host memory: don't give up though, we might still be 4081 1.1.1.3 mrg able to use the slow path in our caller. So, don't make this an 4082 1.1.1.3 mrg error. 
*/ 4083 1.1.1.3 mrg if (status != HSA_STATUS_SUCCESS) 4084 1.1.1.3 mrg return -1; 4085 1.1.1.3 mrg 4086 1.1.1.3 mrg hsa_pitched_ptr_t dstpp, srcpp; 4087 1.1.1.3 mrg hsa_dim3_t dst_offsets, src_offsets, ranges; 4088 1.1.1.3 mrg 4089 1.1.1.3 mrg int retval = 1; 4090 1.1.1.3 mrg 4091 1.1.1.3 mrg hsa_signal_t completion_signal; 4092 1.1.1.3 mrg status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal); 4093 1.1.1.3 mrg if (status != HSA_STATUS_SUCCESS) 4094 1.1.1.3 mrg { 4095 1.1.1.3 mrg retval = -1; 4096 1.1.1.3 mrg goto unlock; 4097 1.1.1.3 mrg } 4098 1.1.1.3 mrg 4099 1.1.1.3 mrg if (dir == hsaHostToDevice) 4100 1.1.1.3 mrg { 4101 1.1.1.3 mrg srcpp.base = agent_ptr - first_elem_offset_host; 4102 1.1.1.3 mrg dstpp.base = dst; 4103 1.1.1.3 mrg } 4104 1.1.1.3 mrg else 4105 1.1.1.3 mrg { 4106 1.1.1.3 mrg srcpp.base = (void *) src; 4107 1.1.1.3 mrg dstpp.base = agent_ptr - first_elem_offset_host; 4108 1.1.1.3 mrg } 4109 1.1.1.3 mrg 4110 1.1.1.3 mrg srcpp.pitch = src_dim1_size; 4111 1.1.1.3 mrg srcpp.slice = 0; 4112 1.1.1.3 mrg 4113 1.1.1.3 mrg src_offsets.x = src_offset1_size; 4114 1.1.1.3 mrg src_offsets.y = src_offset0_len; 4115 1.1.1.3 mrg src_offsets.z = 0; 4116 1.1.1.3 mrg 4117 1.1.1.3 mrg dstpp.pitch = dst_dim1_size; 4118 1.1.1.3 mrg dstpp.slice = 0; 4119 1.1.1.3 mrg 4120 1.1.1.3 mrg dst_offsets.x = dst_offset1_size; 4121 1.1.1.3 mrg dst_offsets.y = dst_offset0_len; 4122 1.1.1.3 mrg dst_offsets.z = 0; 4123 1.1.1.3 mrg 4124 1.1.1.3 mrg ranges.x = dim1_size; 4125 1.1.1.3 mrg ranges.y = dim0_len; 4126 1.1.1.3 mrg ranges.z = 1; 4127 1.1.1.3 mrg 4128 1.1.1.3 mrg status 4129 1.1.1.3 mrg = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp, 4130 1.1.1.3 mrg &src_offsets, &ranges, 4131 1.1.1.3 mrg copy_agent, dir, 0, NULL, 4132 1.1.1.3 mrg completion_signal); 4133 1.1.1.3 mrg /* If the rectangular copy fails, we might still be able to use the slow 4134 1.1.1.3 mrg path. 
We need to unlock the host memory though, so don't return 4135 1.1.1.3 mrg immediately. */ 4136 1.1.1.3 mrg if (status != HSA_STATUS_SUCCESS) 4137 1.1.1.3 mrg retval = -1; 4138 1.1.1.3 mrg else 4139 1.1.1.3 mrg hsa_fns.hsa_signal_wait_acquire_fn (completion_signal, 4140 1.1.1.3 mrg HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, 4141 1.1.1.3 mrg HSA_WAIT_STATE_ACTIVE); 4142 1.1.1.3 mrg 4143 1.1.1.3 mrg hsa_fns.hsa_signal_destroy_fn (completion_signal); 4144 1.1.1.3 mrg 4145 1.1.1.3 mrg unlock: 4146 1.1.1.3 mrg status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host); 4147 1.1.1.3 mrg if (status != HSA_STATUS_SUCCESS) 4148 1.1.1.3 mrg hsa_fatal ("Could not unlock host memory", status); 4149 1.1.1.3 mrg 4150 1.1.1.3 mrg return retval; 4151 1.1.1.3 mrg } 4152 1.1.1.3 mrg 4153 1.1.1.3 mrg /* As above, <quantity>_size refers to <quantity> multiplied by size -- i.e. 4154 1.1.1.3 mrg measured in bytes. So we have: 4155 1.1.1.3 mrg 4156 1.1.1.3 mrg dim2_size: number of bytes to copy on innermost dimension ("row") 4157 1.1.1.3 mrg dim1_len: number of rows per slice to copy 4158 1.1.1.3 mrg dim0_len: number of slices to copy 4159 1.1.1.3 mrg dst: base pointer for destination of copy 4160 1.1.1.3 mrg dst_offset2_size: innermost row offset (for dest), in bytes 4161 1.1.1.3 mrg dst_offset1_len: offset, number of rows (for dest) 4162 1.1.1.3 mrg dst_offset0_len: offset, number of slices (for dest) 4163 1.1.1.3 mrg dst_dim2_size: whole-array dest row length, in bytes (pitch) 4164 1.1.1.3 mrg dst_dim1_len: whole-array number of rows in slice (for dest) 4165 1.1.1.3 mrg src: base pointer for source of copy 4166 1.1.1.3 mrg src_offset2_size: innermost row offset (for source), in bytes 4167 1.1.1.3 mrg src_offset1_len: offset, number of rows (for source) 4168 1.1.1.3 mrg src_offset0_len: offset, number of slices (for source) 4169 1.1.1.3 mrg src_dim2_size: whole-array source row length, in bytes (pitch) 4170 1.1.1.3 mrg src_dim1_len: whole-array number of rows in slice 
   (for source)

   Returns 1 on success, or -1 when this fast path cannot be used (the caller
   may fall back to a slower path, possibly iterated memcpy2d calls).  */

int
GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
		       size_t dim1_len, size_t dim0_len, void *dst,
		       size_t dst_offset2_size, size_t dst_offset1_len,
		       size_t dst_offset0_len, size_t dst_dim2_size,
		       size_t dst_dim1_len, const void *src,
		       size_t src_offset2_size, size_t src_offset1_len,
		       size_t src_offset0_len, size_t src_dim2_size,
		       size_t src_dim1_len)
{
  /* The rectangular-copy extension functions are optional in the runtime.  */
  if (!hsa_fns.hsa_amd_memory_lock_fn
      || !hsa_fns.hsa_amd_memory_unlock_fn
      || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
    return -1;

  /* GCN hardware requires 4-byte alignment for base addresses & pitches.  Bail
     out quietly if we have anything oddly-aligned rather than letting the
     driver raise an error.  */
  if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
    return -1;

  if ((dst_dim2_size & 3) != 0 || (src_dim2_size & 3) != 0)
    return -1;

  /* Only handle host to device or device to host transfers here.  */
  if ((dst_ord == -1 && src_ord == -1)
      || (dst_ord != -1 && src_ord != -1))
    return -1;

  hsa_amd_copy_direction_t dir
    = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
  hsa_agent_t copy_agent;

  /* We need to pin (lock) host memory before we start the transfer.  Try to
     lock the minimum size necessary, i.e. using partial first/last slices of
     the whole 3D array.  Something like this:

       slice 0:      slice 1:      slice 2:
      __________    __________    __________
     ^ /+++++++++/ : /+++++++++/ : /         /
  column /+++##++++/| | /+++##++++/| | /+++##    /   # = subarray
   / / ##++++/ | |/+++##++++/ | |/+++##++++/    + = area to pin
    /_________/ :  /_________/ :  /_________/
	 row --->

     We could split very large transfers into several rectangular copies, but
     that is unimplemented for now.  */

  size_t bounded_size_host, first_elem_offset_host;
  void *host_ptr;
  if (dir == hsaHostToDevice)
    {
      /* Pin from the first to the last host byte actually touched.  */
      size_t slice_bytes = src_dim2_size * src_dim1_len;
      bounded_size_host = slice_bytes * (dim0_len - 1)
			  + src_dim2_size * (dim1_len - 1)
			  + dim2_size;
      first_elem_offset_host = src_offset0_len * slice_bytes
			       + src_offset1_len * src_dim2_size
			       + src_offset2_size;
      host_ptr = (void *) src;
      struct agent_info *agent = get_agent_info (dst_ord);
      copy_agent = agent->id;
    }
  else
    {
      size_t slice_bytes = dst_dim2_size * dst_dim1_len;
      bounded_size_host = slice_bytes * (dim0_len - 1)
			  + dst_dim2_size * (dim1_len - 1)
			  + dim2_size;
      first_elem_offset_host = dst_offset0_len * slice_bytes
			       + dst_offset1_len * dst_dim2_size
			       + dst_offset2_size;
      host_ptr = dst;
      struct agent_info *agent = get_agent_info (src_ord);
      copy_agent = agent->id;
    }

  void *agent_ptr;

  hsa_status_t status
    = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
				      bounded_size_host, NULL, 0, &agent_ptr);
  /* We can't lock the host memory: don't give up though, we might still be
     able to use the slow path in our caller (maybe even with iterated memcpy2d
     calls).  So, don't make this an error.  */
  if (status != HSA_STATUS_SUCCESS)
    return -1;

  hsa_pitched_ptr_t dstpp, srcpp;
  hsa_dim3_t dst_offsets, src_offsets, ranges;

  int retval = 1;

  hsa_signal_t completion_signal;
  status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
  if (status != HSA_STATUS_SUCCESS)
    {
      retval = -1;
      goto unlock;
    }

  /* Rebase so offsets below address the same elements on both sides of the
     locked mapping.  */
  if (dir == hsaHostToDevice)
    {
      srcpp.base = agent_ptr - first_elem_offset_host;
      dstpp.base = dst;
    }
  else
    {
      srcpp.base = (void *) src;
      dstpp.base = agent_ptr - first_elem_offset_host;
    }

  /* Pitch is measured in bytes.  */
  srcpp.pitch = src_dim2_size;
  /* Slice is also measured in bytes (i.e. total per-slice).  */
  srcpp.slice = src_dim2_size * src_dim1_len;

  src_offsets.x = src_offset2_size;
  src_offsets.y = src_offset1_len;
  src_offsets.z = src_offset0_len;

  /* As above.  */
  dstpp.pitch = dst_dim2_size;
  dstpp.slice = dst_dim2_size * dst_dim1_len;

  dst_offsets.x = dst_offset2_size;
  dst_offsets.y = dst_offset1_len;
  dst_offsets.z = dst_offset0_len;

  ranges.x = dim2_size;
  ranges.y = dim1_len;
  ranges.z = dim0_len;

  status
    = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
						 &src_offsets, &ranges,
						 copy_agent, dir, 0, NULL,
						 completion_signal);
  /* If the rectangular copy fails, we might still be able to use the slow
     path.  We need to unlock the host memory though, so don't return
     immediately.  */
  if (status != HSA_STATUS_SUCCESS)
    retval = -1;
  else
    {
      /* A negative signal value after the wait indicates the asynchronous
	 copy itself failed.  */
      hsa_signal_value_t sv
	= hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
					      HSA_SIGNAL_CONDITION_LT, 1,
					      UINT64_MAX,
					      HSA_WAIT_STATE_ACTIVE);
      if (sv < 0)
	{
	  GCN_WARNING ("async copy rect failure");
	  retval = -1;
	}
    }

  hsa_fns.hsa_signal_destroy_fn (completion_signal);

unlock:
  status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not unlock host memory", status);

  return retval;
}

/* }}}  */
/* {{{ OpenMP Plugin API  */

/* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
in VARS as a parameter. The kernel is identified by FN_PTR which must point 4345 1.1 mrg to a kernel_info structure, and must have previously been loaded to the 4346 1.1 mrg specified device. */ 4347 1.1 mrg 4348 1.1 mrg void 4349 1.1 mrg GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args) 4350 1.1 mrg { 4351 1.1 mrg struct agent_info *agent = get_agent_info (device); 4352 1.1 mrg struct kernel_info *kernel = (struct kernel_info *) fn_ptr; 4353 1.1 mrg struct GOMP_kernel_launch_attributes def; 4354 1.1 mrg struct GOMP_kernel_launch_attributes *kla; 4355 1.1 mrg assert (agent == kernel->agent); 4356 1.1 mrg 4357 1.1 mrg /* If we get here then the kernel must be OpenMP. */ 4358 1.1 mrg kernel->kind = KIND_OPENMP; 4359 1.1 mrg 4360 1.1 mrg if (!parse_target_attributes (args, &def, &kla, agent)) 4361 1.1 mrg { 4362 1.1 mrg GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n"); 4363 1.1 mrg return; 4364 1.1 mrg } 4365 1.1 mrg run_kernel (kernel, vars, kla, NULL, false); 4366 1.1 mrg } 4367 1.1 mrg 4368 1.1 mrg /* Run an asynchronous OpenMP kernel on DEVICE. This is similar to 4369 1.1 mrg GOMP_OFFLOAD_run except that the launch is queued and there is a call to 4370 1.1 mrg GOMP_PLUGIN_target_task_completion when it has finished. */ 4371 1.1 mrg 4372 1.1 mrg void 4373 1.1 mrg GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars, 4374 1.1 mrg void **args, void *async_data) 4375 1.1 mrg { 4376 1.1 mrg GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n"); 4377 1.1 mrg struct agent_info *agent = get_agent_info (device); 4378 1.1 mrg struct kernel_info *kernel = (struct kernel_info *) tgt_fn; 4379 1.1 mrg struct GOMP_kernel_launch_attributes def; 4380 1.1 mrg struct GOMP_kernel_launch_attributes *kla; 4381 1.1 mrg assert (agent == kernel->agent); 4382 1.1 mrg 4383 1.1 mrg /* If we get here then the kernel must be OpenMP. 
*/ 4384 1.1 mrg kernel->kind = KIND_OPENMP; 4385 1.1 mrg 4386 1.1 mrg if (!parse_target_attributes (args, &def, &kla, agent)) 4387 1.1 mrg { 4388 1.1 mrg GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n"); 4389 1.1 mrg return; 4390 1.1 mrg } 4391 1.1 mrg 4392 1.1 mrg maybe_init_omp_async (agent); 4393 1.1 mrg queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla); 4394 1.1 mrg queue_push_callback (agent->omp_async_queue, 4395 1.1 mrg GOMP_PLUGIN_target_task_completion, async_data); 4396 1.1 mrg } 4397 1.1 mrg 4398 1.1 mrg /* }}} */ 4399 1.1 mrg /* {{{ OpenACC Plugin API */ 4400 1.1 mrg 4401 1.1 mrg /* Run a synchronous OpenACC kernel. The device number is inferred from the 4402 1.1 mrg already-loaded KERNEL. */ 4403 1.1 mrg 4404 1.1 mrg void 4405 1.1.1.3 mrg GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *), 4406 1.1.1.3 mrg size_t mapnum __attribute__((unused)), 4407 1.1.1.3 mrg void **hostaddrs __attribute__((unused)), 4408 1.1.1.3 mrg void **devaddrs, unsigned *dims, 4409 1.1 mrg void *targ_mem_desc) 4410 1.1 mrg { 4411 1.1 mrg struct kernel_info *kernel = (struct kernel_info *) fn_ptr; 4412 1.1 mrg 4413 1.1.1.3 mrg gcn_exec (kernel, devaddrs, dims, targ_mem_desc, false, NULL); 4414 1.1 mrg } 4415 1.1 mrg 4416 1.1 mrg /* Run an asynchronous OpenACC kernel on the specified queue. */ 4417 1.1 mrg 4418 1.1 mrg void 4419 1.1.1.3 mrg GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *), 4420 1.1.1.3 mrg size_t mapnum __attribute__((unused)), 4421 1.1.1.3 mrg void **hostaddrs __attribute__((unused)), 4422 1.1.1.3 mrg void **devaddrs, 4423 1.1 mrg unsigned *dims, void *targ_mem_desc, 4424 1.1 mrg struct goacc_asyncqueue *aq) 4425 1.1 mrg { 4426 1.1 mrg struct kernel_info *kernel = (struct kernel_info *) fn_ptr; 4427 1.1 mrg 4428 1.1.1.3 mrg gcn_exec (kernel, devaddrs, dims, targ_mem_desc, true, aq); 4429 1.1 mrg } 4430 1.1 mrg 4431 1.1 mrg /* Create a new asynchronous thread and queue for running future kernels. 
*/ 4432 1.1 mrg 4433 1.1 mrg struct goacc_asyncqueue * 4434 1.1 mrg GOMP_OFFLOAD_openacc_async_construct (int device) 4435 1.1 mrg { 4436 1.1 mrg struct agent_info *agent = get_agent_info (device); 4437 1.1 mrg 4438 1.1 mrg pthread_mutex_lock (&agent->async_queues_mutex); 4439 1.1 mrg 4440 1.1 mrg struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq)); 4441 1.1 mrg aq->agent = get_agent_info (device); 4442 1.1 mrg aq->prev = NULL; 4443 1.1 mrg aq->next = agent->async_queues; 4444 1.1 mrg if (aq->next) 4445 1.1 mrg { 4446 1.1 mrg aq->next->prev = aq; 4447 1.1 mrg aq->id = aq->next->id + 1; 4448 1.1 mrg } 4449 1.1 mrg else 4450 1.1 mrg aq->id = 1; 4451 1.1 mrg agent->async_queues = aq; 4452 1.1 mrg 4453 1.1 mrg aq->queue_first = 0; 4454 1.1 mrg aq->queue_n = 0; 4455 1.1 mrg aq->drain_queue_stop = 0; 4456 1.1 mrg 4457 1.1 mrg if (pthread_mutex_init (&aq->mutex, NULL)) 4458 1.1 mrg { 4459 1.1 mrg GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex"); 4460 1.1 mrg return false; 4461 1.1 mrg } 4462 1.1 mrg if (pthread_cond_init (&aq->queue_cond_in, NULL)) 4463 1.1 mrg { 4464 1.1 mrg GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond"); 4465 1.1 mrg return false; 4466 1.1 mrg } 4467 1.1 mrg if (pthread_cond_init (&aq->queue_cond_out, NULL)) 4468 1.1 mrg { 4469 1.1 mrg GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond"); 4470 1.1 mrg return false; 4471 1.1 mrg } 4472 1.1 mrg 4473 1.1 mrg hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id, 4474 1.1 mrg ASYNC_QUEUE_SIZE, 4475 1.1 mrg HSA_QUEUE_TYPE_MULTI, 4476 1.1 mrg hsa_queue_callback, NULL, 4477 1.1 mrg UINT32_MAX, UINT32_MAX, 4478 1.1 mrg &aq->hsa_queue); 4479 1.1 mrg if (status != HSA_STATUS_SUCCESS) 4480 1.1 mrg hsa_fatal ("Error creating command queue", status); 4481 1.1 mrg 4482 1.1 mrg int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq); 4483 1.1 mrg if (err != 0) 4484 1.1 mrg GOMP_PLUGIN_fatal ("GCN asynchronous thread creation 
failed: %s", 4485 1.1 mrg strerror (err)); 4486 1.1 mrg GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id, 4487 1.1 mrg aq->id); 4488 1.1 mrg 4489 1.1 mrg pthread_mutex_unlock (&agent->async_queues_mutex); 4490 1.1 mrg 4491 1.1 mrg return aq; 4492 1.1 mrg } 4493 1.1 mrg 4494 1.1 mrg /* Destroy an existing asynchronous thread and queue. Waits for any 4495 1.1 mrg currently-running task to complete, but cancels any queued tasks. */ 4496 1.1 mrg 4497 1.1 mrg bool 4498 1.1 mrg GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq) 4499 1.1 mrg { 4500 1.1 mrg struct agent_info *agent = aq->agent; 4501 1.1 mrg 4502 1.1 mrg finalize_async_thread (aq); 4503 1.1 mrg 4504 1.1 mrg pthread_mutex_lock (&agent->async_queues_mutex); 4505 1.1 mrg 4506 1.1 mrg int err; 4507 1.1 mrg if ((err = pthread_mutex_destroy (&aq->mutex))) 4508 1.1 mrg { 4509 1.1 mrg GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err); 4510 1.1 mrg goto fail; 4511 1.1 mrg } 4512 1.1 mrg if (pthread_cond_destroy (&aq->queue_cond_in)) 4513 1.1 mrg { 4514 1.1 mrg GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond"); 4515 1.1 mrg goto fail; 4516 1.1 mrg } 4517 1.1 mrg if (pthread_cond_destroy (&aq->queue_cond_out)) 4518 1.1 mrg { 4519 1.1 mrg GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond"); 4520 1.1 mrg goto fail; 4521 1.1 mrg } 4522 1.1 mrg hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue); 4523 1.1 mrg if (status != HSA_STATUS_SUCCESS) 4524 1.1 mrg { 4525 1.1 mrg hsa_error ("Error destroying command queue", status); 4526 1.1 mrg goto fail; 4527 1.1 mrg } 4528 1.1 mrg 4529 1.1 mrg if (aq->prev) 4530 1.1 mrg aq->prev->next = aq->next; 4531 1.1 mrg if (aq->next) 4532 1.1 mrg aq->next->prev = aq->prev; 4533 1.1 mrg if (agent->async_queues == aq) 4534 1.1 mrg agent->async_queues = aq->next; 4535 1.1 mrg 4536 1.1 mrg GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id); 4537 1.1 mrg 4538 1.1 mrg free (aq); 4539 
1.1 mrg pthread_mutex_unlock (&agent->async_queues_mutex); 4540 1.1 mrg return true; 4541 1.1 mrg 4542 1.1 mrg fail: 4543 1.1 mrg pthread_mutex_unlock (&agent->async_queues_mutex); 4544 1.1 mrg return false; 4545 1.1 mrg } 4546 1.1 mrg 4547 1.1 mrg /* Return true if the specified async queue is currently empty. */ 4548 1.1 mrg 4549 1.1 mrg int 4550 1.1 mrg GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq) 4551 1.1 mrg { 4552 1.1 mrg return queue_empty (aq); 4553 1.1 mrg } 4554 1.1 mrg 4555 1.1 mrg /* Block until the specified queue has executed all its tasks and the 4556 1.1 mrg queue is empty. */ 4557 1.1 mrg 4558 1.1 mrg bool 4559 1.1 mrg GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq) 4560 1.1 mrg { 4561 1.1 mrg wait_queue (aq); 4562 1.1 mrg return true; 4563 1.1 mrg } 4564 1.1 mrg 4565 1.1 mrg /* Add a serialization point across two async queues. Any new tasks added to 4566 1.1 mrg AQ2, after this call, will not run until all tasks on AQ1, at the time 4567 1.1 mrg of this call, have completed. */ 4568 1.1 mrg 4569 1.1 mrg bool 4570 1.1 mrg GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1, 4571 1.1 mrg struct goacc_asyncqueue *aq2) 4572 1.1 mrg { 4573 1.1 mrg /* For serialize, stream aq2 waits for aq1 to complete work that has been 4574 1.1 mrg scheduled to run on it up to this point. */ 4575 1.1 mrg if (aq1 != aq2) 4576 1.1 mrg { 4577 1.1 mrg struct placeholder *placeholderp = queue_push_placeholder (aq1); 4578 1.1 mrg queue_push_asyncwait (aq2, placeholderp); 4579 1.1 mrg } 4580 1.1 mrg return true; 4581 1.1 mrg } 4582 1.1 mrg 4583 1.1 mrg /* Add an opaque callback to the given async queue. 
*/ 4584 1.1 mrg 4585 1.1 mrg void 4586 1.1 mrg GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq, 4587 1.1 mrg void (*fn) (void *), void *data) 4588 1.1 mrg { 4589 1.1 mrg queue_push_callback (aq, fn, data); 4590 1.1 mrg } 4591 1.1 mrg 4592 1.1 mrg /* Queue up an asynchronous data copy from host to DEVICE. */ 4593 1.1 mrg 4594 1.1 mrg bool 4595 1.1 mrg GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src, 4596 1.1 mrg size_t n, struct goacc_asyncqueue *aq) 4597 1.1 mrg { 4598 1.1 mrg struct agent_info *agent = get_agent_info (device); 4599 1.1 mrg assert (agent == aq->agent); 4600 1.1.1.2 mrg queue_push_copy (aq, dst, src, n); 4601 1.1 mrg return true; 4602 1.1 mrg } 4603 1.1 mrg 4604 1.1 mrg /* Queue up an asynchronous data copy from DEVICE to host. */ 4605 1.1 mrg 4606 1.1 mrg bool 4607 1.1 mrg GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src, 4608 1.1 mrg size_t n, struct goacc_asyncqueue *aq) 4609 1.1 mrg { 4610 1.1 mrg struct agent_info *agent = get_agent_info (device); 4611 1.1 mrg assert (agent == aq->agent); 4612 1.1.1.2 mrg queue_push_copy (aq, dst, src, n); 4613 1.1 mrg return true; 4614 1.1 mrg } 4615 1.1 mrg 4616 1.1 mrg union goacc_property_value 4617 1.1 mrg GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop) 4618 1.1 mrg { 4619 1.1 mrg struct agent_info *agent = get_agent_info (device); 4620 1.1 mrg 4621 1.1 mrg union goacc_property_value propval = { .val = 0 }; 4622 1.1 mrg 4623 1.1 mrg switch (prop) 4624 1.1 mrg { 4625 1.1 mrg case GOACC_PROPERTY_FREE_MEMORY: 4626 1.1 mrg /* Not supported. 
*/ 4627 1.1 mrg break; 4628 1.1 mrg case GOACC_PROPERTY_MEMORY: 4629 1.1 mrg { 4630 1.1 mrg size_t size; 4631 1.1 mrg hsa_region_t region = agent->data_region; 4632 1.1 mrg hsa_status_t status = 4633 1.1 mrg hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size); 4634 1.1 mrg if (status == HSA_STATUS_SUCCESS) 4635 1.1 mrg propval.val = size; 4636 1.1 mrg break; 4637 1.1 mrg } 4638 1.1 mrg case GOACC_PROPERTY_NAME: 4639 1.1 mrg propval.ptr = agent->name; 4640 1.1 mrg break; 4641 1.1 mrg case GOACC_PROPERTY_VENDOR: 4642 1.1 mrg propval.ptr = agent->vendor_name; 4643 1.1 mrg break; 4644 1.1 mrg case GOACC_PROPERTY_DRIVER: 4645 1.1 mrg propval.ptr = hsa_context.driver_version_s; 4646 1.1 mrg break; 4647 1.1 mrg } 4648 1.1 mrg 4649 1.1 mrg return propval; 4650 1.1 mrg } 4651 1.1 mrg 4652 1.1 mrg /* Set up plugin-specific thread-local-data (host-side). */ 4653 1.1 mrg 4654 1.1 mrg void * 4655 1.1 mrg GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused))) 4656 1.1 mrg { 4657 1.1 mrg struct gcn_thread *thread_data 4658 1.1 mrg = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread)); 4659 1.1 mrg 4660 1.1 mrg thread_data->async = GOMP_ASYNC_SYNC; 4661 1.1 mrg 4662 1.1 mrg return (void *) thread_data; 4663 1.1 mrg } 4664 1.1 mrg 4665 1.1 mrg /* Clean up plugin-specific thread-local-data. */ 4666 1.1 mrg 4667 1.1 mrg void 4668 1.1 mrg GOMP_OFFLOAD_openacc_destroy_thread_data (void *data) 4669 1.1 mrg { 4670 1.1 mrg free (data); 4671 1.1 mrg } 4672 1.1 mrg 4673 1.1 mrg /* }}} */ 4674