/*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 */
/* based on pieces from si_pipe.c and radeon_llvm_emit.c */
#include "ac_llvm_build.h"

#include "ac_exp_param.h"
#include "ac_llvm_util.h"
#include "ac_shader_util.h"
#include "c11/threads.h"
#include "shader_enums.h"
#include "sid.h"
#include "util/bitscan.h"
#include "util/macros.h"
#include "util/u_atomic.h"
#include "util/u_math.h"
#include <llvm-c/Core.h>
#include <llvm/Config/llvm-config.h>

#include <assert.h>
#include <stdio.h>

/* Initial capacity of the control-flow stack (presumably grown on demand by
 * the flow helpers later in the file — confirm there). */
#define AC_LLVM_INITIAL_CF_DEPTH 4

/* Data for if/else/endif and bgnloop/endloop control flow structures.
 */
struct ac_llvm_flow {
   /* Loop exit or next part of if/else/endif. */
   LLVMBasicBlockRef next_block;
   /* Back-edge target for loops; presumably unused for if/else — see the
    * flow helpers elsewhere in this file. */
   LLVMBasicBlockRef loop_entry_block;
};

/* Initialize module-independent parts of the context.
 *
 * Creates the LLVMContext as well as ctx::module and ctx::builder, and
 * caches the commonly used types, constants and metadata kind IDs.
 */
void ac_llvm_context_init(struct ac_llvm_context *ctx, struct ac_llvm_compiler *compiler,
                          enum chip_class chip_class, enum radeon_family family,
                          const struct radeon_info *info,
                          enum ac_float_mode float_mode, unsigned wave_size,
                          unsigned ballot_mask_bits)
{
   ctx->context = LLVMContextCreate();

   ctx->chip_class = chip_class;
   ctx->family = family;
   ctx->info = info;
   ctx->wave_size = wave_size;
   ctx->ballot_mask_bits = ballot_mask_bits;
   ctx->float_mode = float_mode;
   ctx->module = ac_create_module(compiler->tm, ctx->context);
   ctx->builder = ac_create_builder(ctx->context, float_mode);

   /* Cached scalar and vector types. */
   ctx->voidt = LLVMVoidTypeInContext(ctx->context);
   ctx->i1 = LLVMInt1TypeInContext(ctx->context);
   ctx->i8 = LLVMInt8TypeInContext(ctx->context);
   ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
   ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
   ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
   ctx->i128 = LLVMIntTypeInContext(ctx->context, 128);
   ctx->intptr = ctx->i32;
   ctx->f16 = LLVMHalfTypeInContext(ctx->context);
   ctx->f32 = LLVMFloatTypeInContext(ctx->context);
   ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
   ctx->v2i16 = LLVMVectorType(ctx->i16, 2);
   ctx->v4i16 = LLVMVectorType(ctx->i16, 4);
   ctx->v2f16 = LLVMVectorType(ctx->f16, 2);
   ctx->v4f16 = LLVMVectorType(ctx->f16, 4);
   ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
   ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
   ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
   ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
   ctx->v3f32 = LLVMVectorType(ctx->f32, 3);
   ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
   ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
   /* Integer types sized to hold one bit per lane. */
   ctx->iN_wavemask = LLVMIntTypeInContext(ctx->context, ctx->wave_size);
   ctx->iN_ballotmask = LLVMIntTypeInContext(ctx->context, ballot_mask_bits);

   /* Cached constants. */
   ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
   ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
   ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
   ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
   ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
   ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
   ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
   ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
   ctx->i128_0 = LLVMConstInt(ctx->i128, 0, false);
   ctx->i128_1 = LLVMConstInt(ctx->i128, 1, false);
   ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0);
   ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0);
   ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
   ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
   ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
   ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0);

   ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
   ctx->i1true = LLVMConstInt(ctx->i1, 1, false);

   /* Metadata kind IDs; the length argument is the strlen of the name. */
   ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context, "range", 5);

   ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context, "invariant.load", 14);

   ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context, "amdgpu.uniform", 14);

   ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
   /* NOTE(review): calloc result is not checked; a failure here would crash
    * on first use of ctx->flow. */
   ctx->flow = calloc(1, sizeof(*ctx->flow));
}

/* Free the control-flow bookkeeping allocated by ac_llvm_context_init. */
void ac_llvm_context_dispose(struct ac_llvm_context *ctx)
{
   free(ctx->flow->stack);
   free(ctx->flow);
   ctx->flow = NULL;
}

/* Return the number of vector components of \p value, or 1 for scalars. */
int ac_get_llvm_num_components(LLVMValueRef value)
{
   LLVMTypeRef type = LLVMTypeOf(value);
   unsigned num_components =
      LLVMGetTypeKind(type) == LLVMVectorTypeKind ? LLVMGetVectorSize(type) : 1;
   return num_components;
}

/* Extract element \p index from a vector, or pass scalars through
 * (in which case \p index must be 0). */
LLVMValueRef ac_llvm_extract_elem(struct ac_llvm_context *ac, LLVMValueRef value, int index)
{
   if (LLVMGetTypeKind(LLVMTypeOf(value)) != LLVMVectorTypeKind) {
      assert(index == 0);
      return value;
   }

   return LLVMBuildExtractElement(ac->builder, value, LLVMConstInt(ac->i32, index, false), "");
}

/* Return the bit width of the (element type of) \p type.
 * LDS pointers count as 32 bits; any other pointer address space falls
 * through to the float checks and hits unreachable. */
int ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
{
   if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
      type = LLVMGetElementType(type);

   if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind)
      return LLVMGetIntTypeWidth(type);

   if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
      if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_LDS)
         return 32;
   }

   if (type == ctx->f16)
      return 16;
   if (type == ctx->f32)
      return 32;
   if (type == ctx->f64)
      return 64;

   unreachable("Unhandled type kind in get_elem_bits");
}

/* Return the size of \p type in bytes. */
unsigned ac_get_type_size(LLVMTypeRef type)
{
   LLVMTypeKind kind = LLVMGetTypeKind(type);

   switch (kind) {
   case LLVMIntegerTypeKind:
      return LLVMGetIntTypeWidth(type) / 8;
   case LLVMHalfTypeKind:
      return 2;
   case LLVMFloatTypeKind:
      return 4;
   case LLVMDoubleTypeKind:
      return 8;
   case LLVMPointerTypeKind:
      /* 32-bit constant address space pointers are 4 bytes; all other
       * pointers are treated as 64-bit. */
      if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT)
         return 4;
      return 8;
   case LLVMVectorTypeKind:
      return LLVMGetVectorSize(type) * ac_get_type_size(LLVMGetElementType(type));
   case LLVMArrayTypeKind:
      return LLVMGetArrayLength(type) * ac_get_type_size(LLVMGetElementType(type));
   default:
      assert(0);
      return 0;
   }
}

/* Map a scalar float/int type to the integer type of the same width. */
static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (t == ctx->i1)
      return ctx->i1;
   else if (t == ctx->i8)
      return ctx->i8;
   else if (t == ctx->f16 || t == ctx->i16)
      return ctx->i16;
   else if (t == ctx->f32 || t == ctx->i32)
      return ctx->i32;
   else if (t == ctx->f64 || t == ctx->i64)
      return ctx->i64;
   else
      unreachable("Unhandled integer size");
}

/* Like to_integer_type_scalar, but also handles vectors (element-wise) and
 * pointers (mapped to the integer type matching their address-space size). */
LLVMTypeRef ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
      LLVMTypeRef elem_type = LLVMGetElementType(t);
      return LLVMVectorType(to_integer_type_scalar(ctx, elem_type), LLVMGetVectorSize(t));
   }
   if (LLVMGetTypeKind(t) == LLVMPointerTypeKind) {
      switch (LLVMGetPointerAddressSpace(t)) {
      case AC_ADDR_SPACE_GLOBAL:
         return ctx->i64;
      case AC_ADDR_SPACE_CONST_32BIT:
      case AC_ADDR_SPACE_LDS:
         return ctx->i32;
      default:
         unreachable("unhandled address space");
      }
   }
   return to_integer_type_scalar(ctx, t);
}

/* Reinterpret \p v as the same-width integer type (ptrtoint for pointers). */
LLVMValueRef ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   LLVMTypeRef type = LLVMTypeOf(v);
   if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
      return LLVMBuildPtrToInt(ctx->builder, v, ac_to_integer_type(ctx, type), "");
   }
   return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), "");
}

/* Like ac_to_integer, but leaves pointers untouched. */
LLVMValueRef ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   LLVMTypeRef type = LLVMTypeOf(v);
   if (LLVMGetTypeKind(type) == LLVMPointerTypeKind)
      return v;
   return ac_to_integer(ctx, v);
}

/* Map a scalar int/float type to the float type of the same width.
 * i8 has no float counterpart and is passed through. */
static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (t == ctx->i8)
      return ctx->i8;
   else if (t == ctx->i16 || t == ctx->f16)
      return ctx->f16;
   else if (t == ctx->i32 || t == ctx->f32)
      return ctx->f32;
   else if (t == ctx->i64 || t == ctx->f64)
      return ctx->f64;
   else
      unreachable("Unhandled float size");
}

/* Like to_float_type_scalar, but also handles vectors element-wise. */
LLVMTypeRef ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
      LLVMTypeRef elem_type = LLVMGetElementType(t);
      return LLVMVectorType(to_float_type_scalar(ctx, elem_type), LLVMGetVectorSize(t));
   }
   return to_float_type_scalar(ctx, t);
}

/* Reinterpret \p v as the same-width float type. */
LLVMValueRef ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   LLVMTypeRef type = LLVMTypeOf(v);
   return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), "");
}

/* Emit a call to the named intrinsic, declaring it on first use with a
 * function type derived from the actual arguments. Attributes from
 * \p attrib_mask are applied at the call site unless AC_FUNC_ATTR_LEGACY
 * is set, in which case they are applied to the function declaration. */
LLVMValueRef ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
                                LLVMTypeRef return_type, LLVMValueRef *params, unsigned param_count,
                                unsigned attrib_mask)
{
   LLVMValueRef function, call;
   bool set_callsite_attrs = !(attrib_mask & AC_FUNC_ATTR_LEGACY);

   function = LLVMGetNamedFunction(ctx->module, name);
   if (!function) {
      LLVMTypeRef param_types[32], function_type;
      unsigned i;

      assert(param_count <= 32);

      for (i = 0; i < param_count; ++i) {
         assert(params[i]);
         param_types[i] = LLVMTypeOf(params[i]);
      }
      function_type = LLVMFunctionType(return_type, param_types, param_count, 0);
      function = LLVMAddFunction(ctx->module, name, function_type);

      LLVMSetFunctionCallConv(function, LLVMCCallConv);
      LLVMSetLinkage(function, LLVMExternalLinkage);

      if (!set_callsite_attrs)
         ac_add_func_attributes(ctx->context, function, attrib_mask);
   }

   call = LLVMBuildCall(ctx->builder, function, params, param_count, "");
   if (set_callsite_attrs)
      ac_add_func_attributes(ctx->context, call, attrib_mask);
   return call;
}

/**
 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
 * intrinsic names).
324 */ 325void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize) 326{ 327 LLVMTypeRef elem_type = type; 328 329 if (LLVMGetTypeKind(type) == LLVMStructTypeKind) { 330 unsigned count = LLVMCountStructElementTypes(type); 331 int ret = snprintf(buf, bufsize, "sl_"); 332 buf += ret; 333 bufsize -= ret; 334 335 LLVMTypeRef *elems = alloca(count * sizeof(LLVMTypeRef)); 336 LLVMGetStructElementTypes(type, elems); 337 338 for (unsigned i = 0; i < count; i++) { 339 ac_build_type_name_for_intr(elems[i], buf, bufsize); 340 ret = strlen(buf); 341 buf += ret; 342 bufsize -= ret; 343 } 344 345 snprintf(buf, bufsize, "s"); 346 return; 347 } 348 349 assert(bufsize >= 8); 350 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) { 351 int ret = snprintf(buf, bufsize, "v%u", LLVMGetVectorSize(type)); 352 if (ret < 0) { 353 char *type_name = LLVMPrintTypeToString(type); 354 fprintf(stderr, "Error building type name for: %s\n", type_name); 355 LLVMDisposeMessage(type_name); 356 return; 357 } 358 elem_type = LLVMGetElementType(type); 359 buf += ret; 360 bufsize -= ret; 361 } 362 switch (LLVMGetTypeKind(elem_type)) { 363 default: 364 break; 365 case LLVMIntegerTypeKind: 366 snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type)); 367 break; 368 case LLVMHalfTypeKind: 369 snprintf(buf, bufsize, "f16"); 370 break; 371 case LLVMFloatTypeKind: 372 snprintf(buf, bufsize, "f32"); 373 break; 374 case LLVMDoubleTypeKind: 375 snprintf(buf, bufsize, "f64"); 376 break; 377 } 378} 379 380/** 381 * Helper function that builds an LLVM IR PHI node and immediately adds 382 * incoming edges. 
*/
LLVMValueRef ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, unsigned count_incoming,
                          LLVMValueRef *values, LLVMBasicBlockRef *blocks)
{
   LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
   LLVMAddIncoming(phi, values, blocks, count_incoming);
   return phi;
}

/* Emit the llvm.amdgcn.s.barrier (workgroup barrier) intrinsic. */
void ac_build_s_barrier(struct ac_llvm_context *ctx)
{
   ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL, 0, AC_FUNC_ATTR_CONVERGENT);
}

/* Prevent optimizations (at least of memory accesses) across the current
 * point in the program by emitting empty inline assembly that is marked as
 * having side effects.
 *
 * Optionally, a value can be passed through the inline assembly to prevent
 * LLVM from hoisting calls to ReadNone functions.
 */
void ac_build_optimization_barrier(struct ac_llvm_context *ctx, LLVMValueRef *pgpr, bool sgpr)
{
   /* Each barrier gets unique asm text (a comment) so that separate barriers
    * are never merged by LLVM. */
   static int counter = 0;

   LLVMBuilderRef builder = ctx->builder;
   char code[16];
   /* Tie the output register class (SGPR or VGPR) to the input. */
   const char *constraint = sgpr ? "=s,0" : "=v,0";

   snprintf(code, sizeof(code), "; %d", (int)p_atomic_inc_return(&counter));

   if (!pgpr) {
      /* No value to pass through: just a side-effecting empty asm. */
      LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
      LLVMBuildCall(builder, inlineasm, NULL, 0, "");
   } else if (LLVMTypeOf(*pgpr) == ctx->i32) {
      /* Simple version for i32 that allows the caller to set LLVM metadata on the call
       * instruction. */
      LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);

      *pgpr = LLVMBuildCall(builder, inlineasm, pgpr, 1, "");
   } else if (LLVMTypeOf(*pgpr) == ctx->i16) {
      /* Simple version for i16 that allows the caller to set LLVM metadata on the call
       * instruction. */
      LLVMTypeRef ftype = LLVMFunctionType(ctx->i16, &ctx->i16, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);

      *pgpr = LLVMBuildCall(builder, inlineasm, pgpr, 1, "");
   } else if (LLVMGetTypeKind(LLVMTypeOf(*pgpr)) == LLVMPointerTypeKind) {
      /* Pointers are passed through the asm unchanged at their own type. */
      LLVMTypeRef type = LLVMTypeOf(*pgpr);
      LLVMTypeRef ftype = LLVMFunctionType(type, &type, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);

      *pgpr = LLVMBuildCall(builder, inlineasm, pgpr, 1, "");
   } else {
      /* General case: widen to a multiple of 32 bits, pass only the first
       * 32-bit element through the asm, and re-insert it — anchoring one
       * element is enough to pin the whole value. */
      LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);
      LLVMTypeRef type = LLVMTypeOf(*pgpr);
      unsigned bitsize = ac_get_elem_bits(ctx, type);
      LLVMValueRef vgpr = *pgpr;
      LLVMTypeRef vgpr_type;
      unsigned vgpr_size;
      LLVMValueRef vgpr0;

      if (bitsize < 32)
         vgpr = LLVMBuildZExt(ctx->builder, vgpr, ctx->i32, "");

      vgpr_type = LLVMTypeOf(vgpr);
      vgpr_size = ac_get_type_size(vgpr_type);

      assert(vgpr_size % 4 == 0);

      vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
      vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
      vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
      vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
      vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");

      if (bitsize < 32)
         vgpr = LLVMBuildTrunc(builder, vgpr, type, "");

      *pgpr = vgpr;
   }
}

/* Read a 64-bit clock counter and return it as v2i32: the device-scope
 * realtime counter, or the per-subgroup cycle counter otherwise. */
LLVMValueRef ac_build_shader_clock(struct ac_llvm_context *ctx, nir_scope scope)
{
   const char *subgroup = "llvm.readcyclecounter";
   const char *name = scope == NIR_SCOPE_DEVICE ? "llvm.amdgcn.s.memrealtime" : subgroup;

   LLVMValueRef tmp = ac_build_intrinsic(ctx, name, ctx->i64, NULL, 0, 0);
   return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
}

/* Return a wave-sized bitmask with one bit per lane, set for lanes where
 * \p value is non-zero (implemented via amdgcn.icmp NE 0). */
LLVMValueRef ac_build_ballot(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   const char *name;

   if (LLVMTypeOf(value) == ctx->i1)
      value = LLVMBuildZExt(ctx->builder, value, ctx->i32, "");

   if (ctx->wave_size == 64)
      name = "llvm.amdgcn.icmp.i64.i32";
   else
      name = "llvm.amdgcn.icmp.i32.i32";

   LLVMValueRef args[3] = {value, ctx->i32_0, LLVMConstInt(ctx->i32, LLVMIntNE, 0)};

   /* We currently have no other way to prevent LLVM from lifting the icmp
    * calls to a dominating basic block.
    */
   ac_build_optimization_barrier(ctx, &args[0], false);

   args[0] = ac_to_integer(ctx, args[0]);

   return ac_build_intrinsic(
      ctx, name, ctx->iN_wavemask, args, 3,
      AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
}

/* Like ac_build_ballot for an i1 value, but without the optimization
 * barrier; returns the lane mask as a wave-sized integer (SGPR). */
LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   const char *name;

   if (ctx->wave_size == 64)
      name = "llvm.amdgcn.icmp.i64.i1";
   else
      name = "llvm.amdgcn.icmp.i32.i1";

   LLVMValueRef args[3] = {
      value,
      ctx->i1false,
      LLVMConstInt(ctx->i32, LLVMIntNE, 0),
   };

   return ac_build_intrinsic(
      ctx, name, ctx->iN_wavemask, args, 3,
      AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
}

/* True iff \p value is true in every active lane. */
LLVMValueRef ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
   LLVMValueRef vote_set = ac_build_ballot(ctx, value);
   return LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, "");
}

/* True iff \p value is true in at least one active lane. */
LLVMValueRef ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef vote_set = ac_build_ballot(ctx, value);
   return LLVMBuildICmp(ctx->builder, LLVMIntNE, vote_set, LLVMConstInt(ctx->iN_wavemask, 0, 0),
                        "");
}

/* True iff \p value has the same boolean value in all active lanes
 * (all true or all false). */
LLVMValueRef ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
   LLVMValueRef vote_set = ac_build_ballot(ctx, value);

   LLVMValueRef all = LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, "");
   LLVMValueRef none =
      LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, LLVMConstInt(ctx->iN_wavemask, 0, 0), "");
   return LLVMBuildOr(ctx->builder, all, none, "");
}

/* Gather values[component .. component+value_count) into a vector.
 * With value_count == 1, returns values[component] as a scalar. */
LLVMValueRef ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                            unsigned value_count, unsigned component)
{
   LLVMValueRef vec = NULL;

   if (value_count == 1) {
      return values[component];
   } else if (!value_count)
      unreachable("value_count is 0");

   for (unsigned i = component; i < value_count + component; i++) {
      LLVMValueRef value = values[i];

      if (i == component)
         vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(value), value_count));
      LLVMValueRef index = LLVMConstInt(ctx->i32, i - component, false);
      vec = LLVMBuildInsertElement(ctx->builder, vec, value, index, "");
   }
   return vec;
}

/* Gather value_count values (strided by value_stride) into a vector.
 * If \p load is set, each entry is a pointer and is loaded first.
 * Unless \p always_vector, a single value is returned as a scalar. */
LLVMValueRef ac_build_gather_values_extended(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                             unsigned value_count, unsigned value_stride, bool load,
                                             bool always_vector)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef vec = NULL;
   unsigned i;

   if (value_count == 1 && !always_vector) {
      if (load)
         return LLVMBuildLoad(builder, values[0], "");
      return values[0];
   } else if (!value_count)
      unreachable("value_count is 0");

   for (i = 0; i < value_count; i++) {
      LLVMValueRef value = values[i * value_stride];
      if (load)
         value = LLVMBuildLoad(builder, value, "");

      if (!i)
         vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(value), value_count));
      LLVMValueRef index = LLVMConstInt(ctx->i32, i, false);
      vec = LLVMBuildInsertElement(builder, vec, value, index, "");
   }
   return vec;
}

/* Gather value_count contiguous values into a vector (scalar if count==1). */
LLVMValueRef ac_build_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                    unsigned value_count)
{
   return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false);
}

/* Concatenate two scalars/vectors into one vector of all their components. */
LLVMValueRef ac_build_concat(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   unsigned a_size = ac_get_llvm_num_components(a);
   unsigned b_size = ac_get_llvm_num_components(b);

   LLVMValueRef *elems = alloca((a_size + b_size) * sizeof(LLVMValueRef));
   for (unsigned i = 0; i < a_size; i++)
      elems[i] = ac_llvm_extract_elem(ctx, a, i);
   for (unsigned i = 0; i < b_size; i++)
      elems[a_size + i] = ac_llvm_extract_elem(ctx, b, i);

   return ac_build_gather_values(ctx, elems, a_size + b_size);
}

/* Expand a scalar or vector to <dst_channels x type> by filling the remaining
 * channels with undef. Extract at most src_channels components from the input.
620 */ 621LLVMValueRef ac_build_expand(struct ac_llvm_context *ctx, LLVMValueRef value, 622 unsigned src_channels, unsigned dst_channels) 623{ 624 LLVMTypeRef elemtype; 625 LLVMValueRef *const chan = alloca(dst_channels * sizeof(LLVMValueRef)); 626 627 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) { 628 unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value)); 629 630 if (src_channels == dst_channels && vec_size == dst_channels) 631 return value; 632 633 src_channels = MIN2(src_channels, vec_size); 634 635 for (unsigned i = 0; i < src_channels; i++) 636 chan[i] = ac_llvm_extract_elem(ctx, value, i); 637 638 elemtype = LLVMGetElementType(LLVMTypeOf(value)); 639 } else { 640 if (src_channels) { 641 assert(src_channels == 1); 642 chan[0] = value; 643 } 644 elemtype = LLVMTypeOf(value); 645 } 646 647 for (unsigned i = src_channels; i < dst_channels; i++) 648 chan[i] = LLVMGetUndef(elemtype); 649 650 return ac_build_gather_values(ctx, chan, dst_channels); 651} 652 653/* Extract components [start, start + channels) from a vector. 654 */ 655LLVMValueRef ac_extract_components(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned start, 656 unsigned channels) 657{ 658 LLVMValueRef *const chan = alloca(channels * sizeof(LLVMValueRef)); 659 660 for (unsigned i = 0; i < channels; i++) 661 chan[i] = ac_llvm_extract_elem(ctx, value, i + start); 662 663 return ac_build_gather_values(ctx, chan, channels); 664} 665 666/* Expand a scalar or vector to <4 x type> by filling the remaining channels 667 * with undef. Extract at most num_channels components from the input. 
668 */ 669LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, LLVMValueRef value, 670 unsigned num_channels) 671{ 672 return ac_build_expand(ctx, value, num_channels, 4); 673} 674 675LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value) 676{ 677 unsigned type_size = ac_get_type_size(LLVMTypeOf(value)); 678 const char *name; 679 680 if (type_size == 2) 681 name = "llvm.rint.f16"; 682 else if (type_size == 4) 683 name = "llvm.rint.f32"; 684 else 685 name = "llvm.rint.f64"; 686 687 return ac_build_intrinsic(ctx, name, LLVMTypeOf(value), &value, 1, AC_FUNC_ATTR_READNONE); 688} 689 690LLVMValueRef ac_build_fdiv(struct ac_llvm_context *ctx, LLVMValueRef num, LLVMValueRef den) 691{ 692 unsigned type_size = ac_get_type_size(LLVMTypeOf(den)); 693 const char *name; 694 695 /* For doubles, we need precise division to pass GLCTS. */ 696 if (ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL && type_size == 8) 697 return LLVMBuildFDiv(ctx->builder, num, den, ""); 698 699 if (type_size == 2) 700 name = "llvm.amdgcn.rcp.f16"; 701 else if (type_size == 4) 702 name = "llvm.amdgcn.rcp.f32"; 703 else 704 name = "llvm.amdgcn.rcp.f64"; 705 706 LLVMValueRef rcp = 707 ac_build_intrinsic(ctx, name, LLVMTypeOf(den), &den, 1, AC_FUNC_ATTR_READNONE); 708 709 return LLVMBuildFMul(ctx->builder, num, rcp, ""); 710} 711 712/* See fast_idiv_by_const.h. */ 713/* Set: increment = util_fast_udiv_info::increment ? 
multiplier : 0; */ 714LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx, LLVMValueRef num, 715 LLVMValueRef multiplier, LLVMValueRef pre_shift, 716 LLVMValueRef post_shift, LLVMValueRef increment) 717{ 718 LLVMBuilderRef builder = ctx->builder; 719 720 num = LLVMBuildLShr(builder, num, pre_shift, ""); 721 num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""), 722 LLVMBuildZExt(builder, multiplier, ctx->i64, ""), ""); 723 num = LLVMBuildAdd(builder, num, LLVMBuildZExt(builder, increment, ctx->i64, ""), ""); 724 num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), ""); 725 num = LLVMBuildTrunc(builder, num, ctx->i32, ""); 726 return LLVMBuildLShr(builder, num, post_shift, ""); 727} 728 729/* See fast_idiv_by_const.h. */ 730/* If num != UINT_MAX, this more efficient version can be used. */ 731/* Set: increment = util_fast_udiv_info::increment; */ 732LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx, LLVMValueRef num, 733 LLVMValueRef multiplier, LLVMValueRef pre_shift, 734 LLVMValueRef post_shift, LLVMValueRef increment) 735{ 736 LLVMBuilderRef builder = ctx->builder; 737 738 num = LLVMBuildLShr(builder, num, pre_shift, ""); 739 num = LLVMBuildNUWAdd(builder, num, increment, ""); 740 num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""), 741 LLVMBuildZExt(builder, multiplier, ctx->i64, ""), ""); 742 num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), ""); 743 num = LLVMBuildTrunc(builder, num, ctx->i32, ""); 744 return LLVMBuildLShr(builder, num, post_shift, ""); 745} 746 747/* See fast_idiv_by_const.h. */ 748/* Both operands must fit in 31 bits and the divisor must not be 1. 
*/ 749LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx, LLVMValueRef num, 750 LLVMValueRef multiplier, LLVMValueRef post_shift) 751{ 752 LLVMBuilderRef builder = ctx->builder; 753 754 num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""), 755 LLVMBuildZExt(builder, multiplier, ctx->i64, ""), ""); 756 num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), ""); 757 num = LLVMBuildTrunc(builder, num, ctx->i32, ""); 758 return LLVMBuildLShr(builder, num, post_shift, ""); 759} 760 761/* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27 762 * of the OpenGL 4.5 (Compatibility Profile) specification, except ma is 763 * already multiplied by two. id is the cube face number. 764 */ 765struct cube_selection_coords { 766 LLVMValueRef stc[2]; 767 LLVMValueRef ma; 768 LLVMValueRef id; 769}; 770 771static void build_cube_intrinsic(struct ac_llvm_context *ctx, LLVMValueRef in[3], 772 struct cube_selection_coords *out) 773{ 774 LLVMTypeRef f32 = ctx->f32; 775 776 out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc", f32, in, 3, AC_FUNC_ATTR_READNONE); 777 out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc", f32, in, 3, AC_FUNC_ATTR_READNONE); 778 out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema", f32, in, 3, AC_FUNC_ATTR_READNONE); 779 out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid", f32, in, 3, AC_FUNC_ATTR_READNONE); 780} 781 782/** 783 * Build a manual selection sequence for cube face sc/tc coordinates and 784 * major axis vector (multiplied by 2 for consistency) for the given 785 * vec3 \p coords, for the face implied by \p selcoords. 786 * 787 * For the major axis, we always adjust the sign to be in the direction of 788 * selcoords.ma; i.e., a positive out_ma means that coords is pointed towards 789 * the selcoords major axis. 
790 */ 791static void build_cube_select(struct ac_llvm_context *ctx, 792 const struct cube_selection_coords *selcoords, 793 const LLVMValueRef *coords, LLVMValueRef *out_st, 794 LLVMValueRef *out_ma) 795{ 796 LLVMBuilderRef builder = ctx->builder; 797 LLVMTypeRef f32 = LLVMTypeOf(coords[0]); 798 LLVMValueRef is_ma_positive; 799 LLVMValueRef sgn_ma; 800 LLVMValueRef is_ma_z, is_not_ma_z; 801 LLVMValueRef is_ma_y; 802 LLVMValueRef is_ma_x; 803 LLVMValueRef sgn; 804 LLVMValueRef tmp; 805 806 is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->ma, LLVMConstReal(f32, 0.0), ""); 807 sgn_ma = LLVMBuildSelect(builder, is_ma_positive, LLVMConstReal(f32, 1.0), 808 LLVMConstReal(f32, -1.0), ""); 809 810 is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), ""); 811 is_not_ma_z = LLVMBuildNot(builder, is_ma_z, ""); 812 is_ma_y = LLVMBuildAnd( 813 builder, is_not_ma_z, 814 LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), ""); 815 is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), ""); 816 817 /* Select sc */ 818 tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], ""); 819 sgn = LLVMBuildSelect( 820 builder, is_ma_y, LLVMConstReal(f32, 1.0), 821 LLVMBuildSelect(builder, is_ma_z, sgn_ma, LLVMBuildFNeg(builder, sgn_ma, ""), ""), ""); 822 out_st[0] = LLVMBuildFMul(builder, tmp, sgn, ""); 823 824 /* Select tc */ 825 tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], ""); 826 sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma, LLVMConstReal(f32, -1.0), ""); 827 out_st[1] = LLVMBuildFMul(builder, tmp, sgn, ""); 828 829 /* Select ma */ 830 tmp = LLVMBuildSelect(builder, is_ma_z, coords[2], 831 LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), ""); 832 tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE); 833 *out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), ""); 834} 835 836void 
ac_prepare_cube_coords(struct ac_llvm_context *ctx, bool is_deriv, bool is_array, bool is_lod,
                       LLVMValueRef *coords_arg, LLVMValueRef *derivs_arg)
{

   LLVMBuilderRef builder = ctx->builder;
   struct cube_selection_coords selcoords;
   LLVMValueRef coords[3];
   LLVMValueRef invma;

   if (is_array && !is_lod) {
      /* Round the layer component (coords_arg[3]) before it is folded into
       * the Z coordinate below. */
      LLVMValueRef tmp = ac_build_round(ctx, coords_arg[3]);

      /* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:
       *
       *    "For Array forms, the array layer used will be
       *
       *       max(0, min(d−1, floor(layer+0.5)))
       *
       *     where d is the depth of the texture array and layer
       *     comes from the component indicated in the tables below.
       *     Workaround for an issue where the layer is taken from a
       *     helper invocation which happens to fall on a different
       *     layer due to extrapolation."
       *
       * GFX8 and earlier attempt to implement this in hardware by
       * clamping the value of coords[2] = (8 * layer) + face.
       * Unfortunately, this means that we end up with the wrong
       * face when clamping occurs.
       *
       * Clamp the layer earlier to work around the issue.
       */
      if (ctx->chip_class <= GFX8) {
         LLVMValueRef ge0;
         ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, "");
         tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, "");
      }

      coords_arg[3] = tmp;
   }

   build_cube_intrinsic(ctx, coords_arg, &selcoords);

   /* invma = 1 / |ma|, used to project s/t/derivatives onto the face plane. */
   invma =
      ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE);
   invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma);

   for (int i = 0; i < 2; ++i)
      coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, "");

   coords[2] = selcoords.id;

   if (is_deriv && derivs_arg) {
      LLVMValueRef derivs[4];
      int axis;

      /* Convert cube derivatives to 2D derivatives. */
      for (axis = 0; axis < 2; axis++) {
         LLVMValueRef deriv_st[2];
         LLVMValueRef deriv_ma;

         /* Transform the derivative alongside the texture
          * coordinate. Mathematically, the correct formula is
          * as follows. Assume we're projecting onto the +Z face
          * and denote by dx/dh the derivative of the (original)
          * X texture coordinate with respect to horizontal
          * window coordinates. The projection onto the +Z face
          * plane is:
          *
          *   f(x,z) = x/z
          *
          * Then df/dh = df/dx * dx/dh + df/dz * dz/dh
          *            = 1/z * dx/dh - x/z * 1/z * dz/dh.
          *
          * This motivates the implementation below.
          *
          * Whether this actually gives the expected results for
          * apps that might feed in derivatives obtained via
          * finite differences is anyone's guess. The OpenGL spec
          * seems awfully quiet about how textureGrad for cube
          * maps should be handled.
          */
         build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3], deriv_st, &deriv_ma);

         deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, "");

         for (int i = 0; i < 2; ++i)
            derivs[axis * 2 + i] =
               LLVMBuildFSub(builder, LLVMBuildFMul(builder, deriv_st[i], invma, ""),
                             LLVMBuildFMul(builder, deriv_ma, coords[i], ""), "");
      }

      memcpy(derivs_arg, derivs, sizeof(derivs));
   }

   /* Shift the texture coordinate. This must be applied after the
    * derivative calculation.
    */
   for (int i = 0; i < 2; ++i)
      coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), "");

   if (is_array) {
      /* for cube arrays coord.z = coord.w(array_index) * 8 + face */
      /* coords_arg.w component - array_index for cube arrays */
      coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]);
   }

   memcpy(coords_arg, coords, sizeof(coords));
}

/* Two-stage FS input interpolation at (i, j) via llvm.amdgcn.interp.p1/p2.
 * Returns the interpolated f32 value for the given attribute/channel. */
LLVMValueRef ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
                                LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
                                LLVMValueRef j)
{
   LLVMValueRef args[5];
   LLVMValueRef p1;

   args[0] = i;
   args[1] = llvm_chan;
   args[2] = attr_number;
   args[3] = params;

   p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1", ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);

   args[0] = p1;
   args[1] = j;
   args[2] = llvm_chan;
   args[3] = attr_number;
   args[4] = params;

   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2", ctx->f32, args, 5,
                             AC_FUNC_ATTR_READNONE);
}

/* f16 variant of ac_build_fs_interp; high_16bits selects the upper half of
 * the packed 16-bit attribute. Returns an f16 value. */
LLVMValueRef ac_build_fs_interp_f16(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
                                    LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
                                    LLVMValueRef j, bool high_16bits)
{
   LLVMValueRef args[6];
   LLVMValueRef p1;

   args[0] = i;
   args[1] = llvm_chan;
   args[2] = attr_number;
   args[3] = high_16bits ? ctx->i1true : ctx->i1false;
   args[4] = params;

   p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", ctx->f32, args, 5,
                           AC_FUNC_ATTR_READNONE);

   args[0] = p1;
   args[1] = j;
   args[2] = llvm_chan;
   args[3] = attr_number;
   args[4] = high_16bits ?
ctx->i1true : ctx->i1false;
   args[5] = params;

   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", ctx->f16, args, 6,
                             AC_FUNC_ATTR_READNONE);
}

/* Read an FS input without barycentric interpolation (flat/constant case),
 * using llvm.amdgcn.interp.mov. */
LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, LLVMValueRef parameter,
                                    LLVMValueRef llvm_chan, LLVMValueRef attr_number,
                                    LLVMValueRef params)
{
   LLVMValueRef args[4];

   args[0] = parameter;
   args[1] = llvm_chan;
   args[2] = attr_number;
   args[3] = params;

   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov", ctx->f32, args, 4,
                             AC_FUNC_ATTR_READNONE);
}

/* &base_ptr[index] as a single-index GEP. */
LLVMValueRef ac_build_gep_ptr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                              LLVMValueRef index)
{
   return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
}

/* &base_ptr[0][index] — GEP through a pointer-to-array. */
LLVMValueRef ac_build_gep0(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index)
{
   LLVMValueRef indices[2] = {
      ctx->i32_0,
      index,
   };
   return LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, "");
}

/* ptr + index, cast back to the original pointer type. */
LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMValueRef index)
{
   return LLVMBuildPointerCast(ctx->builder, LLVMBuildGEP(ctx->builder, ptr, &index, 1, ""),
                               LLVMTypeOf(ptr), "");
}

/* base_ptr[index] = value. */
void ac_build_indexed_store(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index,
                            LLVMValueRef value)
{
   LLVMBuildStore(ctx->builder, value, ac_build_gep0(ctx, base_ptr, index));
}

/**
 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
 * It's equivalent to doing a load from &base_ptr[index].
 *
 * \param base_ptr  Where the array starts.
 * \param index     The element index into the array.
 * \param uniform   Whether the base_ptr and index can be assumed to be
 *                  dynamically uniform (i.e. load to an SGPR)
 * \param invariant Whether the load is invariant (no other opcodes affect it)
 * \param no_unsigned_wraparound
 *    For all possible re-associations and re-distributions of an expression
 *    "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs
 *    without inbounds in base_ptr), this parameter is true if "addr + offset"
 *    does not result in an unsigned integer wraparound. This is used for
 *    optimal code generation of 32-bit pointer arithmetic.
 *
 *    For example, a 32-bit immediate offset that causes a 32-bit unsigned
 *    integer wraparound can't be an imm offset in s_load_dword, because
 *    the instruction performs "addr + offset" in 64 bits.
 *
 *    Expected usage for bindless textures by chaining GEPs:
 *      // possible unsigned wraparound, don't use InBounds:
 *      ptr1 = LLVMBuildGEP(base_ptr, index);
 *      image = load(ptr1); // becomes "s_load ptr1, 0"
 *
 *      ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);
 *      sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds
 */
static LLVMValueRef ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                                         LLVMValueRef index, bool uniform, bool invariant,
                                         bool no_unsigned_wraparound)
{
   LLVMValueRef pointer, result;

   /* InBounds is only safe when the caller guarantees no unsigned wraparound
    * and the pointer lives in the 32-bit const address space. */
   if (no_unsigned_wraparound &&
       LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT)
      pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, &index, 1, "");
   else
      pointer = LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");

   if (uniform)
      LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
   result = LLVMBuildLoad(ctx->builder, pointer, "");
   if (invariant)
      LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
   LLVMSetAlignment(result, 4);
   return result;
}

LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                           LLVMValueRef index)
{
   return ac_build_load_custom(ctx, base_ptr, index, false, false, false);
}

/* Non-uniform, non-invariant indexed load. */
LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                                     LLVMValueRef index)
{
   return ac_build_load_custom(ctx, base_ptr, index, false, true, false);
}

/* This assumes that there is no unsigned integer wraparound during the address
 * computation, excluding all GEPs within base_ptr. */
LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                                   LLVMValueRef index)
{
   return ac_build_load_custom(ctx, base_ptr, index, true, true, true);
}

/* See ac_build_load_custom() documentation. */
LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
                                                   LLVMValueRef base_ptr, LLVMValueRef index)
{
   return ac_build_load_custom(ctx, base_ptr, index, true, true, false);
}

/* On GFX10+, loads with GLC also need DLC set; fold that into the policy. */
static unsigned get_load_cache_policy(struct ac_llvm_context *ctx, unsigned cache_policy)
{
   return cache_policy | (ctx->chip_class >= GFX10 && cache_policy & ac_glc ? ac_dlc : 0);
}

/* Emit a raw/struct (optionally .format) buffer store intrinsic; the
 * intrinsic name is derived from the data type and the indexing kind. */
static void ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef data, LLVMValueRef vindex,
                                         LLVMValueRef voffset, LLVMValueRef soffset,
                                         unsigned cache_policy, bool use_format, bool structurized)
{
   LLVMValueRef args[6];
   int idx = 0;
   args[idx++] = data;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[idx++] = vindex ? vindex : ctx->i32_0;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0);
   const char *indexing_kind = structurized ? "struct" : "raw";
   char name[256], type_name[8];

   ac_build_type_name_for_intr(LLVMTypeOf(data), type_name, sizeof(type_name));

   if (use_format) {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s", indexing_kind,
               type_name);
   } else {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s", indexing_kind, type_name);
   }

   ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
}

/* Structurized format store (llvm.amdgcn.struct.buffer.store.format.*). */
void ac_build_buffer_store_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef data,
                                  LLVMValueRef vindex, LLVMValueRef voffset, unsigned cache_policy)
{
   ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset, NULL, cache_policy, true, true);
}

/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
 * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
 * or v4i32 (num_channels=3,4).
 */
void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
                                 unsigned num_channels, LLVMValueRef voffset, LLVMValueRef soffset,
                                 unsigned inst_offset, unsigned cache_policy)
{
   /* Split 3 channel stores. */
   if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false)) {
      LLVMValueRef v[3], v01;

      for (int i = 0; i < 3; i++) {
         v[i] = LLVMBuildExtractElement(ctx->builder, vdata, LLVMConstInt(ctx->i32, i, 0), "");
      }
      v01 = ac_build_gather_values(ctx, v, 2);

      ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset, soffset, inst_offset, cache_policy);
      ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset, soffset, inst_offset + 8,
                                  cache_policy);
      return;
   }

   /* SWIZZLE_ENABLE requires that soffset isn't folded into voffset
    * (voffset is swizzled, but soffset isn't swizzled).
    * llvm.amdgcn.buffer.store doesn't have a separate soffset parameter.
    */
   if (!(cache_policy & ac_swizzled)) {
      LLVMValueRef offset = soffset;

      if (inst_offset)
         offset = LLVMBuildAdd(ctx->builder, offset, LLVMConstInt(ctx->i32, inst_offset, 0), "");

      ac_build_buffer_store_common(ctx, rsrc, ac_to_float(ctx, vdata), ctx->i32_0, voffset, offset,
                                   cache_policy, false, false);
      return;
   }

   /* Swizzled path: use a typed (tbuffer) store that keeps soffset separate. */
   static const unsigned dfmts[] = {V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32,
                                    V_008F0C_BUF_DATA_FORMAT_32_32_32,
                                    V_008F0C_BUF_DATA_FORMAT_32_32_32_32};
   unsigned dfmt = dfmts[num_channels - 1];
   unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
   LLVMValueRef immoffset = LLVMConstInt(ctx->i32, inst_offset, 0);

   ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, immoffset, num_channels, dfmt,
                              nfmt, cache_policy);
}

/* Emit a raw/struct (optionally .format) buffer load intrinsic; the intrinsic
 * name and result type are derived from channel_type and num_channels
 * (3-channel loads are widened to 4 when vec3 isn't supported). */
static LLVMValueRef ac_build_buffer_load_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                                LLVMValueRef vindex, LLVMValueRef voffset,
                                                LLVMValueRef soffset, unsigned num_channels,
                                                LLVMTypeRef channel_type, unsigned cache_policy,
                                                bool can_speculate, bool use_format,
                                                bool structurized)
{
   LLVMValueRef args[5];
   int idx = 0;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[idx++] = vindex ? vindex : ctx->i32_0;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
   unsigned func =
      !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels;
   const char *indexing_kind = structurized ? "struct" : "raw";
   char name[256], type_name[8];

   /* D16 is only supported on gfx8+ */
   assert(!use_format || (channel_type != ctx->f16 && channel_type != ctx->i16) ||
          ctx->chip_class >= GFX8);

   LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type;
   ac_build_type_name_for_intr(type, type_name, sizeof(type_name));

   if (use_format) {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s", indexing_kind,
               type_name);
   } else {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s", indexing_kind, type_name);
   }

   return ac_build_intrinsic(ctx, name, type, args, idx, ac_get_load_intr_attribs(can_speculate));
}

/* General buffer load; prefers scalar (SMEM) loads when allow_smem and the
 * cache policy permits it, otherwise falls back to a raw VMEM buffer load. */
LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, int num_channels,
                                  LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
                                  unsigned inst_offset, LLVMTypeRef channel_type,
                                  unsigned cache_policy, bool can_speculate, bool allow_smem)
{
   LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0);
   if (voffset)
      offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
   if (soffset)
      offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");

   if (allow_smem && !(cache_policy & ac_slc) &&
       (!(cache_policy & ac_glc) || ctx->chip_class >= GFX8)) {
      assert(vindex == NULL);

      LLVMValueRef result[8];

      /* One s.buffer.load.f32 per channel, 4 bytes apart. */
      for (int i = 0; i < num_channels; i++) {
         if (i) {
            offset = LLVMBuildAdd(ctx->builder, offset, LLVMConstInt(ctx->i32, 4, 0), "");
         }
         LLVMValueRef args[3] = {
            rsrc,
            offset,
            LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0),
         };
         result[i] = ac_build_intrinsic(ctx, "llvm.amdgcn.s.buffer.load.f32", ctx->f32, args, 3,
                                        AC_FUNC_ATTR_READNONE);
      }
      if (num_channels == 1)
         return result[0];

      if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false))
         result[num_channels++] = LLVMGetUndef(ctx->f32);
      return ac_build_gather_values(ctx, result, num_channels);
   }

   return ac_build_buffer_load_common(ctx, rsrc, vindex, offset, ctx->i32_0, num_channels,
                                      channel_type, cache_policy, can_speculate, false, false);
}

/* Format buffer load; with tfe, uses inline assembly to also return the
 * TFE status word as a fifth element appended to the result. */
LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef vindex, LLVMValueRef voffset,
                                         unsigned num_channels, unsigned cache_policy,
                                         bool can_speculate, bool d16, bool tfe)
{
   if (tfe) {
      assert(!d16);

      char code[256];
      /* The definition in the assembly and the one in the constraint string
       * differs because of an assembler bug.
       */
      snprintf(code, sizeof(code),
               "v_mov_b32 v0, 0\n"
               "v_mov_b32 v1, 0\n"
               "v_mov_b32 v2, 0\n"
               "v_mov_b32 v3, 0\n"
               "v_mov_b32 v4, 0\n"
               "buffer_load_format_xyzw v[0:3], $1, $2, 0, idxen offen %s %s tfe %s\n"
               "s_waitcnt vmcnt(0)",
               cache_policy & ac_glc ? "glc" : "",
               cache_policy & ac_slc ? "slc" : "",
               cache_policy & ac_dlc ? "dlc" : "");

      LLVMTypeRef param_types[] = {ctx->v2i32, ctx->v4i32};
      LLVMTypeRef calltype = LLVMFunctionType(LLVMVectorType(ctx->f32, 5), param_types, 2, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(calltype, code, "=&{v[0:4]},v,s", false, false);

      LLVMValueRef addr_comp[2] = {vindex ? vindex : ctx->i32_0,
                                   voffset ? voffset : ctx->i32_0};

      LLVMValueRef args[] = {ac_build_gather_values(ctx, addr_comp, 2),
                             LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "")};
      LLVMValueRef res = LLVMBuildCall(ctx->builder, inlineasm, args, 2, "");

      return ac_build_concat(ctx, ac_trim_vector(ctx, res, num_channels),
                             ac_llvm_extract_elem(ctx, res, 4));
   }

   return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0, num_channels,
                                      d16 ?
ctx->f16 : ctx->f32, cache_policy, can_speculate, true,
                                      true);
}

/* Emit a raw/struct typed-buffer load (llvm.amdgcn.*.tbuffer.load.*) with an
 * explicit data/num format; immoffset is folded into voffset. */
static LLVMValueRef ac_build_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                          LLVMValueRef vindex, LLVMValueRef voffset,
                                          LLVMValueRef soffset, LLVMValueRef immoffset,
                                          unsigned num_channels, unsigned dfmt, unsigned nfmt,
                                          unsigned cache_policy, bool can_speculate,
                                          bool structurized)
{
   voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");

   LLVMValueRef args[6];
   int idx = 0;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[idx++] = vindex ? vindex : ctx->i32_0;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->chip_class, dfmt, nfmt), 0);
   args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
   unsigned func =
      !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
   const char *indexing_kind = structurized ? "struct" : "raw";
   char name[256], type_name[8];

   LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;
   ac_build_type_name_for_intr(type, type_name, sizeof(type_name));

   snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s", indexing_kind, type_name);

   return ac_build_intrinsic(ctx, name, type, args, idx, ac_get_load_intr_attribs(can_speculate));
}

/* Structurized typed-buffer load wrapper. */
LLVMValueRef ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                          LLVMValueRef vindex, LLVMValueRef voffset,
                                          LLVMValueRef soffset, LLVMValueRef immoffset,
                                          unsigned num_channels, unsigned dfmt, unsigned nfmt,
                                          unsigned cache_policy, bool can_speculate)
{
   return ac_build_tbuffer_load(ctx, rsrc, vindex, voffset, soffset, immoffset, num_channels, dfmt,
                                nfmt, cache_policy, can_speculate, true);
}

/* Single 16-bit raw buffer load (returns i16). */
LLVMValueRef ac_build_tbuffer_load_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef voffset, LLVMValueRef soffset,
                                         LLVMValueRef immoffset, unsigned cache_policy)
{
   voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");

   return ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, 1, ctx->i16,
                                      cache_policy, false, false, false);
}

/* Single 8-bit raw buffer load (returns i8). */
LLVMValueRef ac_build_tbuffer_load_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                        LLVMValueRef voffset, LLVMValueRef soffset,
                                        LLVMValueRef immoffset, unsigned cache_policy)
{
   voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");

   return ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, 1, ctx->i8, cache_policy,
                                      false, false, false);
}

/**
 * Convert an 11- or 10-bit unsigned floating point number to an f32.
 *
 * The input exponent is expected to be biased analogous to IEEE-754, i.e. by
 * 2^(exp_bits-1) - 1 (as defined in OpenGL and other graphics APIs).
 */
static LLVMValueRef ac_ufN_to_float(struct ac_llvm_context *ctx, LLVMValueRef src,
                                    unsigned exp_bits, unsigned mant_bits)
{
   assert(LLVMTypeOf(src) == ctx->i32);

   LLVMValueRef tmp;
   LLVMValueRef mantissa;
   mantissa =
      LLVMBuildAnd(ctx->builder, src, LLVMConstInt(ctx->i32, (1 << mant_bits) - 1, false), "");

   /* Converting normal numbers is just a shift + correcting the exponent bias */
   unsigned normal_shift = 23 - mant_bits;
   unsigned bias_shift = 127 - ((1 << (exp_bits - 1)) - 1);
   LLVMValueRef shifted, normal;

   shifted = LLVMBuildShl(ctx->builder, src, LLVMConstInt(ctx->i32, normal_shift, false), "");
   normal =
      LLVMBuildAdd(ctx->builder, shifted, LLVMConstInt(ctx->i32, bias_shift << 23, false), "");

   /* Converting nan/inf numbers is the same, but with a different exponent update */
   LLVMValueRef naninf;
   naninf = LLVMBuildOr(ctx->builder, normal, LLVMConstInt(ctx->i32, 0xff << 23, false), "");

   /* Converting denormals is the complex case: determine the leading zeros of the
    * mantissa to obtain the correct shift for the mantissa and exponent correction.
    */
   LLVMValueRef denormal;
   LLVMValueRef params[2] = {
      mantissa, ctx->i1true, /* result can be undef when arg is 0 */
   };
   LLVMValueRef ctlz =
      ac_build_intrinsic(ctx, "llvm.ctlz.i32", ctx->i32, params, 2, AC_FUNC_ATTR_READNONE);

   /* Shift such that the leading 1 ends up as the LSB of the exponent field. */
   tmp = LLVMBuildSub(ctx->builder, ctlz, LLVMConstInt(ctx->i32, 8, false), "");
   denormal = LLVMBuildShl(ctx->builder, mantissa, tmp, "");

   unsigned denormal_exp = bias_shift + (32 - mant_bits) - 1;
   tmp = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, denormal_exp, false), ctlz, "");
   tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(ctx->i32, 23, false), "");
   denormal = LLVMBuildAdd(ctx->builder, denormal, tmp, "");

   /* Select the final result. */
   LLVMValueRef result;

   /* src >= (all exponent bits set) << mant_bits  ->  nan/inf */
   tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
                       LLVMConstInt(ctx->i32, ((1ULL << exp_bits) - 1) << mant_bits, false), "");
   result = LLVMBuildSelect(ctx->builder, tmp, naninf, normal, "");

   /* src >= 1 << mant_bits  ->  normal; otherwise denormal */
   tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
                       LLVMConstInt(ctx->i32, 1ULL << mant_bits, false), "");
   result = LLVMBuildSelect(ctx->builder, tmp, result, denormal, "");

   /* src == 0  ->  0 */
   tmp = LLVMBuildICmp(ctx->builder, LLVMIntNE, src, ctx->i32_0, "");
   result = LLVMBuildSelect(ctx->builder, tmp, result, ctx->i32_0, "");

   return ac_to_float(ctx, result);
}

/**
 * Generate a fully general open coded buffer format fetch with all required
 * fixups suitable for vertex fetch, using non-format buffer loads.
 *
 * Some combinations of argument values have special interpretations:
 * - size = 8 bytes, format = fixed indicates PIPE_FORMAT_R11G11B10_FLOAT
 * - size = 8 bytes, format != {float,fixed} indicates a 2_10_10_10 data format
 *
 * \param log_size      log(size of channel in bytes)
 * \param num_channels  number of channels (1 to 4)
 * \param format        AC_FETCH_FORMAT_xxx value
 * \param reverse       whether XYZ channels are reversed
 * \param known_aligned whether the source is known to be aligned to hardware's
 *                      effective element size for loading the given format
 *                      (note: this means dword alignment for 8_8_8_8, 16_16, etc.)
 * \param rsrc          buffer resource descriptor
 * \return the resulting vector of floats or integers bitcast to <4 x i32>
 */
LLVMValueRef ac_build_opencoded_load_format(struct ac_llvm_context *ctx, unsigned log_size,
                                            unsigned num_channels, unsigned format, bool reverse,
                                            bool known_aligned, LLVMValueRef rsrc,
                                            LLVMValueRef vindex, LLVMValueRef voffset,
                                            LLVMValueRef soffset, unsigned cache_policy,
                                            bool can_speculate)
{
   LLVMValueRef tmp;
   unsigned load_log_size = log_size;
   unsigned load_num_channels = num_channels;
   if (log_size == 3) {
      load_log_size = 2;
      if (format == AC_FETCH_FORMAT_FLOAT) {
         load_num_channels = 2 * num_channels;
      } else {
         load_num_channels = 1; /* 10_11_11 or 2_10_10_10 */
      }
   }

   int log_recombine = 0;
   if ((ctx->chip_class == GFX6 || ctx->chip_class >= GFX10) && !known_aligned) {
      /* Avoid alignment restrictions by loading one byte at a time. */
      load_num_channels <<= load_log_size;
      log_recombine = load_log_size;
      load_log_size = 0;
   } else if (load_num_channels == 2 || load_num_channels == 4) {
      log_recombine = -util_logbase2(load_num_channels);
      load_num_channels = 1;
      load_log_size += -log_recombine;
   }

   /* Issue the component loads. */
   LLVMValueRef loads[32]; /* up to 32 bytes */
   for (unsigned i = 0; i < load_num_channels; ++i) {
      tmp =
         LLVMBuildAdd(ctx->builder, soffset, LLVMConstInt(ctx->i32, i << load_log_size, false), "");
      LLVMTypeRef channel_type =
         load_log_size == 0 ? ctx->i8 : load_log_size == 1 ? ctx->i16 : ctx->i32;
      unsigned num_channels = 1 << (MAX2(load_log_size, 2) - 2);
      loads[i] =
         ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, tmp, num_channels, channel_type,
                                     cache_policy, can_speculate, false, true);
      if (load_log_size >= 2)
         loads[i] = ac_to_integer(ctx, loads[i]);
   }

   if (log_recombine > 0) {
      /* Recombine bytes if necessary (GFX6 only) */
      LLVMTypeRef dst_type = log_recombine == 2 ? ctx->i32 : ctx->i16;

      for (unsigned src = 0, dst = 0; src < load_num_channels; ++dst) {
         LLVMValueRef accum = NULL;
         for (unsigned i = 0; i < (1 << log_recombine); ++i, ++src) {
            tmp = LLVMBuildZExt(ctx->builder, loads[src], dst_type, "");
            if (i == 0) {
               accum = tmp;
            } else {
               tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(dst_type, 8 * i, false), "");
               accum = LLVMBuildOr(ctx->builder, accum, tmp, "");
            }
         }
         loads[dst] = accum;
      }
   } else if (log_recombine < 0) {
      /* Split vectors of dwords */
      if (load_log_size > 2) {
         assert(load_num_channels == 1);
         LLVMValueRef loaded = loads[0];
         unsigned log_split = load_log_size - 2;
         log_recombine += log_split;
         load_num_channels = 1 << log_split;
         load_log_size = 2;
         for (unsigned i = 0; i < load_num_channels; ++i) {
            tmp = LLVMConstInt(ctx->i32, i, false);
            loads[i] = LLVMBuildExtractElement(ctx->builder, loaded, tmp, "");
         }
      }

      /* Further split dwords and shorts if required */
      if (log_recombine < 0) {
         for (unsigned src = load_num_channels, dst = load_num_channels << -log_recombine; src > 0;
              --src) {
            unsigned dst_bits = 1 << (3 + load_log_size + log_recombine);
            LLVMTypeRef dst_type = LLVMIntTypeInContext(ctx->context, dst_bits);
            LLVMValueRef loaded = loads[src - 1];
            LLVMTypeRef loaded_type = LLVMTypeOf(loaded);
            for (unsigned i = 1 << -log_recombine; i > 0; --i, --dst) {
               tmp = LLVMConstInt(loaded_type, dst_bits * (i - 1), false);
               tmp = LLVMBuildLShr(ctx->builder, loaded, tmp, "");
               loads[dst - 1] = LLVMBuildTrunc(ctx->builder, tmp, dst_type, "");
            }
         }
      }
   }

   if (log_size == 3) {
      if (format == AC_FETCH_FORMAT_FLOAT) {
         /* Pairs of dwords -> f64 channels. */
         for (unsigned i = 0; i < num_channels; ++i) {
            tmp = ac_build_gather_values(ctx, &loads[2 * i], 2);
            loads[i] = LLVMBuildBitCast(ctx->builder, tmp, ctx->f64, "");
         }
      } else if (format == AC_FETCH_FORMAT_FIXED) {
         /* 10_11_11_FLOAT */
         LLVMValueRef data = loads[0];
         LLVMValueRef i32_2047 = LLVMConstInt(ctx->i32, 2047, false);
         LLVMValueRef r = LLVMBuildAnd(ctx->builder, data, i32_2047, "");
         tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 11, false), "");
         LLVMValueRef g = LLVMBuildAnd(ctx->builder, tmp, i32_2047, "");
         LLVMValueRef b = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 22, false), "");

         loads[0] = ac_to_integer(ctx, ac_ufN_to_float(ctx, r, 5, 6));
         loads[1] = ac_to_integer(ctx, ac_ufN_to_float(ctx, g, 5, 6));
         loads[2] = ac_to_integer(ctx, ac_ufN_to_float(ctx, b, 5, 5));

         num_channels = 3;
         log_size = 2;
         format = AC_FETCH_FORMAT_FLOAT;
      } else {
         /* 2_10_10_10 data formats */
         LLVMValueRef data = loads[0];
         LLVMTypeRef i10 = LLVMIntTypeInContext(ctx->context, 10);
         LLVMTypeRef i2 = LLVMIntTypeInContext(ctx->context, 2);
         loads[0] = LLVMBuildTrunc(ctx->builder, data, i10, "");
         tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 10, false), "");
         loads[1] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
         tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 20, false), "");
         loads[2] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
         tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 30, false), "");
         loads[3] = LLVMBuildTrunc(ctx->builder, tmp, i2, "");

         num_channels = 4;
      }
   }

   if (format == AC_FETCH_FORMAT_FLOAT) {
      if (log_size != 2) {
         for (unsigned chan = 0; chan < num_channels; ++chan) {
            tmp = ac_to_float(ctx, loads[chan]);
            if (log_size == 3)
               tmp = LLVMBuildFPTrunc(ctx->builder, tmp, ctx->f32, "");
            else if (log_size == 1)
               tmp = LLVMBuildFPExt(ctx->builder, tmp, ctx->f32, "");
            loads[chan] = ac_to_integer(ctx, tmp);
         }
      }
   } else if (format == AC_FETCH_FORMAT_UINT) {
      if (log_size != 2) {
         for (unsigned chan = 0; chan < num_channels; ++chan)
            loads[chan] = LLVMBuildZExt(ctx->builder, loads[chan], ctx->i32, "");
      }
   } else if (format == AC_FETCH_FORMAT_SINT) {
      if (log_size != 2) {
         for (unsigned chan = 0; chan < num_channels; ++chan)
            loads[chan] = LLVMBuildSExt(ctx->builder, loads[chan], ctx->i32, "");
      }
   } else {
      /* Normalized / scaled / fixed formats: convert to float and scale. */
      bool unsign = format == AC_FETCH_FORMAT_UNORM || format == AC_FETCH_FORMAT_USCALED ||
                    format == AC_FETCH_FORMAT_UINT;

      for (unsigned chan = 0; chan < num_channels; ++chan) {
         if (unsign) {
            tmp = LLVMBuildUIToFP(ctx->builder, loads[chan], ctx->f32, "");
         } else {
            tmp = LLVMBuildSIToFP(ctx->builder, loads[chan], ctx->f32, "");
         }

         LLVMValueRef scale = NULL;
         if (format == AC_FETCH_FORMAT_FIXED) {
            assert(log_size == 2);
            scale = LLVMConstReal(ctx->f32, 1.0 / 0x10000);
         } else if (format == AC_FETCH_FORMAT_UNORM) {
            unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
            scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << bits) - 1));
         } else if (format == AC_FETCH_FORMAT_SNORM) {
            unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
            scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << (bits - 1)) - 1));
         }
         if (scale)
            tmp = LLVMBuildFMul(ctx->builder, tmp, scale, "");

         if (format == AC_FETCH_FORMAT_SNORM) {
            /* Clamp to [-1, 1] */
            LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
            LLVMValueRef clamp = LLVMBuildFCmp(ctx->builder, LLVMRealULT, tmp, neg_one, "");
            tmp = LLVMBuildSelect(ctx->builder, clamp, neg_one, tmp, "");
         }

         loads[chan] = ac_to_integer(ctx, tmp);
      }
   }

   /* Pad missing channels with (0, 0, 0, 1). */
   while (num_channels < 4) {
      if (format == AC_FETCH_FORMAT_UINT || format == AC_FETCH_FORMAT_SINT) {
         loads[num_channels] = num_channels == 3 ? ctx->i32_1 : ctx->i32_0;
      } else {
         loads[num_channels] = ac_to_integer(ctx, num_channels == 3 ? ctx->f32_1 : ctx->f32_0);
      }
      num_channels++;
   }

   if (reverse) {
      tmp = loads[0];
      loads[0] = loads[2];
      loads[2] = tmp;
   }

   return ac_build_gather_values(ctx, loads, 4);
}

/* Emit a raw/struct typed-buffer store (llvm.amdgcn.*.tbuffer.store.*) with an
 * explicit data/num format; immoffset is folded into voffset. */
static void ac_build_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                   LLVMValueRef vdata, LLVMValueRef vindex, LLVMValueRef voffset,
                                   LLVMValueRef soffset, LLVMValueRef immoffset,
                                   unsigned num_channels, unsigned dfmt, unsigned nfmt,
                                   unsigned cache_policy, bool structurized)
{
   voffset = LLVMBuildAdd(ctx->builder, voffset ? voffset : ctx->i32_0, immoffset, "");

   LLVMValueRef args[7];
   int idx = 0;
   args[idx++] = vdata;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[idx++] = vindex ? vindex : ctx->i32_0;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->chip_class, dfmt, nfmt), 0);
   args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0);
   unsigned func =
      !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
   const char *indexing_kind = structurized ? "struct" : "raw";
   char name[256], type_name[8];

   LLVMTypeRef type = func > 1 ?
LLVMVectorType(ctx->i32, func) : ctx->i32;
   ac_build_type_name_for_intr(type, type_name, sizeof(type_name));

   snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.store.%s", indexing_kind, type_name);

   ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
}

/* Structurized typed-buffer store wrapper. */
void ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                   LLVMValueRef vdata, LLVMValueRef vindex, LLVMValueRef voffset,
                                   LLVMValueRef soffset, LLVMValueRef immoffset,
                                   unsigned num_channels, unsigned dfmt, unsigned nfmt,
                                   unsigned cache_policy)
{
   ac_build_tbuffer_store(ctx, rsrc, vdata, vindex, voffset, soffset, immoffset, num_channels, dfmt,
                          nfmt, cache_policy, true);
}

/* Raw typed-buffer store wrapper (no vindex). */
void ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
                                LLVMValueRef voffset, LLVMValueRef soffset, LLVMValueRef immoffset,
                                unsigned num_channels, unsigned dfmt, unsigned nfmt,
                                unsigned cache_policy)
{
   ac_build_tbuffer_store(ctx, rsrc, vdata, NULL, voffset, soffset, immoffset, num_channels, dfmt,
                          nfmt, cache_policy, false);
}

/* Single 16-bit raw buffer store (vdata is bitcast to i16). */
void ac_build_tbuffer_store_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                  LLVMValueRef vdata, LLVMValueRef voffset, LLVMValueRef soffset,
                                  unsigned cache_policy)
{
   vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");

   ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, cache_policy, false,
                                false);
}

/* Single 8-bit raw buffer store (vdata is bitcast to i8). */
void ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
                                 LLVMValueRef voffset, LLVMValueRef soffset, unsigned cache_policy)
{
   vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");

   ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, cache_policy, false,
                                false);
}

/**
 * Set range metadata on an instruction. This can only be used on load and
 * call instructions. If you know an instruction can only produce the values
 * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
 * \p lo is the minimum value inclusive.
 * \p hi is the maximum value exclusive.
 */
void ac_set_range_metadata(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned lo,
                           unsigned hi)
{
   LLVMValueRef range_md, md_args[2];
   LLVMTypeRef type = LLVMTypeOf(value);
   LLVMContextRef context = LLVMGetTypeContext(type);

   md_args[0] = LLVMConstInt(type, lo, false);
   md_args[1] = LLVMConstInt(type, hi, false);
   range_md = LLVMMDNodeInContext(context, md_args, 2);
   LLVMSetMetadata(value, ctx->range_md_kind, range_md);
}

/* Lane index within the wave, via mbcnt over an all-ones wave mask. */
LLVMValueRef ac_get_thread_id(struct ac_llvm_context *ctx)
{
   return ac_build_mbcnt(ctx, LLVMConstInt(ctx->iN_wavemask, ~0ull, 0));
}

/*
 * AMD GCN implements derivatives using the local data store (LDS)
 * All writes to the LDS happen in all executing threads at
 * the same time. TID is the Thread ID for the current
 * thread and is a value between 0 and 63, representing
 * the thread's position in the wavefront.
 *
 * For the pixel shader threads are grouped into quads of four pixels.
 * The TIDs of the pixels of a quad are:
 *
 *  +------+------+
 *  |4n + 0|4n + 1|
 *  +------+------+
 *  |4n + 2|4n + 3|
 *  +------+------+
 *
 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
 * the current pixel's column, and masking with 0xfffffffe yields the TID
 * of the left pixel of the current pixel's row.
 *
 * Adding 1 yields the TID of the pixel to the right of the left pixel, and
 * adding 2 yields the TID of the pixel below the top pixel.
 */
LLVMValueRef ac_build_ddxy(struct ac_llvm_context *ctx, uint32_t mask, int idx, LLVMValueRef val)
{
   unsigned tl_lanes[4], trbl_lanes[4];
   char name[32], type[8];
   LLVMValueRef tl, trbl;
   LLVMTypeRef result_type;
   LLVMValueRef result;

   result_type = ac_to_float_type(ctx, LLVMTypeOf(val));

   /* 16-bit values are widened to i32 for the quad swizzle. */
   if (result_type == ctx->f16)
      val = LLVMBuildZExt(ctx->builder, val, ctx->i32, "");
   else if (result_type == ctx->v2f16)
      val = LLVMBuildBitCast(ctx->builder, val, ctx->i32, "");

   for (unsigned i = 0; i < 4; ++i) {
      tl_lanes[i] = i & mask;
      trbl_lanes[i] = (i & mask) + idx;
   }

   tl = ac_build_quad_swizzle(ctx, val, tl_lanes[0], tl_lanes[1], tl_lanes[2], tl_lanes[3]);
   trbl =
      ac_build_quad_swizzle(ctx, val, trbl_lanes[0], trbl_lanes[1], trbl_lanes[2], trbl_lanes[3]);

   if (result_type == ctx->f16) {
      tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, "");
      trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, "");
   }

   tl = LLVMBuildBitCast(ctx->builder, tl, result_type, "");
   trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, "");
   result = LLVMBuildFSub(ctx->builder, trbl, tl, "");

   ac_build_type_name_for_intr(result_type, type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type);

   return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0);
}

/* Emit s_sendmsg with the given message and wave id. */
void ac_build_sendmsg(struct ac_llvm_context *ctx, uint32_t msg, LLVMValueRef wave_id)
{
   LLVMValueRef args[2];
   args[0] = LLVMConstInt(ctx->i32, msg, false);
   args[1] = wave_id;
   ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0);
}

LLVMValueRef ac_build_imsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)
{
   LLVMValueRef msb =
      ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32", dst_type, &arg, 1, AC_FUNC_ATTR_READNONE);

   /* The HW returns the
last bit index from MSB, but NIR/TGSI wants 1847 * the index from LSB. Invert it by doing "31 - msb". */ 1848 msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false), msb, ""); 1849 1850 LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true); 1851 LLVMValueRef cond = 1852 LLVMBuildOr(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, ctx->i32_0, ""), 1853 LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, all_ones, ""), ""); 1854 1855 return LLVMBuildSelect(ctx->builder, cond, all_ones, msb, ""); 1856} 1857 1858LLVMValueRef ac_build_umsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type) 1859{ 1860 const char *intrin_name; 1861 LLVMTypeRef type; 1862 LLVMValueRef highest_bit; 1863 LLVMValueRef zero; 1864 unsigned bitsize; 1865 1866 bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg)); 1867 switch (bitsize) { 1868 case 64: 1869 intrin_name = "llvm.ctlz.i64"; 1870 type = ctx->i64; 1871 highest_bit = LLVMConstInt(ctx->i64, 63, false); 1872 zero = ctx->i64_0; 1873 break; 1874 case 32: 1875 intrin_name = "llvm.ctlz.i32"; 1876 type = ctx->i32; 1877 highest_bit = LLVMConstInt(ctx->i32, 31, false); 1878 zero = ctx->i32_0; 1879 break; 1880 case 16: 1881 intrin_name = "llvm.ctlz.i16"; 1882 type = ctx->i16; 1883 highest_bit = LLVMConstInt(ctx->i16, 15, false); 1884 zero = ctx->i16_0; 1885 break; 1886 case 8: 1887 intrin_name = "llvm.ctlz.i8"; 1888 type = ctx->i8; 1889 highest_bit = LLVMConstInt(ctx->i8, 7, false); 1890 zero = ctx->i8_0; 1891 break; 1892 default: 1893 unreachable(!"invalid bitsize"); 1894 break; 1895 } 1896 1897 LLVMValueRef params[2] = { 1898 arg, 1899 ctx->i1true, 1900 }; 1901 1902 LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, AC_FUNC_ATTR_READNONE); 1903 1904 /* The HW returns the last bit index from MSB, but TGSI/NIR wants 1905 * the index from LSB. Invert it by doing "31 - msb". 
*/ 1906 msb = LLVMBuildSub(ctx->builder, highest_bit, msb, ""); 1907 1908 if (bitsize == 64) { 1909 msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, ""); 1910 } else if (bitsize < 32) { 1911 msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, ""); 1912 } 1913 1914 /* check for zero */ 1915 return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""), 1916 LLVMConstInt(ctx->i32, -1, true), msb, ""); 1917} 1918 1919LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b) 1920{ 1921 char name[64], type[64]; 1922 1923 ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type)); 1924 snprintf(name, sizeof(name), "llvm.minnum.%s", type); 1925 LLVMValueRef args[2] = {a, b}; 1926 return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, AC_FUNC_ATTR_READNONE); 1927} 1928 1929LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b) 1930{ 1931 char name[64], type[64]; 1932 1933 ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type)); 1934 snprintf(name, sizeof(name), "llvm.maxnum.%s", type); 1935 LLVMValueRef args[2] = {a, b}; 1936 return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, AC_FUNC_ATTR_READNONE); 1937} 1938 1939LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b) 1940{ 1941 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, ""); 1942 return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); 1943} 1944 1945LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b) 1946{ 1947 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, ""); 1948 return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); 1949} 1950 1951LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b) 1952{ 1953 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, ""); 1954 return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); 1955} 1956 1957LLVMValueRef 
ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, a, b, "");
   return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
}

/* Clamp a floating-point value to [0, 1]. */
LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMTypeRef t = LLVMTypeOf(value);
   return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 0.0)),
                        LLVMConstReal(t, 1.0));
}

/* Emit an export (EXP) instruction described by \p a; compressed exports
 * pack two v2i16 operands, uncompressed exports take four 32-bit operands.
 */
void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
{
   LLVMValueRef args[9];

   args[0] = LLVMConstInt(ctx->i32, a->target, 0);
   args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);

   if (a->compr) {
      args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->v2i16, "");
      args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->v2i16, "");
      args[4] = LLVMConstInt(ctx->i1, a->done, 0);
      args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);

      ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16", ctx->voidt, args, 6, 0);
   } else {
      args[2] = a->out[0];
      args[3] = a->out[1];
      args[4] = a->out[2];
      args[5] = a->out[3];
      args[6] = LLVMConstInt(ctx->i1, a->done, 0);
      args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);

      ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32", ctx->voidt, args, 8, 0);
   }
}

/* Emit a "done" null export (no channels enabled, undef payload). */
void ac_build_export_null(struct ac_llvm_context *ctx)
{
   struct ac_export_args args;

   args.enabled_channels = 0x0; /* enabled channels */
   args.valid_mask = 1;         /* whether the EXEC mask is valid */
   args.done = 1;               /* DONE bit */
   args.target = V_008DFC_SQ_EXP_NULL;
   args.compr = 0;              /* COMPR flag (0 = 32-bit export) */
   args.out[0] = LLVMGetUndef(ctx->f32); /* R */
   args.out[1] = LLVMGetUndef(ctx->f32); /* G */
   args.out[2] = LLVMGetUndef(ctx->f32); /* B */
   args.out[3] = LLVMGetUndef(ctx->f32); /* A */

   ac_build_export(ctx, &args);
}

/* Number of coordinate operands (incl. array layer / sample index) per dim. */
static unsigned ac_num_coords(enum ac_image_dim dim)
{
   switch (dim) {
   case ac_image_1d:
      return 1;
   case ac_image_2d:
   case ac_image_1darray:
      return 2;
   case ac_image_3d:
   case ac_image_cube:
   case ac_image_2darray:
   case ac_image_2dmsaa:
      return 3;
   case ac_image_2darraymsaa:
      return 4;
   default:
      unreachable("ac_num_coords: bad dim");
   }
}

/* Number of derivative operands (ddx+ddy components) per dim. */
static unsigned ac_num_derivs(enum ac_image_dim dim)
{
   switch (dim) {
   case ac_image_1d:
   case ac_image_1darray:
      return 2;
   case ac_image_2d:
   case ac_image_2darray:
   case ac_image_cube:
      return 4;
   case ac_image_3d:
      return 6;
   case ac_image_2dmsaa:
   case ac_image_2darraymsaa:
   default:
      unreachable("derivatives not supported");
   }
}

/* Sub-opcode name used in llvm.amdgcn.image.atomic.* intrinsic names. */
static const char *get_atomic_name(enum ac_atomic_op op)
{
   switch (op) {
   case ac_atomic_swap:
      return "swap";
   case ac_atomic_add:
      return "add";
   case ac_atomic_sub:
      return "sub";
   case ac_atomic_smin:
      return "smin";
   case ac_atomic_umin:
      return "umin";
   case ac_atomic_smax:
      return "smax";
   case ac_atomic_umax:
      return "umax";
   case ac_atomic_and:
      return "and";
   case ac_atomic_or:
      return "or";
   case ac_atomic_xor:
      return "xor";
   case ac_atomic_inc_wrap:
      return "inc";
   case ac_atomic_dec_wrap:
      return "dec";
   case ac_atomic_fmin:
      return "fmin";
   case ac_atomic_fmax:
      return "fmax";
   }
   unreachable("bad atomic op");
}

/* Build a llvm.amdgcn.image.* intrinsic call (sample/gather/load/store/
 * atomic/getlod/getresinfo) from \p a. The operand list and the modifier
 * suffixes of the intrinsic name (.c/.b/.l/.d/.lz/.cl/.o) must be assembled
 * in exactly the order the backend expects; the asserts below encode the
 * legal argument combinations.
 */
LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_args *a)
{
   const char *overload[3] = {"", "", ""};
   unsigned num_overloads = 0;
   LLVMValueRef args[18];
   unsigned num_args = 0;
   enum ac_image_dim dim = a->dim;

   assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 || !a->level_zero);
   assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
           a->opcode != ac_image_store_mip) ||
          a->lod);
   assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
          (!a->compare && !a->offset));
   assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
           a->opcode == ac_image_get_lod) ||
          !a->bias);
   /* bias/lod/level_zero/derivs are mutually exclusive. */
   assert((a->bias ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) + (a->derivs[0] ? 1 : 0) <=
          1);
   assert((a->min_lod ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) <= 1);
   assert(!a->d16 || (ctx->chip_class >= GFX8 && a->opcode != ac_image_atomic &&
                      a->opcode != ac_image_atomic_cmpswap && a->opcode != ac_image_get_lod &&
                      a->opcode != ac_image_get_resinfo));
   assert(!a->a16 || ctx->chip_class >= GFX9);
   assert(a->g16 == a->a16 || ctx->chip_class >= GFX10);

   assert(!a->offset ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->offset)) == 32);
   assert(!a->bias ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->bias)) == 32);
   assert(!a->compare ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->compare)) == 32);
   assert(!a->derivs[0] ||
          ((!a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 16) &&
           (a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 32)));
   assert(!a->coords[0] ||
          ((!a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 16) &&
           (a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 32)));
   assert(!a->lod ||
          ((a->opcode != ac_image_get_resinfo || ac_get_elem_bits(ctx, LLVMTypeOf(a->lod))) &&
           (a->opcode == ac_image_get_resinfo ||
            ac_get_elem_bits(ctx, LLVMTypeOf(a->lod)) ==
               ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])))));
   assert(!a->min_lod ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->min_lod)) ==
             ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])));

   /* getlod ignores array layers, so demote array/cube dims. */
   if (a->opcode == ac_image_get_lod) {
      switch (dim) {
      case ac_image_1darray:
         dim = ac_image_1d;
         break;
      case ac_image_2darray:
      case ac_image_cube:
         dim = ac_image_2d;
         break;
      default:
         break;
      }
   }

   bool sample = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
                 a->opcode == ac_image_get_lod;
   bool atomic = a->opcode == ac_image_atomic || a->opcode == ac_image_atomic_cmpswap;
   bool load = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
               a->opcode == ac_image_load || a->opcode == ac_image_load_mip;
   /* Sampling coords are float, load/store coords are int; a16 halves them. */
   LLVMTypeRef coord_type = sample ? (a->a16 ? ctx->f16 : ctx->f32) : (a->a16 ? ctx->i16 : ctx->i32);
   uint8_t dmask = a->dmask;
   LLVMTypeRef data_type;
   char data_type_str[32];

   if (atomic) {
      data_type = LLVMTypeOf(a->data[0]);
   } else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
      /* Image stores might have been shrinked using the format. */
      data_type = LLVMTypeOf(a->data[0]);
      dmask = (1 << ac_get_llvm_num_components(a->data[0])) - 1;
   } else {
      data_type = a->d16 ? ctx->v4f16 : ctx->v4f32;
   }

   /* With TFE the intrinsic returns {texel, i32 fail code}. */
   if (a->tfe) {
      data_type = LLVMStructTypeInContext(
         ctx->context, (LLVMTypeRef[]){data_type, ctx->i32}, 2, false);
   }

   if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
      args[num_args++] = a->data[0];
      if (a->opcode == ac_image_atomic_cmpswap)
         args[num_args++] = a->data[1];
   }

   if (!atomic)
      args[num_args++] = LLVMConstInt(ctx->i32, dmask, false);

   if (a->offset)
      args[num_args++] = ac_to_integer(ctx, a->offset);
   if (a->bias) {
      args[num_args++] = ac_to_float(ctx, a->bias);
      overload[num_overloads++] = ".f32";
   }
   if (a->compare)
      args[num_args++] = ac_to_float(ctx, a->compare);
   if (a->derivs[0]) {
      unsigned count = ac_num_derivs(dim);
      for (unsigned i = 0; i < count; ++i)
         args[num_args++] = ac_to_float(ctx, a->derivs[i]);
      overload[num_overloads++] = a->g16 ? ".f16" : ".f32";
   }
   unsigned num_coords = a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
   for (unsigned i = 0; i < num_coords; ++i)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
   if (a->lod)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
   if (a->min_lod)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->min_lod, coord_type, "");

   overload[num_overloads++] = sample ? (a->a16 ? ".f16" : ".f32") : (a->a16 ? ".i16" : ".i32");

   args[num_args++] = a->resource;
   if (sample) {
      args[num_args++] = a->sampler;
      args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
   }

   args[num_args++] = a->tfe ? ctx->i32_1 : ctx->i32_0; /* texfailctrl */
   args[num_args++] = LLVMConstInt(
      ctx->i32, load ? get_load_cache_policy(ctx, a->cache_policy) : a->cache_policy, false);

   const char *name;
   const char *atomic_subop = "";
   switch (a->opcode) {
   case ac_image_sample:
      name = "sample";
      break;
   case ac_image_gather4:
      name = "gather4";
      break;
   case ac_image_load:
      name = "load";
      break;
   case ac_image_load_mip:
      name = "load.mip";
      break;
   case ac_image_store:
      name = "store";
      break;
   case ac_image_store_mip:
      name = "store.mip";
      break;
   case ac_image_atomic:
      name = "atomic.";
      atomic_subop = get_atomic_name(a->atomic);
      break;
   case ac_image_atomic_cmpswap:
      name = "atomic.";
      atomic_subop = "cmpswap";
      break;
   case ac_image_get_lod:
      name = "getlod";
      break;
   case ac_image_get_resinfo:
      name = "getresinfo";
      break;
   default:
      unreachable("invalid image opcode");
   }

   const char *dimname;
   switch (dim) {
   case ac_image_1d:
      dimname = "1d";
      break;
   case ac_image_2d:
      dimname = "2d";
      break;
   case ac_image_3d:
      dimname = "3d";
      break;
   case ac_image_cube:
      dimname = "cube";
      break;
   case ac_image_1darray:
      dimname = "1darray";
      break;
   case ac_image_2darray:
      dimname = "2darray";
      break;
   case ac_image_2dmsaa:
      dimname = "2dmsaa";
      break;
   case ac_image_2darraymsaa:
      dimname = "2darraymsaa";
      break;
   default:
      unreachable("invalid dim");
   }

   ac_build_type_name_for_intr(data_type, data_type_str, sizeof(data_type_str));

   bool lod_suffix = a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
   char intr_name[96];
   snprintf(intr_name, sizeof(intr_name),
            "llvm.amdgcn.image.%s%s" /* base name */
            "%s%s%s%s" /* sample/gather modifiers */
            ".%s.%s%s%s%s", /* dimension and type overloads */
            name, atomic_subop, a->compare ? ".c" : "",
            a->bias ? ".b" : lod_suffix ? ".l" : a->derivs[0] ? ".d" : a->level_zero ? ".lz" : "",
            a->min_lod ? ".cl" : "", a->offset ? ".o" : "", dimname,
            data_type_str, overload[0], overload[1], overload[2]);

   LLVMTypeRef retty;
   if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
      retty = ctx->voidt;
   else
      retty = data_type;

   LLVMValueRef result = ac_build_intrinsic(ctx, intr_name, retty, args, num_args, a->attributes);
   if (a->tfe) {
      /* Unpack the {texel, fail code} struct into one flat vector. */
      LLVMValueRef texel = LLVMBuildExtractValue(ctx->builder, result, 0, "");
      LLVMValueRef code = LLVMBuildExtractValue(ctx->builder, result, 1, "");
      result = ac_build_concat(ctx, texel, ac_to_float(ctx, code));
   }

   if (!sample && !atomic && retty != ctx->voidt)
      result = ac_to_integer(ctx, result);

   return result;
}

/* Return the sample count of an image as 2^(descriptor LAST_LEVEL field). */
LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx, LLVMValueRef rsrc)
{
   LLVMValueRef samples;

   /* Read the samples from the descriptor directly.
    * Hardware doesn't have any instruction for this.
    */
   /* The log2 sample count lives in dword 3, bits [19:16]; 1 << log2. */
   samples = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 3, 0), "");
   samples = LLVMBuildLShr(ctx->builder, samples, LLVMConstInt(ctx->i32, 16, 0), "");
   samples = LLVMBuildAnd(ctx->builder, samples, LLVMConstInt(ctx->i32, 0xf, 0), "");
   samples = LLVMBuildShl(ctx->builder, ctx->i32_1, samples, "");
   return samples;
}

/* Pack two f32 into one v2f16 with round-toward-zero (cvt.pkrtz). */
LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", ctx->v2f16, args, 2,
                             AC_FUNC_ATTR_READNONE);
}

/* Pack two f32 into snorm16x2, returned as one i32. */
LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef res = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16", ctx->v2i16, args, 2,
                                         AC_FUNC_ATTR_READNONE);
   return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
}

/* Pack two f32 into unorm16x2, returned as one i32. */
LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef res = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16", ctx->v2i16, args, 2,
                                         AC_FUNC_ATTR_READNONE);
   return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
}

/* f16 variant of pknorm_i16; emitted as inline asm because there is no
 * LLVM intrinsic for v_cvt_pknorm_i16_f16.
 */
LLVMValueRef ac_build_cvt_pknorm_i16_f16(struct ac_llvm_context *ctx,
                                         LLVMValueRef args[2])
{
   LLVMTypeRef param_types[] = {ctx->f16, ctx->f16};
   LLVMTypeRef calltype = LLVMFunctionType(ctx->i32, param_types, 2, false);
   LLVMValueRef code = LLVMConstInlineAsm(calltype,
                                          "v_cvt_pknorm_i16_f16 $0, $1, $2", "=v,v,v",
                                          false, false);
   return LLVMBuildCall(ctx->builder, code, args, 2, "");
}

/* f16 variant of pknorm_u16; emitted as inline asm (no LLVM intrinsic). */
LLVMValueRef ac_build_cvt_pknorm_u16_f16(struct ac_llvm_context *ctx,
                                         LLVMValueRef args[2])
{
   LLVMTypeRef param_types[] = {ctx->f16, ctx->f16};
   LLVMTypeRef calltype = LLVMFunctionType(ctx->i32, param_types, 2, false);
   LLVMValueRef code = LLVMConstInlineAsm(calltype,
                                          "v_cvt_pknorm_u16_f16 $0, $1, $2", "=v,v,v",
                                          false, false);
   return LLVMBuildCall(ctx->builder, code, args, 2, "");
}

/* The 8-bit and 10-bit clamping is for HW workarounds. */
LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
                                 bool hi)
{
   assert(bits == 8 || bits == 10 || bits == 16);

   LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0);
   LLVMValueRef min_rgb = LLVMConstInt(ctx->i32, bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0);
   /* 10-bit formats give alpha only 2 bits: range [-2, 1]. */
   LLVMValueRef max_alpha = bits != 10 ? max_rgb : ctx->i32_1;
   LLVMValueRef min_alpha = bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);

   /* Clamp. */
   if (bits != 16) {
      for (int i = 0; i < 2; i++) {
         bool alpha = hi && i == 1;
         args[i] = ac_build_imin(ctx, args[i], alpha ? max_alpha : max_rgb);
         args[i] = ac_build_imax(ctx, args[i], alpha ? min_alpha : min_rgb);
      }
   }

   LLVMValueRef res =
      ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16", ctx->v2i16, args, 2, AC_FUNC_ATTR_READNONE);
   return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
}

/* The 8-bit and 10-bit clamping is for HW workarounds. */
LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
                                 bool hi)
{
   assert(bits == 8 || bits == 10 || bits == 16);

   LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);
   /* 10-bit formats give alpha only 2 bits: range [0, 3]. */
   LLVMValueRef max_alpha = bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);

   /* Clamp. */
   if (bits != 16) {
      for (int i = 0; i < 2; i++) {
         bool alpha = hi && i == 1;
         args[i] = ac_build_umin(ctx, args[i], alpha ? max_alpha : max_rgb);
      }
   }

   LLVMValueRef res =
      ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16", ctx->v2i16, args, 2, AC_FUNC_ATTR_READNONE);
   return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
}

/* Whole-quad-mode vote on an i1 condition. */
LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
{
   return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1, &i1, 1, AC_FUNC_ATTR_READNONE);
}

/* Kill (discard) the invocation if \p i1 is false. */
void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)
{
   ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt, &i1, 1, 0);
}

/* Bitfield extract (v_bfe): \p width bits of \p input starting at \p offset,
 * sign- or zero-extended depending on \p is_signed.
 */
LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, LLVMValueRef offset,
                          LLVMValueRef width, bool is_signed)
{
   LLVMValueRef args[] = {
      input,
      offset,
      width,
   };

   return ac_build_intrinsic(ctx, is_signed ? "llvm.amdgcn.sbfe.i32" : "llvm.amdgcn.ubfe.i32",
                             ctx->i32, args, 3, AC_FUNC_ATTR_READNONE);
}

/* Integer multiply-add: s0 * s1 + s2. */
LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
                           LLVMValueRef s2)
{
   return LLVMBuildAdd(ctx->builder, LLVMBuildMul(ctx->builder, s0, s1, ""), s2, "");
}

/* Float multiply-add: s0 * s1 + s2. */
LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
                           LLVMValueRef s2)
{
   /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */
   if (ctx->chip_class >= GFX10) {
      return ac_build_intrinsic(ctx, "llvm.fma.f32", ctx->f32, (LLVMValueRef[]){s0, s1, s2}, 3,
                                AC_FUNC_ATTR_READNONE);
   }

   return LLVMBuildFAdd(ctx->builder, LLVMBuildFMul(ctx->builder, s0, s1, ""), s2, "");
}

/* Emit s_waitcnt (or a fence) for the AC_WAIT_* flags in \p wait_flags. */
void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags)
{
   if (!wait_flags)
      return;

   /* Counter maxima mean "don't wait"; lgkm width grew on GFX9+. */
   unsigned lgkmcnt = 63;
   unsigned vmcnt = ctx->chip_class >= GFX9 ?
      63 : 15;
   unsigned vscnt = 63;

   if (wait_flags & AC_WAIT_LGKM)
      lgkmcnt = 0;
   if (wait_flags & AC_WAIT_VLOAD)
      vmcnt = 0;

   if (wait_flags & AC_WAIT_VSTORE) {
      /* GFX10 splits stores into a separate vscnt counter. */
      if (ctx->chip_class >= GFX10)
         vscnt = 0;
      else
         vmcnt = 0;
   }

   /* There is no intrinsic for vscnt(0), so use a fence. */
   if ((wait_flags & AC_WAIT_LGKM && wait_flags & AC_WAIT_VLOAD && wait_flags & AC_WAIT_VSTORE) ||
       vscnt == 0) {
      LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, "");
      return;
   }

   /* Pack the s_waitcnt immediate: lgkm[13:8], exp[6:4], vm[3:0]+[15:14]. */
   unsigned simm16 = (lgkmcnt << 8) | (7 << 4) | /* expcnt */
                     (vmcnt & 0xf) | ((vmcnt >> 4) << 14);

   LLVMValueRef args[1] = {
      LLVMConstInt(ctx->i32, simm16, false),
   };
   ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", ctx->voidt, args, 1, 0);
}

/* Saturate \p src to [0, 1], using fmed3 where the HW/LLVM support it. */
LLVMValueRef ac_build_fsat(struct ac_llvm_context *ctx, LLVMValueRef src,
                           LLVMTypeRef type)
{
   unsigned bitsize = ac_get_elem_bits(ctx, type);
   LLVMValueRef zero = LLVMConstReal(type, 0.0);
   LLVMValueRef one = LLVMConstReal(type, 1.0);
   LLVMValueRef result;

   if (bitsize == 64 || (bitsize == 16 && ctx->chip_class <= GFX8) || type == ctx->v2f16) {
      /* Use fmin/fmax for 64-bit fsat or 16-bit on GFX6-GFX8 because LLVM
       * doesn't expose an intrinsic.
       */
      result = ac_build_fmin(ctx, ac_build_fmax(ctx, src, zero), one);
   } else {
      /* NOTE(review): this inner 'type' shadows the parameter of the same
       * name; the parameter is no longer needed past this point. */
      LLVMTypeRef type;
      char *intr;

      if (bitsize == 16) {
         intr = "llvm.amdgcn.fmed3.f16";
         type = ctx->f16;
      } else {
         assert(bitsize == 32);
         intr = "llvm.amdgcn.fmed3.f32";
         type = ctx->f32;
      }

      /* fmed3(0, 1, x) == clamp(x, 0, 1). */
      LLVMValueRef params[] = {
         zero,
         one,
         src,
      };

      result = ac_build_intrinsic(ctx, intr, type, params, 3,
                                  AC_FUNC_ATTR_READNONE);
   }

   if (ctx->chip_class < GFX9 && bitsize == 32) {
      /* Only pre-GFX9 chips do not flush denorms. */
      result = ac_build_canonicalize(ctx, result, bitsize);
   }

   return result;
}

/* Fractional part via llvm.amdgcn.fract (f16/f32/f64 by \p bitsize). */
LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   char *intr;

   if (bitsize == 16) {
      intr = "llvm.amdgcn.fract.f16";
      type = ctx->f16;
   } else if (bitsize == 32) {
      intr = "llvm.amdgcn.fract.f32";
      type = ctx->f32;
   } else {
      intr = "llvm.amdgcn.fract.f64";
      type = ctx->f64;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
}

/* Integer constant of \p type; splatted if \p type is a vector type. */
LLVMValueRef ac_const_uint_vec(struct ac_llvm_context *ctx, LLVMTypeRef type, uint64_t value)
{

   if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
      LLVMValueRef scalar = LLVMConstInt(LLVMGetElementType(type), value, 0);
      unsigned vec_size = LLVMGetVectorSize(type);
      LLVMValueRef *scalars = alloca(vec_size * sizeof(LLVMValueRef));

      for (unsigned i = 0; i < vec_size; i++)
         scalars[i] = scalar;
      return LLVMConstVector(scalars, vec_size);
   }
   return LLVMConstInt(type, value, 0);
}

/* Integer sign: clamp src0 to [-1, 1] (selects v_med3 on the backend). */
LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   LLVMTypeRef type = LLVMTypeOf(src0);
   LLVMValueRef val;

   /* v_med3 is selected only when max is first. (LLVM bug?) */
   val = ac_build_imax(ctx, src0, ac_const_uint_vec(ctx, type, -1));
   return ac_build_imin(ctx, val, ac_const_uint_vec(ctx, type, 1));
}

/* Turn -0.0 into +0.0 by adding 0 with signed zeros temporarily enabled. */
static LLVMValueRef ac_eliminate_negative_zero(struct ac_llvm_context *ctx, LLVMValueRef val)
{
   ac_enable_signed_zeros(ctx);
   /* (val + 0) converts negative zero to positive zero.
 */
   val = LLVMBuildFAdd(ctx->builder, val, LLVMConstNull(LLVMTypeOf(val)), "");
   ac_disable_signed_zeros(ctx);
   return val;
}

/* Build sign(src): -1.0, 0.0 or +1.0, with the same float type as src. */
LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src)
{
   LLVMTypeRef type = LLVMTypeOf(src);
   LLVMValueRef pos, neg, dw[2], val;
   unsigned bitsize = ac_get_elem_bits(ctx, type);

   /* The standard version leads to this:
    *   v_cmp_ngt_f32_e64 s[0:1], s4, 0                      ; D40B0000 00010004
    *   v_cndmask_b32_e64 v4, 1.0, s4, s[0:1]                ; D5010004 000008F2
    *   v_cmp_le_f32_e32 vcc, 0, v4                          ; 7C060880
    *   v_cndmask_b32_e32 v4, -1.0, v4, vcc                  ; 020808F3
    *
    * The isign version:
    *   v_add_f32_e64 v4, s4, 0                              ; D5030004 00010004
    *   v_med3_i32 v4, v4, -1, 1                             ; D5580004 02058304
    *   v_cvt_f32_i32_e32 v4, v4                             ; 7E080B04
    *
    * (src0 + 0) converts negative zero to positive zero.
    * After that, int(fsign(x)) == isign(floatBitsToInt(x)).
    *
    * For FP64, use the standard version, which doesn't suffer from the huge DP rate
    * reduction. (FP64 comparisons are as fast as int64 comparisons)
    */
   if (bitsize == 16 || bitsize == 32) {
      val = ac_to_integer(ctx, ac_eliminate_negative_zero(ctx, src));
      val = ac_build_isign(ctx, val);
      return LLVMBuildSIToFP(ctx->builder, val, type, "");
   }

   /* FP64: assemble +1.0 / -1.0 / 0.0 directly from the double bit pattern;
    * only the high dword differs (0x3FF00000 = +1.0, 0xBFF00000 = -1.0).
    */
   assert(bitsize == 64);
   pos = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src, ctx->f64_0, "");
   neg = LLVMBuildFCmp(ctx->builder, LLVMRealOLT, src, ctx->f64_0, "");
   dw[0] = ctx->i32_0;
   dw[1] = LLVMBuildSelect(
      ctx->builder, pos, LLVMConstInt(ctx->i32, 0x3FF00000, 0),
      LLVMBuildSelect(ctx->builder, neg, LLVMConstInt(ctx->i32, 0xBFF00000, 0), ctx->i32_0, ""),
      "");
   return LLVMBuildBitCast(ctx->builder, ac_build_gather_values(ctx, dw, 2), ctx->f64, "");
}

/* Population count of src0 (i8..i128); the result is always returned as i32
 * (wider intrinsic results are truncated, narrower ones zero-extended).
 */
LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   LLVMValueRef result;
   unsigned bitsize;

   bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));

   switch (bitsize) {
   case 128:
      result = ac_build_intrinsic(ctx, "llvm.ctpop.i128", ctx->i128, (LLVMValueRef[]){src0}, 1,
                                  AC_FUNC_ATTR_READNONE);
      result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
      break;
   case 64:
      result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64, (LLVMValueRef[]){src0}, 1,
                                  AC_FUNC_ATTR_READNONE);

      result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
      break;
   case 32:
      result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32, (LLVMValueRef[]){src0}, 1,
                                  AC_FUNC_ATTR_READNONE);
      break;
   case 16:
      result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16, (LLVMValueRef[]){src0}, 1,
                                  AC_FUNC_ATTR_READNONE);

      result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
      break;
   case 8:
      result = ac_build_intrinsic(ctx, "llvm.ctpop.i8", ctx->i8, (LLVMValueRef[]){src0}, 1,
                                  AC_FUNC_ATTR_READNONE);

      result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
      break;
   default:
      unreachable(!"invalid bitsize");
      break;
   }

   return result;
}

/* Reverse the bit order of src0 (i8..i64); the result is always returned as
 * i32 (the i64 result is truncated, i8/i16 results are zero-extended).
 */
LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   LLVMValueRef result;
   unsigned bitsize;

   bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));

   switch (bitsize) {
   case 64:
      result = ac_build_intrinsic(ctx, "llvm.bitreverse.i64", ctx->i64, (LLVMValueRef[]){src0}, 1,
                                  AC_FUNC_ATTR_READNONE);

      result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
      break;
   case 32:
      result = ac_build_intrinsic(ctx, "llvm.bitreverse.i32", ctx->i32, (LLVMValueRef[]){src0}, 1,
                                  AC_FUNC_ATTR_READNONE);
      break;
   case 16:
      result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", ctx->i16, (LLVMValueRef[]){src0}, 1,
                                  AC_FUNC_ATTR_READNONE);

      result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
      break;
   case 8:
      result = ac_build_intrinsic(ctx, "llvm.bitreverse.i8", ctx->i8, (LLVMValueRef[]){src0}, 1,
                                  AC_FUNC_ATTR_READNONE);

      result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
      break;
   default:
      unreachable(!"invalid bitsize");
      break;
   }

   return result;
}

/* Operand indices of the export intrinsic calls processed below. */
#define AC_EXP_TARGET 0
#define AC_EXP_ENABLED_CHANNELS 1
#define AC_EXP_OUT0 2

/* Classification of one export channel. */
enum ac_ir_type
{
   AC_IR_UNDEF,
   AC_IR_CONST,
   AC_IR_VALUE,
};

struct ac_vs_exp_chan {
   LLVMValueRef value;    /* the exported LLVM value for this channel */
   float const_float;     /* constant value; only valid if type == AC_IR_CONST */
   enum ac_ir_type type;
};

struct ac_vs_exp_inst {
   unsigned offset;       /* PARAM export offset, i.e. target - SQ_EXP_PARAM */
   LLVMValueRef inst;     /* the export intrinsic call instruction */
   struct ac_vs_exp_chan chan[4];
};

struct ac_vs_exports {
   unsigned num;
   struct ac_vs_exp_inst exp[VARYING_SLOT_MAX];
};

/* Return true if the PARAM export has been eliminated.
 */
static bool ac_eliminate_const_output(uint8_t *vs_output_param_offset, uint32_t num_outputs,
                                      struct ac_vs_exp_inst *exp)
{
   unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
   bool is_zero[4] = {0}, is_one[4] = {0};

   for (i = 0; i < 4; i++) {
      /* It's a constant expression. Undef outputs are eliminated too. */
      if (exp->chan[i].type == AC_IR_UNDEF) {
         /* Undef can stand in for either 0 or 1. */
         is_zero[i] = true;
         is_one[i] = true;
      } else if (exp->chan[i].type == AC_IR_CONST) {
         if (exp->chan[i].const_float == 0)
            is_zero[i] = true;
         else if (exp->chan[i].const_float == 1)
            is_one[i] = true;
         else
            return false; /* other constant */
      } else
         return false;
   }

   /* Only certain combinations of 0 and 1 can be eliminated. */
   if (is_zero[0] && is_zero[1] && is_zero[2])
      default_val = is_zero[3] ? 0 : 1;
   else if (is_one[0] && is_one[1] && is_one[2])
      default_val = is_zero[3] ? 2 : 3;
   else
      return false;

   /* The PARAM export can be represented as DEFAULT_VAL. Kill it. */
   LLVMInstructionEraseFromParent(exp->inst);

   /* Change OFFSET to DEFAULT_VAL. */
   for (i = 0; i < num_outputs; i++) {
      if (vs_output_param_offset[i] == exp->offset) {
         vs_output_param_offset[i] = AC_EXP_PARAM_DEFAULT_VAL_0000 + default_val;
         break;
      }
   }
   return true;
}

/* Return true if "exp" duplicates an already-processed export and has been
 * removed; undef channels in the earlier export may be filled in from "exp".
 */
static bool ac_eliminate_duplicated_output(struct ac_llvm_context *ctx,
                                           uint8_t *vs_output_param_offset, uint32_t num_outputs,
                                           struct ac_vs_exports *processed,
                                           struct ac_vs_exp_inst *exp)
{
   unsigned p, copy_back_channels = 0;

   /* See if the output is already in the list of processed outputs.
    * The LLVMValueRef comparison relies on SSA.
    */
   for (p = 0; p < processed->num; p++) {
      bool different = false;

      for (unsigned j = 0; j < 4; j++) {
         struct ac_vs_exp_chan *c1 = &processed->exp[p].chan[j];
         struct ac_vs_exp_chan *c2 = &exp->chan[j];

         /* Treat undef as a match. */
         if (c2->type == AC_IR_UNDEF)
            continue;

         /* If c1 is undef but c2 isn't, we can copy c2 to c1
          * and consider the instruction duplicated.
          */
         if (c1->type == AC_IR_UNDEF) {
            copy_back_channels |= 1 << j;
            continue;
         }

         /* Test whether the channels are not equal. */
         if (c1->type != c2->type ||
             (c1->type == AC_IR_CONST && c1->const_float != c2->const_float) ||
             (c1->type == AC_IR_VALUE && c1->value != c2->value)) {
            different = true;
            break;
         }
      }
      if (!different)
         break;

      copy_back_channels = 0;
   }
   if (p == processed->num)
      return false;

   /* If a match was found, but the matching export has undef where the new
    * one has a normal value, copy the normal value to the undef channel.
    */
   struct ac_vs_exp_inst *match = &processed->exp[p];

   /* Get current enabled channels mask. */
   LLVMValueRef arg = LLVMGetOperand(match->inst, AC_EXP_ENABLED_CHANNELS);
   unsigned enabled_channels = LLVMConstIntGetZExtValue(arg);

   while (copy_back_channels) {
      unsigned chan = u_bit_scan(&copy_back_channels);

      assert(match->chan[chan].type == AC_IR_UNDEF);
      LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan, exp->chan[chan].value);
      match->chan[chan] = exp->chan[chan];

      /* Update number of enabled channels because the original mask
       * is not always 0xf.
       */
      enabled_channels |= (1 << chan);
      LLVMSetOperand(match->inst, AC_EXP_ENABLED_CHANNELS,
                     LLVMConstInt(ctx->i32, enabled_channels, 0));
   }

   /* The PARAM export is duplicated. Kill it.
    */
   LLVMInstructionEraseFromParent(exp->inst);

   /* Change OFFSET to the matching export. */
   for (unsigned i = 0; i < num_outputs; i++) {
      if (vs_output_param_offset[i] == exp->offset) {
         vs_output_param_offset[i] = match->offset;
         break;
      }
   }
   return true;
}

/* Remove constant and duplicated PARAM exports from main_fn and renumber the
 * survivors; vs_output_param_offset and num_param_exports are updated to
 * match. Outputs whose bit is set in skip_output_mask are never eliminated.
 */
void ac_optimize_vs_outputs(struct ac_llvm_context *ctx, LLVMValueRef main_fn,
                            uint8_t *vs_output_param_offset, uint32_t num_outputs,
                            uint32_t skip_output_mask, uint8_t *num_param_exports)
{
   LLVMBasicBlockRef bb;
   bool removed_any = false;
   struct ac_vs_exports exports;

   exports.num = 0;

   /* Process all LLVM instructions. */
   bb = LLVMGetFirstBasicBlock(main_fn);
   while (bb) {
      LLVMValueRef inst = LLVMGetFirstInstruction(bb);

      while (inst) {
         LLVMValueRef cur = inst;
         /* Advance first: "cur" may be erased below. */
         inst = LLVMGetNextInstruction(inst);
         struct ac_vs_exp_inst exp;

         if (LLVMGetInstructionOpcode(cur) != LLVMCall)
            continue;

         LLVMValueRef callee = ac_llvm_get_called_value(cur);

         if (!ac_llvm_is_function(callee))
            continue;

         const char *name = LLVMGetValueName(callee);
         unsigned num_args = LLVMCountParams(callee);

         /* Check if this is an export instruction. */
         if ((num_args != 9 && num_args != 8) ||
             (strcmp(name, "llvm.SI.export") && strcmp(name, "llvm.amdgcn.exp.f32")))
            continue;

         LLVMValueRef arg = LLVMGetOperand(cur, AC_EXP_TARGET);
         unsigned target = LLVMConstIntGetZExtValue(arg);

         /* Only PARAM exports are candidates for elimination. */
         if (target < V_008DFC_SQ_EXP_PARAM)
            continue;

         target -= V_008DFC_SQ_EXP_PARAM;

         /* Parse the instruction.
          */
         memset(&exp, 0, sizeof(exp));
         exp.offset = target;
         exp.inst = cur;

         /* Classify each of the 4 exported channels. */
         for (unsigned i = 0; i < 4; i++) {
            LLVMValueRef v = LLVMGetOperand(cur, AC_EXP_OUT0 + i);

            exp.chan[i].value = v;

            if (LLVMIsUndef(v)) {
               exp.chan[i].type = AC_IR_UNDEF;
            } else if (LLVMIsAConstantFP(v)) {
               LLVMBool loses_info;
               exp.chan[i].type = AC_IR_CONST;
               exp.chan[i].const_float = LLVMConstRealGetDouble(v, &loses_info);
            } else {
               exp.chan[i].type = AC_IR_VALUE;
            }
         }

         /* Eliminate constant and duplicated PARAM exports. */
         if (!((1u << target) & skip_output_mask) &&
             (ac_eliminate_const_output(vs_output_param_offset, num_outputs, &exp) ||
              ac_eliminate_duplicated_output(ctx, vs_output_param_offset, num_outputs, &exports,
                                             &exp))) {
            removed_any = true;
         } else {
            exports.exp[exports.num++] = exp;
         }
      }
      bb = LLVMGetNextBasicBlock(bb);
   }

   /* Remove holes in export memory due to removed PARAM exports.
    * This is done by renumbering all PARAM exports.
    */
   if (removed_any) {
      uint8_t old_offset[VARYING_SLOT_MAX];
      unsigned out, i;

      /* Make a copy of the offsets. We need the old version while
       * we are modifying some of them. */
      memcpy(old_offset, vs_output_param_offset, sizeof(old_offset));

      for (i = 0; i < exports.num; i++) {
         unsigned offset = exports.exp[i].offset;

         /* Update vs_output_param_offset. Multiple outputs can
          * have the same offset.
          */
         for (out = 0; out < num_outputs; out++) {
            if (old_offset[out] == offset)
               vs_output_param_offset[out] = i;
         }

         /* Change the PARAM offset in the instruction.
          */
         LLVMSetOperand(exports.exp[i].inst, AC_EXP_TARGET,
                        LLVMConstInt(ctx->i32, V_008DFC_SQ_EXP_PARAM + i, 0));
      }
      *num_param_exports = exports.num;
   }
}

/* Set EXEC to all-ones at shader entry via llvm.amdgcn.init.exec. */
void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
{
   LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
   ac_build_intrinsic(ctx, "llvm.amdgcn.init.exec", ctx->voidt, &full_mask, 1,
                      AC_FUNC_ATTR_CONVERGENT);
}

/* Materialize ctx->lds as a pointer to an i32 array at LDS address 0.
 * 64 KB is addressed on GFX7+, 32 KB on older chips.
 */
void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
{
   unsigned lds_size = ctx->chip_class >= GFX7 ? 65536 : 32768;
   ctx->lds = LLVMBuildIntToPtr(
      ctx->builder, ctx->i32_0,
      LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS), "lds");
}

/* Load one dword from LDS at dword address dw_addr. */
LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx, LLVMValueRef dw_addr)
{
   return LLVMBuildLoad(ctx->builder, ac_build_gep0(ctx, ctx->lds, dw_addr), "");
}

/* Store one dword to LDS at dword address dw_addr (value is bitcast to int). */
void ac_lds_store(struct ac_llvm_context *ctx, LLVMValueRef dw_addr, LLVMValueRef value)
{
   value = ac_to_integer(ctx, value);
   ac_build_indexed_store(ctx, ctx->lds, dw_addr, value);
}

/* Find the least significant set bit of src0, returned as i32.
 * Returns -1 when src0 == 0 (GLSL findLSB semantics).
 * Note: dst_type is currently unused; the result is always i32.
 */
LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, LLVMTypeRef dst_type, LLVMValueRef src0)
{
   unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   const char *intrin_name;
   LLVMTypeRef type;
   LLVMValueRef zero;

   switch (src0_bitsize) {
   case 64:
      intrin_name = "llvm.cttz.i64";
      type = ctx->i64;
      zero = ctx->i64_0;
      break;
   case 32:
      intrin_name = "llvm.cttz.i32";
      type = ctx->i32;
      zero = ctx->i32_0;
      break;
   case 16:
      intrin_name = "llvm.cttz.i16";
      type = ctx->i16;
      zero = ctx->i16_0;
      break;
   case 8:
      intrin_name = "llvm.cttz.i8";
      type = ctx->i8;
      zero = ctx->i8_0;
      break;
   default:
      unreachable(!"invalid bitsize");
   }

   LLVMValueRef params[2] = {
      src0,

      /* The value of 1 means that ffs(x=0) = undef, so LLVM won't
       * add special code to check for x=0. The reason is that
       * the LLVM behavior for x=0 is different from what we
       * need here. However, LLVM also assumes that ffs(x) is
       * in [0, 31], but GLSL expects that ffs(0) = -1, so
       * a conditional assignment to handle 0 is still required.
       *
       * The hardware already implements the correct behavior.
       */
      ctx->i1true,
   };

   LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, AC_FUNC_ATTR_READNONE);

   /* Normalize the result width to i32. */
   if (src0_bitsize == 64) {
      lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");
   } else if (src0_bitsize < 32) {
      lsb = LLVMBuildSExt(ctx->builder, lsb, ctx->i32, "");
   }

   /* TODO: We need an intrinsic to skip this conditional. */
   /* Check for zero: */
   return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, src0, zero, ""),
                          LLVMConstInt(ctx->i32, -1, 0), lsb, "");
}

/* Pointer-to-elem_type in the constant (scalar) address space. */
LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
{
   return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST);
}

/* Pointer-to-elem_type in the 32-bit constant address space. */
LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
{
   return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST_32BIT);
}

/* Top of the control-flow stack, or NULL if it is empty. */
static struct ac_llvm_flow *get_current_flow(struct ac_llvm_context *ctx)
{
   if (ctx->flow->depth > 0)
      return &ctx->flow->stack[ctx->flow->depth - 1];
   return NULL;
}

/* Innermost enclosing loop entry on the control-flow stack, or NULL. */
static struct ac_llvm_flow *get_innermost_loop(struct ac_llvm_context *ctx)
{
   for (unsigned i = ctx->flow->depth; i > 0; --i) {
      if (ctx->flow->stack[i - 1].loop_entry_block)
         return &ctx->flow->stack[i - 1];
   }
   return NULL;
}

/* Push a fresh entry onto the control-flow stack, growing it as needed. */
static struct ac_llvm_flow *push_flow(struct ac_llvm_context *ctx)
{
   struct ac_llvm_flow *flow;

   if (ctx->flow->depth >= ctx->flow->depth_max) {
      unsigned new_max = MAX2(ctx->flow->depth << 1, AC_LLVM_INITIAL_CF_DEPTH);

      /* NOTE(review): realloc result is not checked; allocation failure
       * would crash here — consistent with the surrounding code's policy.
       */
      ctx->flow->stack = realloc(ctx->flow->stack, new_max * sizeof(*ctx->flow->stack));
      ctx->flow->depth_max = new_max;
   }

   flow = &ctx->flow->stack[ctx->flow->depth];
   ctx->flow->depth++;

   flow->next_block = NULL;
   flow->loop_entry_block = NULL;
   return flow;
}

/* Name a basic block "<base><label_id>" for readable IR dumps. */
static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base, int label_id)
{
   char buf[32];
   snprintf(buf, sizeof(buf), "%s%d", base, label_id);
   LLVMSetValueName(LLVMBasicBlockAsValue(bb), buf);
}

/* Append a basic block at the level of the parent flow.
 */
static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx, const char *name)
{
   assert(ctx->flow->depth >= 1);

   if (ctx->flow->depth >= 2) {
      struct ac_llvm_flow *flow = &ctx->flow->stack[ctx->flow->depth - 2];

      /* Insert before the parent's merge block to keep blocks in order. */
      return LLVMInsertBasicBlockInContext(ctx->context, flow->next_block, name);
   }

   LLVMValueRef main_fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
   return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);
}

/* Emit a branch to the given default target for the current block if
 * applicable -- that is, if the current block does not already contain a
 * branch from a break or continue.
 */
static void emit_default_branch(LLVMBuilderRef builder, LLVMBasicBlockRef target)
{
   if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
      LLVMBuildBr(builder, target);
}

/* Open a loop: push flow, create LOOP/ENDLOOP blocks and enter the loop. */
void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)
{
   struct ac_llvm_flow *flow = push_flow(ctx);
   flow->loop_entry_block = append_basic_block(ctx, "LOOP");
   flow->next_block = append_basic_block(ctx, "ENDLOOP");
   set_basicblock_name(flow->loop_entry_block, "loop", label_id);
   LLVMBuildBr(ctx->builder, flow->loop_entry_block);
   LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);
}

/* Branch to the exit of the innermost loop (assumes one is open). */
void ac_build_break(struct ac_llvm_context *ctx)
{
   struct ac_llvm_flow *flow = get_innermost_loop(ctx);
   LLVMBuildBr(ctx->builder, flow->next_block);
}

/* Branch back to the entry of the innermost loop (assumes one is open). */
void ac_build_continue(struct ac_llvm_context *ctx)
{
   struct ac_llvm_flow *flow = get_innermost_loop(ctx);
   LLVMBuildBr(ctx->builder, flow->loop_entry_block);
}

/* Switch the current if-construct to its else branch. */
void ac_build_else(struct ac_llvm_context *ctx, int label_id)
{
   struct ac_llvm_flow *current_branch = get_current_flow(ctx);
   LLVMBasicBlockRef endif_block;

   assert(!current_branch->loop_entry_block);

   endif_block = append_basic_block(ctx, "ENDIF");
   emit_default_branch(ctx->builder, endif_block);

   /* The old merge block becomes the else block. */
   LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
   set_basicblock_name(current_branch->next_block, "else", label_id);

   current_branch->next_block = endif_block;
}

/* Invoked after a branch is exited. */
static void ac_branch_exited(struct ac_llvm_context *ctx)
{
   if (ctx->flow->depth == 0 && ctx->conditional_demote_seen) {
      /* The previous conditional branch contained demote. Kill threads
       * after all conditional blocks because amdgcn.wqm.vote doesn't
       * return usable values inside the blocks.
3191 * 3192 * This is an optional optimization that only kills whole inactive quads. 3193 */ 3194 LLVMValueRef cond = LLVMBuildLoad(ctx->builder, ctx->postponed_kill, ""); 3195 ac_build_kill_if_false(ctx, ac_build_wqm_vote(ctx, cond)); 3196 ctx->conditional_demote_seen = false; 3197 } 3198} 3199 3200void ac_build_endif(struct ac_llvm_context *ctx, int label_id) 3201{ 3202 struct ac_llvm_flow *current_branch = get_current_flow(ctx); 3203 3204 assert(!current_branch->loop_entry_block); 3205 3206 emit_default_branch(ctx->builder, current_branch->next_block); 3207 LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block); 3208 set_basicblock_name(current_branch->next_block, "endif", label_id); 3209 3210 ctx->flow->depth--; 3211 ac_branch_exited(ctx); 3212} 3213 3214void ac_build_endloop(struct ac_llvm_context *ctx, int label_id) 3215{ 3216 struct ac_llvm_flow *current_loop = get_current_flow(ctx); 3217 3218 assert(current_loop->loop_entry_block); 3219 3220 emit_default_branch(ctx->builder, current_loop->loop_entry_block); 3221 3222 LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block); 3223 set_basicblock_name(current_loop->next_block, "endloop", label_id); 3224 ctx->flow->depth--; 3225 ac_branch_exited(ctx); 3226} 3227 3228void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id) 3229{ 3230 struct ac_llvm_flow *flow = push_flow(ctx); 3231 LLVMBasicBlockRef if_block; 3232 3233 if_block = append_basic_block(ctx, "IF"); 3234 flow->next_block = append_basic_block(ctx, "ELSE"); 3235 set_basicblock_name(if_block, "if", label_id); 3236 LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block); 3237 LLVMPositionBuilderAtEnd(ctx->builder, if_block); 3238} 3239 3240LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name) 3241{ 3242 LLVMBuilderRef builder = ac->builder; 3243 LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder); 3244 LLVMValueRef function = 
LLVMGetBasicBlockParent(current_block); 3245 LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function); 3246 LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block); 3247 LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context); 3248 LLVMValueRef res; 3249 3250 if (first_instr) { 3251 LLVMPositionBuilderBefore(first_builder, first_instr); 3252 } else { 3253 LLVMPositionBuilderAtEnd(first_builder, first_block); 3254 } 3255 3256 res = LLVMBuildAlloca(first_builder, type, name); 3257 LLVMDisposeBuilder(first_builder); 3258 return res; 3259} 3260 3261LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name) 3262{ 3263 LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name); 3264 LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr); 3265 return ptr; 3266} 3267 3268LLVMValueRef ac_build_alloca_init(struct ac_llvm_context *ac, LLVMValueRef val, const char *name) 3269{ 3270 LLVMValueRef ptr = ac_build_alloca_undef(ac, LLVMTypeOf(val), name); 3271 LLVMBuildStore(ac->builder, val, ptr); 3272 return ptr; 3273} 3274 3275LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMTypeRef type) 3276{ 3277 int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); 3278 return LLVMBuildBitCast(ctx->builder, ptr, LLVMPointerType(type, addr_space), ""); 3279} 3280 3281LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned count) 3282{ 3283 unsigned num_components = ac_get_llvm_num_components(value); 3284 if (count == num_components) 3285 return value; 3286 3287 LLVMValueRef *const masks = alloca(MAX2(count, 2) * sizeof(LLVMValueRef)); 3288 masks[0] = ctx->i32_0; 3289 masks[1] = ctx->i32_1; 3290 for (unsigned i = 2; i < count; i++) 3291 masks[i] = LLVMConstInt(ctx->i32, i, false); 3292 3293 if (count == 1) 3294 return LLVMBuildExtractElement(ctx->builder, value, masks[0], ""); 3295 3296 LLVMValueRef swizzle = LLVMConstVector(masks, count); 3297 return 
LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, ""); 3298} 3299 3300/* If param is i64 and bitwidth <= 32, the return value will be i32. */ 3301LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, unsigned rshift, 3302 unsigned bitwidth) 3303{ 3304 LLVMValueRef value = param; 3305 if (rshift) 3306 value = LLVMBuildLShr(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), rshift, false), ""); 3307 3308 if (rshift + bitwidth < 32) { 3309 uint64_t mask = (1ull << bitwidth) - 1; 3310 value = LLVMBuildAnd(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), mask, false), ""); 3311 } 3312 3313 if (bitwidth <= 32 && LLVMTypeOf(param) == ctx->i64) 3314 value = LLVMBuildTrunc(ctx->builder, value, ctx->i32, ""); 3315 return value; 3316} 3317 3318/* Adjust the sample index according to FMASK. 3319 * 3320 * For uncompressed MSAA surfaces, FMASK should return 0x76543210, 3321 * which is the identity mapping. Each nibble says which physical sample 3322 * should be fetched to get that sample. 3323 * 3324 * For example, 0x11111100 means there are only 2 samples stored and 3325 * the second sample covers 3/4 of the pixel. When reading samples 0 3326 * and 1, return physical sample 0 (determined by the first two 0s 3327 * in FMASK), otherwise return physical sample 1. 3328 * 3329 * The sample index should be adjusted as follows: 3330 * addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF; 3331 */ 3332void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask, LLVMValueRef *addr, 3333 bool is_array_tex) 3334{ 3335 struct ac_image_args fmask_load = {0}; 3336 fmask_load.opcode = ac_image_load; 3337 fmask_load.resource = fmask; 3338 fmask_load.dmask = 0xf; 3339 fmask_load.dim = is_array_tex ? 
ac_image_2darray : ac_image_2d; 3340 fmask_load.attributes = AC_FUNC_ATTR_READNONE; 3341 3342 fmask_load.coords[0] = addr[0]; 3343 fmask_load.coords[1] = addr[1]; 3344 if (is_array_tex) 3345 fmask_load.coords[2] = addr[2]; 3346 fmask_load.a16 = ac_get_elem_bits(ac, LLVMTypeOf(addr[0])) == 16; 3347 3348 LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load); 3349 fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value, ac->i32_0, ""); 3350 3351 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK 3352 * resource descriptor is 0 (invalid). 3353 */ 3354 LLVMValueRef tmp; 3355 tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, ""); 3356 tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, ""); 3357 tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, ""); 3358 fmask_value = 3359 LLVMBuildSelect(ac->builder, tmp, fmask_value, LLVMConstInt(ac->i32, 0x76543210, false), ""); 3360 3361 /* Apply the formula. */ 3362 unsigned sample_chan = is_array_tex ? 3 : 2; 3363 LLVMValueRef final_sample; 3364 final_sample = LLVMBuildMul(ac->builder, addr[sample_chan], 3365 LLVMConstInt(LLVMTypeOf(addr[0]), 4, 0), ""); 3366 final_sample = LLVMBuildLShr(ac->builder, fmask_value, 3367 LLVMBuildZExt(ac->builder, final_sample, ac->i32, ""), ""); 3368 /* Mask the sample index by 0x7, because 0x8 means an unknown value 3369 * with EQAA, so those will map to 0. 
*/ 3370 addr[sample_chan] = LLVMBuildAnd(ac->builder, final_sample, LLVMConstInt(ac->i32, 0x7, 0), ""); 3371 if (fmask_load.a16) 3372 addr[sample_chan] = LLVMBuildTrunc(ac->builder, final_sample, ac->i16, ""); 3373} 3374 3375static LLVMValueRef _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, 3376 LLVMValueRef lane, bool with_opt_barrier) 3377{ 3378 LLVMTypeRef type = LLVMTypeOf(src); 3379 LLVMValueRef result; 3380 3381 if (with_opt_barrier) 3382 ac_build_optimization_barrier(ctx, &src, false); 3383 3384 src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); 3385 if (lane) 3386 lane = LLVMBuildZExt(ctx->builder, lane, ctx->i32, ""); 3387 3388 result = 3389 ac_build_intrinsic(ctx, lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane", 3390 ctx->i32, (LLVMValueRef[]){src, lane}, lane == NULL ? 1 : 2, 3391 AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); 3392 3393 return LLVMBuildTrunc(ctx->builder, result, type, ""); 3394} 3395 3396static LLVMValueRef ac_build_readlane_common(struct ac_llvm_context *ctx, LLVMValueRef src, 3397 LLVMValueRef lane, bool with_opt_barrier) 3398{ 3399 LLVMTypeRef src_type = LLVMTypeOf(src); 3400 src = ac_to_integer(ctx, src); 3401 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); 3402 LLVMValueRef ret; 3403 3404 if (bits > 32) { 3405 assert(bits % 32 == 0); 3406 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); 3407 LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, ""); 3408 ret = LLVMGetUndef(vec_type); 3409 for (unsigned i = 0; i < bits / 32; i++) { 3410 LLVMValueRef ret_comp; 3411 3412 src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), ""); 3413 3414 ret_comp = _ac_build_readlane(ctx, src, lane, with_opt_barrier); 3415 3416 ret = 3417 LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), ""); 3418 } 3419 } else { 3420 ret = _ac_build_readlane(ctx, src, lane, with_opt_barrier); 3421 } 3422 3423 if 
(LLVMGetTypeKind(src_type) == LLVMPointerTypeKind) 3424 return LLVMBuildIntToPtr(ctx->builder, ret, src_type, ""); 3425 return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); 3426} 3427 3428/** 3429 * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic. 3430 * 3431 * The optimization barrier is not needed if the value is the same in all lanes 3432 * or if this is called in the outermost block. 3433 * 3434 * @param ctx 3435 * @param src 3436 * @param lane - id of the lane or NULL for the first active lane 3437 * @return value of the lane 3438 */ 3439LLVMValueRef ac_build_readlane_no_opt_barrier(struct ac_llvm_context *ctx, LLVMValueRef src, 3440 LLVMValueRef lane) 3441{ 3442 return ac_build_readlane_common(ctx, src, lane, false); 3443} 3444 3445LLVMValueRef ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane) 3446{ 3447 return ac_build_readlane_common(ctx, src, lane, true); 3448} 3449 3450LLVMValueRef ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value, 3451 LLVMValueRef lane) 3452{ 3453 return ac_build_intrinsic(ctx, "llvm.amdgcn.writelane", ctx->i32, 3454 (LLVMValueRef[]){value, lane, src}, 3, 3455 AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); 3456} 3457 3458LLVMValueRef ac_build_mbcnt_add(struct ac_llvm_context *ctx, LLVMValueRef mask, LLVMValueRef add_src) 3459{ 3460 if (ctx->wave_size == 32) { 3461 LLVMValueRef val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32, 3462 (LLVMValueRef[]){mask, ctx->i32_0}, 2, AC_FUNC_ATTR_READNONE); 3463 ac_set_range_metadata(ctx, val, 0, ctx->wave_size); 3464 return val; 3465 } 3466 LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask, ctx->v2i32, ""); 3467 LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec, ctx->i32_0, ""); 3468 LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec, ctx->i32_1, ""); 3469 LLVMValueRef val = 3470 ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32, 
3471 (LLVMValueRef[]){mask_lo, add_src}, 2, AC_FUNC_ATTR_READNONE); 3472 val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32, (LLVMValueRef[]){mask_hi, val}, 3473 2, AC_FUNC_ATTR_READNONE); 3474 ac_set_range_metadata(ctx, val, 0, ctx->wave_size); 3475 return val; 3476} 3477 3478LLVMValueRef ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask) 3479{ 3480 return ac_build_mbcnt_add(ctx, mask, ctx->i32_0); 3481} 3482 3483enum dpp_ctrl 3484{ 3485 _dpp_quad_perm = 0x000, 3486 _dpp_row_sl = 0x100, 3487 _dpp_row_sr = 0x110, 3488 _dpp_row_rr = 0x120, 3489 dpp_wf_sl1 = 0x130, 3490 dpp_wf_rl1 = 0x134, 3491 dpp_wf_sr1 = 0x138, 3492 dpp_wf_rr1 = 0x13C, 3493 dpp_row_mirror = 0x140, 3494 dpp_row_half_mirror = 0x141, 3495 dpp_row_bcast15 = 0x142, 3496 dpp_row_bcast31 = 0x143 3497}; 3498 3499static inline enum dpp_ctrl dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2, 3500 unsigned lane3) 3501{ 3502 assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4); 3503 return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6); 3504} 3505 3506static inline enum dpp_ctrl dpp_row_sl(unsigned amount) 3507{ 3508 assert(amount > 0 && amount < 16); 3509 return _dpp_row_sl | amount; 3510} 3511 3512static inline enum dpp_ctrl dpp_row_sr(unsigned amount) 3513{ 3514 assert(amount > 0 && amount < 16); 3515 return _dpp_row_sr | amount; 3516} 3517 3518static LLVMValueRef _ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src, 3519 enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask, 3520 bool bound_ctrl) 3521{ 3522 LLVMTypeRef type = LLVMTypeOf(src); 3523 LLVMValueRef res; 3524 3525 old = LLVMBuildZExt(ctx->builder, old, ctx->i32, ""); 3526 src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); 3527 3528 res = ac_build_intrinsic( 3529 ctx, "llvm.amdgcn.update.dpp.i32", ctx->i32, 3530 (LLVMValueRef[]){old, src, LLVMConstInt(ctx->i32, dpp_ctrl, 0), 3531 LLVMConstInt(ctx->i32, row_mask, 0), LLVMConstInt(ctx->i32, bank_mask, 
0),
                                          LLVMConstInt(ctx->i1, bound_ctrl, 0)},
                         6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);

   return LLVMBuildTrunc(ctx->builder, res, type, "");
}

/* DPP move over a value of arbitrary integer/float type: values wider than
 * 32 bits are split into i32 components (DPP operates on 32-bit lanes),
 * each component is run through _ac_build_dpp, and the result is bitcast
 * back to the original type. `old` supplies the value kept where the DPP
 * read is invalid (bound_ctrl/masks). */
static LLVMValueRef ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
                                 enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
                                 bool bound_ctrl)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   src = ac_to_integer(ctx, src);
   old = ac_to_integer(ctx, old);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
   LLVMValueRef ret;
   if (bits > 32) {
      assert(bits % 32 == 0);
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
      LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
      LLVMValueRef old_vector = LLVMBuildBitCast(ctx->builder, old, vec_type, "");
      ret = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < bits / 32; i++) {
         src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
         old = LLVMBuildExtractElement(ctx->builder, old_vector, LLVMConstInt(ctx->i32, i, 0), "");
         LLVMValueRef ret_comp =
            _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
         ret =
            LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
      }
   } else {
      ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
   }
   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}

/* Single 32-bit permlane16/permlanex16 (GFX10+): src is zero-extended to
 * i32, swizzled within (permlane16) or across (permlanex16) rows of 16
 * lanes according to the 64-bit lane-select `sel`, then truncated back. */
static LLVMValueRef _ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src,
                                         uint64_t sel, bool exchange_rows, bool bound_ctrl)
{
   LLVMTypeRef type = LLVMTypeOf(src);
   LLVMValueRef result;

   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   LLVMValueRef args[6] = {
      src,
      src,
      LLVMConstInt(ctx->i32, sel, false),
      LLVMConstInt(ctx->i32, sel >> 32, false),
      ctx->i1true, /* fi */
      bound_ctrl ? ctx->i1true : ctx->i1false,
   };

   result =
      ac_build_intrinsic(ctx, exchange_rows ? "llvm.amdgcn.permlanex16" : "llvm.amdgcn.permlane16",
                         ctx->i32, args, 6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);

   return LLVMBuildTrunc(ctx->builder, result, type, "");
}

/* permlane16 over a value of arbitrary width: wide values are split into
 * i32 components and each is swizzled independently (same pattern as
 * ac_build_dpp above). */
static LLVMValueRef ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
                                        bool exchange_rows, bool bound_ctrl)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   src = ac_to_integer(ctx, src);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
   LLVMValueRef ret;
   if (bits > 32) {
      assert(bits % 32 == 0);
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
      LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
      ret = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < bits / 32; i++) {
         src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
         LLVMValueRef ret_comp = _ac_build_permlane16(ctx, src, sel, exchange_rows, bound_ctrl);
         ret =
            LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
      }
   } else {
      ret = _ac_build_permlane16(ctx, src, sel, exchange_rows, bound_ctrl);
   }
   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}

/* Pack the three 5-bit masks of a ds_swizzle "bit mode" pattern:
 * result lane = ((tid & and_mask) | or_mask) ^ xor_mask. */
static inline unsigned ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
{
   assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);
   return and_mask | (or_mask << 5) | (xor_mask << 10);
}

/* Single 32-bit ds_swizzle with the given 15-bit pattern; src is
 * zero-extended to i32 and the result truncated back to its type. */
static LLVMValueRef _ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
                                         unsigned mask)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   LLVMValueRef ret;

   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   ret = ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle", ctx->i32,
                            (LLVMValueRef[]){src, LLVMConstInt(ctx->i32, mask, 0)}, 2,
                            AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);

   return LLVMBuildTrunc(ctx->builder, ret, src_type, "");
}

/* ds_swizzle over a value of arbitrary width: wide values are split into
 * i32 components, each swizzled independently. */
LLVMValueRef ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   src = ac_to_integer(ctx, src);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
   LLVMValueRef ret;
   if (bits > 32) {
      assert(bits % 32 == 0);
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
      LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
      ret = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < bits / 32; i++) {
         src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
         LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src, mask);
         ret =
            LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
      }
   } else {
      ret = _ac_build_ds_swizzle(ctx, src, mask);
   }
   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}

/* Wrap src in llvm.amdgcn.wwm (whole wave mode). Sub-32-bit values are
 * zero-extended around the intrinsic since it is emitted on the i32 form. */
static LLVMValueRef ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   unsigned bitsize = ac_get_elem_bits(ctx, src_type);
   char name[32], type[8];
   LLVMValueRef ret;

   src = ac_to_integer(ctx, src);

   if (bitsize < 32)
      src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type);
   ret = ac_build_intrinsic(ctx, name, LLVMTypeOf(src), (LLVMValueRef[]){src}, 1,
                            AC_FUNC_ATTR_READNONE);

   if (bitsize < 32)
      ret = LLVMBuildTrunc(ctx->builder, ret, ac_to_integer_type(ctx, src_type), "");

   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}

/* llvm.amdgcn.set.inactive: yields src in active lanes and `inactive` in
 * inactive lanes (used to seed reductions with the identity element). */
static LLVMValueRef ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
                                          LLVMValueRef inactive)
{
   char name[33], type[8];
   LLVMTypeRef src_type = LLVMTypeOf(src);
   unsigned bitsize = ac_get_elem_bits(ctx, src_type);
   src = ac_to_integer(ctx, src);
   inactive = ac_to_integer(ctx, inactive);

   if (bitsize < 32) {
      src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
      inactive = LLVMBuildZExt(ctx->builder, inactive, ctx->i32, "");
   }

   ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type);
   LLVMValueRef ret =
      ac_build_intrinsic(ctx, name, LLVMTypeOf(src), (LLVMValueRef[]){src, inactive}, 2,
                         AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
   if (bitsize < 32)
      ret = LLVMBuildTrunc(ctx->builder, ret, src_type, "");

   return ret;
}

/* Identity element of the reduction op for the given operand size in
 * bytes (type_size == 0 selects i1). E.g. 0 for add/or/xor, 1 for mul,
 * +/-INFINITY for fmin/fmax, all-ones for iand, type extrema for
 * imin/imax/umin. */
static LLVMValueRef get_reduction_identity(struct ac_llvm_context *ctx, nir_op op,
                                           unsigned type_size)
{

   if (type_size == 0) {
      switch (op) {
      case nir_op_ior:
      case nir_op_ixor:
         return LLVMConstInt(ctx->i1, 0, 0);
      case nir_op_iand:
         return LLVMConstInt(ctx->i1, 1, 0);
      default:
         unreachable("bad reduction intrinsic");
      }
   } else if (type_size == 1) {
      switch (op) {
      case nir_op_iadd:
         return ctx->i8_0;
      case nir_op_imul:
         return ctx->i8_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i8, INT8_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i8, UINT8_MAX, 0);
      case nir_op_imax:
         return LLVMConstInt(ctx->i8, INT8_MIN, 0);
      case nir_op_umax:
         return ctx->i8_0;
      case nir_op_iand:
         return LLVMConstInt(ctx->i8, -1, 0);
      case nir_op_ior:
         return ctx->i8_0;
      case nir_op_ixor:
         return ctx->i8_0;
      default:
         unreachable("bad reduction intrinsic");
      }
   } else if (type_size == 2) {
      switch (op) {
      case nir_op_iadd:
         return ctx->i16_0;
      case nir_op_fadd:
         return ctx->f16_0;
      case nir_op_imul:
         return ctx->i16_1;
      case nir_op_fmul:
         return ctx->f16_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i16, INT16_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i16, UINT16_MAX, 0);
      case nir_op_fmin:
         return LLVMConstReal(ctx->f16, INFINITY);
      case nir_op_imax:
         return LLVMConstInt(ctx->i16, INT16_MIN, 0);
      case nir_op_umax:
         return ctx->i16_0;
      case nir_op_fmax:
         return LLVMConstReal(ctx->f16, -INFINITY);
      case nir_op_iand:
         return LLVMConstInt(ctx->i16, -1, 0);
      case nir_op_ior:
         return ctx->i16_0;
      case nir_op_ixor:
         return ctx->i16_0;
      default:
         unreachable("bad reduction intrinsic");
      }
   } else if (type_size == 4) {
      switch (op) {
      case nir_op_iadd:
         return ctx->i32_0;
      case nir_op_fadd:
         return ctx->f32_0;
      case nir_op_imul:
         return ctx->i32_1;
      case nir_op_fmul:
         return ctx->f32_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i32, INT32_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i32, UINT32_MAX, 0);
      case nir_op_fmin:
         return LLVMConstReal(ctx->f32, INFINITY);
      case nir_op_imax:
         return LLVMConstInt(ctx->i32, INT32_MIN, 0);
      case nir_op_umax:
         return ctx->i32_0;
      case nir_op_fmax:
         return LLVMConstReal(ctx->f32, -INFINITY);
      case nir_op_iand:
         return LLVMConstInt(ctx->i32, -1, 0);
      case nir_op_ior:
         return ctx->i32_0;
      case nir_op_ixor:
         return ctx->i32_0;
      default:
         unreachable("bad reduction intrinsic");
      }
   } else { /* type_size == 64bit */
      switch (op) {
      case nir_op_iadd:
         return ctx->i64_0;
      case nir_op_fadd:
         return ctx->f64_0;
      case nir_op_imul:
         return ctx->i64_1;
      case nir_op_fmul:
         return ctx->f64_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i64, INT64_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i64, UINT64_MAX, 0);
      case nir_op_fmin:
         return LLVMConstReal(ctx->f64, INFINITY);
      case nir_op_imax:
         return LLVMConstInt(ctx->i64, INT64_MIN, 0);
      case nir_op_umax:
         return ctx->i64_0;
      case nir_op_fmax:
         return LLVMConstReal(ctx->f64, -INFINITY);
      case nir_op_iand:
         return LLVMConstInt(ctx->i64, -1, 0);
      case nir_op_ior:
         return ctx->i64_0;
      case nir_op_ixor:
         return ctx->i64_0;
      default:
         unreachable("bad reduction intrinsic");
      }
   }
}

/* Emit lhs <op> rhs for a scan/reduction combiner. min/max are built from
 * icmp+select for integers and the llvm.minnum/maxnum intrinsics for
 * floats (selected by operand size). */
static LLVMValueRef ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs,
                                    nir_op op)
{
   bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8;
   bool _32bit = ac_get_type_size(LLVMTypeOf(lhs)) == 4;
   switch (op) {
   case nir_op_iadd:
      return LLVMBuildAdd(ctx->builder, lhs, rhs, "");
   case nir_op_fadd:
      return LLVMBuildFAdd(ctx->builder, lhs, rhs, "");
   case nir_op_imul:
      return LLVMBuildMul(ctx->builder, lhs, rhs, "");
   case nir_op_fmul:
      return LLVMBuildFMul(ctx->builder, lhs, rhs, "");
   case nir_op_imin:
      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""),
                             lhs, rhs, "");
   case nir_op_umin:
      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""),
                             lhs, rhs, "");
   case nir_op_fmin:
      return ac_build_intrinsic(
         ctx, _64bit ? "llvm.minnum.f64" : _32bit ? "llvm.minnum.f32" : "llvm.minnum.f16",
         _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2,
         AC_FUNC_ATTR_READNONE);
   case nir_op_imax:
      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""),
                             lhs, rhs, "");
   case nir_op_umax:
      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""),
                             lhs, rhs, "");
   case nir_op_fmax:
      return ac_build_intrinsic(
         ctx, _64bit ? "llvm.maxnum.f64" : _32bit ? "llvm.maxnum.f32" : "llvm.maxnum.f16",
         _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2,
         AC_FUNC_ATTR_READNONE);
   case nir_op_iand:
      return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
   case nir_op_ior:
      return LLVMBuildOr(ctx->builder, lhs, rhs, "");
   case nir_op_ixor:
      return LLVMBuildXor(ctx->builder, lhs, rhs, "");
   default:
      unreachable("bad reduction intrinsic");
   }
}

/**
 * \param src The value to shift.
 * \param identity The value to use for the first lane.
 * \param maxprefix specifies that the result only needs to be correct for a
 *                  prefix of this many threads
 * \return src, shifted 1 lane up, and identity shifted into lane 0.
 */
static LLVMValueRef ac_wavefront_shift_right_1(struct ac_llvm_context *ctx, LLVMValueRef src,
                                               LLVMValueRef identity, unsigned maxprefix)
{
   if (ctx->chip_class >= GFX10) {
      /* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */
      LLVMValueRef active, tmp1, tmp2;
      LLVMValueRef tid = ac_get_thread_id(ctx);

      /* Shift right by 1 within each row of 16 lanes. */
      tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);

      /* Exchange rows; patched below for the lanes that must instead read
       * from the previous row (lane 16) or the previous wave half (lane 32). */
      tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false);

      if (maxprefix > 32) {
         active =
            LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, false), "");

         tmp2 = LLVMBuildSelect(ctx->builder, active,
                                ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, false)),
                                tmp2, "");

         active = LLVMBuildOr(
            ctx->builder, active,
            LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, false), ""),
                          LLVMConstInt(ctx->i32, 0x10, false), ""),
            "");
         return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
      } else if (maxprefix > 16) {
         active =
            LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 16, false), "");

         return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
      }
   } else if (ctx->chip_class >= GFX8) {
      /* GFX8/9 have a native whole-wave shift-right DPP mode. */
      return ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
   }

   /* wavefront shift_right by 1 on SI/CI */
   LLVMValueRef active, tmp1, tmp2;
   LLVMValueRef tid = ac_get_thread_id(ctx);
   tmp1 = ac_build_ds_swizzle(ctx, src, (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
   tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x18, 0x03, 0x00));
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x7, 0), ""),
                          LLVMConstInt(ctx->i32, 0x4, 0), "");
   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
   tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x10, 0x07, 0x00));
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0xf, 0), ""),
                          LLVMConstInt(ctx->i32, 0x8, 0), "");
   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
   tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x00, 0x0f, 0x00));
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, 0), ""),
                          LLVMConstInt(ctx->i32, 0x10, 0), "");
   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
   tmp2 = ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, 0));
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, 0), "");
   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 0, 0), "");
   return LLVMBuildSelect(ctx->builder, active, identity, tmp1, "");
}

/**
 * Hillis-Steele style wave scan built from lane shuffles; exclusive scans
 * first shift the input right by one lane with the identity in lane 0.
 *
 * \param maxprefix specifies that the result only needs to be correct for a
 *                  prefix of this many threads
 */
static LLVMValueRef ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src,
                                  LLVMValueRef identity, unsigned maxprefix, bool inclusive)
{
   LLVMValueRef result, tmp;

   if (!inclusive)
      src = ac_wavefront_shift_right_1(ctx, src, identity, maxprefix);

   result = src;

   if (ctx->chip_class <= GFX7) {
      /* SI/CI have no DPP: combine with lanes 1, 2, 4, 8, 16 away using
       * ds_swizzle, then cross the wave halves with a readlane. */
      assert(maxprefix == 64);
      LLVMValueRef tid = ac_get_thread_id(ctx);
      LLVMValueRef active;
      tmp = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x1e, 0x00, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""), ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1c, 0x01, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 2, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x18, 0x03, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 4, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x10, 0x07, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 8, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x00, 0x0f, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, 0));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 32, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      return result;
   }

   /* GFX8+: DPP row shifts for distances up to 8, then per-generation
    * tricks to cross row (16) and half-wave (32) boundaries. */
   if (maxprefix <= 1)
      return result;
   tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 2)
      return result;
   tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 3)
      return result;
   tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 4)
      return result;
   tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 8)
      return result;
   tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 16)
      return result;

   if (ctx->chip_class >= GFX10) {
      LLVMValueRef tid = ac_get_thread_id(ctx);
      LLVMValueRef active;

      tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false);

      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, false), ""),
                             ctx->i32_0, "");

      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");

      result = ac_build_alu_op(ctx, result, tmp, op);

      if (maxprefix <= 32)
         return result;

      tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));

      active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid, LLVMConstInt(ctx->i32, 32, false), "");

      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");

      result = ac_build_alu_op(ctx, result, tmp, op);
      return result;
   }

   tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 32)
      return result;
   tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   return result;
}

/* Inclusive scan of src over the wave with the given reduction op. */
LLVMValueRef ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
   LLVMValueRef result;

   /* Fast path: an i1 add-scan is a popcount of the lanes below+self,
    * done with ballot + mbcnt. */
   if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
      LLVMBuilderRef builder = ctx->builder;
      src = LLVMBuildZExt(builder, src, ctx->i32, "");
      result = ac_build_ballot(ctx, src);
      result = ac_build_mbcnt(ctx, result);
      result = LLVMBuildAdd(builder, result, src, "");
      return result;
   }

   ac_build_optimization_barrier(ctx, &src, false);

   /* Fill inactive lanes with the identity so they do not perturb the
    * scan, then wrap the result in WWM. */
   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
                             LLVMTypeOf(identity), "");
   result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, true);

   return ac_build_wwm(ctx, result);
}

/* Exclusive scan of src over the wave with the given reduction op. */
LLVMValueRef ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
   LLVMValueRef result;

   /* Fast path: an i1 exclusive add-scan is mbcnt of the ballot (count of
    * set lanes strictly below this one). */
   if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
      LLVMBuilderRef builder = ctx->builder;
      src = LLVMBuildZExt(builder, src, ctx->i32, "");
      result = ac_build_ballot(ctx, src);
      result = ac_build_mbcnt(ctx, result);
      return result;
   }

   ac_build_optimization_barrier(ctx, &src, false);

   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
                             LLVMTypeOf(identity), "");
   result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, false);

   return ac_build_wwm(ctx, result);
}

/* Reduce src over clusters of cluster_size lanes (1, 2, 4, ..., wave_size)
 * by doubling the combine distance at each step; each cluster ends with
 * the reduction value in all of its lanes. */
LLVMValueRef ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op,
                             unsigned cluster_size)
{
   if (cluster_size == 1)
      return src;
   ac_build_optimization_barrier(ctx, &src, false);
   LLVMValueRef result, swap;
   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
                             LLVMTypeOf(identity), "");
   swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2);
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 2)
      return ac_build_wwm(ctx, result);

   swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1);
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 4)
      return ac_build_wwm(ctx, result);

   if (ctx->chip_class >= GFX8)
      swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false);
   else
      swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04));
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 8)
      return ac_build_wwm(ctx, result);

   if (ctx->chip_class >= GFX8)
      swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false);
   else
      swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08));
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 16)
      return ac_build_wwm(ctx, result);

   if (ctx->chip_class >= GFX10)
      swap = ac_build_permlane16(ctx, result, 0, true, false);
   else if (ctx->chip_class >= GFX8 && cluster_size != 32)
      swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
   else
      swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 32)
      return ac_build_wwm(ctx, result);

   if (ctx->chip_class >= GFX8) {
      if (ctx->wave_size == 64) {
         if (ctx->chip_class >= GFX10)
            swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
         else
            swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
         result = ac_build_alu_op(ctx, result, swap, op);
         result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
      }

      return ac_build_wwm(ctx, result);
   } else {
      /* SI/CI: combine the two wave halves via readlane. */
      swap = ac_build_readlane(ctx, result, ctx->i32_0);
      result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
      result = ac_build_alu_op(ctx, result, swap, op);
      return ac_build_wwm(ctx, result);
   }
}

/**
 * "Top half" of a scan that reduces per-wave values across an entire
 * workgroup.
 *
 * The source value must be present in the highest lane of the wave, and the
 * highest lane must be live.
 */
void ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
   if (ws->maxwaves <= 1)
      return;

   const LLVMValueRef last_lane = LLVMConstInt(ctx->i32, ctx->wave_size - 1, false);
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef tid = ac_get_thread_id(ctx);
   LLVMValueRef tmp;

   /* The last lane of each wave stores its value into the scratch slot
    * for this wave index. */
   tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, last_lane, "");
   ac_build_ifcc(ctx, tmp, 1000);
   LLVMBuildStore(builder, ws->src, LLVMBuildGEP(builder, ws->scratch, &ws->waveidx, 1, ""));
   ac_build_endif(ctx, 1000);
}

/**
 * "Bottom half" of a scan that reduces per-wave values across an entire
 * workgroup.
 *
 * The caller must place a barrier between the top and bottom halves.
 */
void ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
   const LLVMTypeRef type = LLVMTypeOf(ws->src);
   const LLVMValueRef identity = get_reduction_identity(ctx, ws->op, ac_get_type_size(type));

   if (ws->maxwaves <= 1) {
      ws->result_reduce = ws->src;
      ws->result_inclusive = ws->src;
      ws->result_exclusive = identity;
      return;
   }
   assert(ws->maxwaves <= 32);

   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef tid = ac_get_thread_id(ctx);
   LLVMBasicBlockRef bbs[2];
   LLVMValueRef phivalues_scan[2];
   LLVMValueRef tmp, tmp2;

   bbs[0] = LLVMGetInsertBlock(builder);
   phivalues_scan[0] = LLVMGetUndef(type);

   /* Lane i loads wave i's value; how many lanes participate depends on
    * which results are requested. */
   if (ws->enable_reduce)
      tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, "");
   else if (ws->enable_inclusive)
      tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, "");
   else
      tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, "");
   ac_build_ifcc(ctx, tmp, 1001);
   {
      tmp = LLVMBuildLoad(builder, LLVMBuildGEP(builder, ws->scratch, &tid, 1, ""), "");

      ac_build_optimization_barrier(ctx, &tmp, false);

      bbs[1] = LLVMGetInsertBlock(builder);
      phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves, true);
   }
   ac_build_endif(ctx, 1001);

   const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs);

   if (ws->enable_reduce) {
      tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, "");
      ws->result_reduce = ac_build_readlane(ctx, scan, tmp);
   }
   if (ws->enable_inclusive)
      ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx);
   if (ws->enable_exclusive) {
      tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, "");
      tmp = ac_build_readlane(ctx, scan, tmp);
      tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, "");
      ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, "");
   }
}

/**
 * Inclusive scan of a per-wave value across an entire workgroup.
 *
 * This implies an s_barrier instruction.
 *
 * Unlike ac_build_inclusive_scan, the caller \em must ensure that all threads
 * of the workgroup are live. (This requirement cannot easily be relaxed in a
 * useful manner because of the barrier in the algorithm.)
 */
void ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
   ac_build_wg_wavescan_top(ctx, ws);
   ac_build_s_barrier(ctx);
   ac_build_wg_wavescan_bottom(ctx, ws);
}

/**
 * "Top half" of a scan that reduces per-thread values across an entire
 * workgroup.
 *
 * All lanes must be active when this code runs.
 */
void ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
   if (ws->enable_exclusive) {
      ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op);
      if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd)
         ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, "");
      ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op);
   } else {
      ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op);
   }

   /* The wave-level scan below needs the per-wave total in the top lane,
    * which only the inclusive/exclusive path guarantees, so temporarily
    * force the exclusive variant on. */
   bool enable_inclusive = ws->enable_inclusive;
   bool enable_exclusive = ws->enable_exclusive;
   ws->enable_inclusive = false;
   ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
   ac_build_wg_wavescan_top(ctx, ws);
   ws->enable_inclusive = enable_inclusive;
   ws->enable_exclusive = enable_exclusive;
}

/**
 * "Bottom half" of a scan that reduces per-thread values across an entire
 * workgroup.
 *
 * The caller must place a barrier between the top and bottom halves.
 */
void ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
   /* Mirror the flag adjustment done in ac_build_wg_scan_top. */
   bool enable_inclusive = ws->enable_inclusive;
   bool enable_exclusive = ws->enable_exclusive;
   ws->enable_inclusive = false;
   ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
   ac_build_wg_wavescan_bottom(ctx, ws);
   ws->enable_inclusive = enable_inclusive;
   ws->enable_exclusive = enable_exclusive;

   /* ws->result_reduce is already the correct value */
   if (ws->enable_inclusive)
      ws->result_inclusive = ac_build_alu_op(ctx, ws->result_inclusive, ws->src, ws->op);
   if (ws->enable_exclusive)
      ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op);
}

/**
 * A scan that reduces per-thread values across an entire workgroup.
 *
 * The caller must ensure that all lanes are active when this code runs
 * (WWM is insufficient!), because there is an implied barrier.
 */
void ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
   ac_build_wg_scan_top(ctx, ws);
   ac_build_s_barrier(ctx);
   ac_build_wg_scan_bottom(ctx, ws);
}

/* Permute the four lanes of each quad according to lane0..lane3, via DPP
 * quad_perm on GFX8+ or ds_swizzle quad mode (bit 15 set) on SI/CI. */
LLVMValueRef ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned lane0,
                                   unsigned lane1, unsigned lane2, unsigned lane3)
{
   unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3);
   if (ctx->chip_class >= GFX8) {
      return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false);
   } else {
      return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask);
   }
}

/* Read src from the lane given by `index` using ds_bpermute; the index is
 * scaled by 4 first (the intrinsic takes a byte address). */
LLVMValueRef ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index)
{
   LLVMTypeRef type = LLVMTypeOf(src);
   LLVMValueRef result;

   index = LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   result =
      ac_build_intrinsic(ctx, "llvm.amdgcn.ds.bpermute", ctx->i32, (LLVMValueRef[]){index, src}, 2,
                         AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
   return LLVMBuildTrunc(ctx->builder, result, type, "");
}

/* frexp exponent via llvm.amdgcn.frexp.exp, selected by float bitsize
 * (f64 also returns an i32 exponent). */
LLVMValueRef ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   char *intr;

   if (bitsize == 16) {
      intr = "llvm.amdgcn.frexp.exp.i16.f16";
      type = ctx->i16;
   } else if (bitsize == 32) {
      intr = "llvm.amdgcn.frexp.exp.i32.f32";
      type = ctx->i32;
   } else {
      intr = "llvm.amdgcn.frexp.exp.i32.f64";
      type = ctx->i32;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
}

/* frexp mantissa via llvm.amdgcn.frexp.mant, selected by float bitsize. */
LLVMValueRef ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   char *intr;

   if (bitsize == 16) {
      intr = "llvm.amdgcn.frexp.mant.f16";
      type = ctx->f16;
   } else if (bitsize == 32) {
      intr = "llvm.amdgcn.frexp.mant.f32";
      type = ctx->f32;
   } else {
      intr = "llvm.amdgcn.frexp.mant.f64";
      type = ctx->f64;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
}

/* llvm.canonicalize on a float of the given bitsize. */
LLVMValueRef ac_build_canonicalize(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   char *intr;

   if (bitsize == 16) {
      intr = "llvm.canonicalize.f16";
      type = ctx->f16;
   } else if (bitsize == 32) {
      intr = "llvm.canonicalize.f32";
      type = ctx->f32;
   } else {
      intr = "llvm.canonicalize.f64";
      type = ctx->f64;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
}

/*
 * this takes an I,J coordinate pair,
 * and works out the X and Y derivatives.
 * it returns DDX(I), DDX(J), DDY(I), DDY(J).
 */
LLVMValueRef ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij)
{
   LLVMValueRef result[4], a;
   unsigned i;

   for (i = 0; i < 2; i++) {
      a = LLVMBuildExtractElement(ctx->builder, interp_ij, LLVMConstInt(ctx->i32, i, false), "");
      result[i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, a);
      result[2 + i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 2, a);
   }
   return ac_build_gather_values(ctx, result, 4);
}

/* Returns i1 true in helper invocations: NOT of the live/ps.live mask
 * (llvm.amdgcn.live.mask on LLVM 13+, llvm.amdgcn.ps.live before). */
LLVMValueRef ac_build_load_helper_invocation(struct ac_llvm_context *ctx)
{
   LLVMValueRef result;

   if (LLVM_VERSION_MAJOR >= 13) {
      result = ac_build_intrinsic(ctx, "llvm.amdgcn.live.mask", ctx->i1, NULL, 0,
                                  AC_FUNC_ATTR_READONLY | AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
   } else {
      result = ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", ctx->i1, NULL, 0,
                                  AC_FUNC_ATTR_READNONE);
   }
   return LLVMBuildNot(ctx->builder, result, "");
}

/* Like ac_build_load_helper_invocation, but accounts for lanes whose kill
 * was postponed (pre-LLVM-13 path with ctx->postponed_kill set). */
LLVMValueRef ac_build_is_helper_invocation(struct ac_llvm_context *ctx)
{
   if (!ctx->postponed_kill)
      return ac_build_load_helper_invocation(ctx);

   /* postponed_kill should be NULL on LLVM 13+ */
   assert(LLVM_VERSION_MAJOR < 13);

   /* !(exact && postponed) */
   LLVMValueRef exact =
      ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", ctx->i1, NULL, 0, AC_FUNC_ATTR_READNONE);

   LLVMValueRef postponed = LLVMBuildLoad(ctx->builder, ctx->postponed_kill, "");
   return LLVMBuildNot(ctx->builder, LLVMBuildAnd(ctx->builder, exact, postponed, ""), "");
}

/* Emit a call to func, propagating func's calling convention onto the
 * call instruction. */
LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func, LLVMValueRef *args,
                           unsigned num_args)
{
   LLVMValueRef ret = LLVMBuildCall(ctx->builder, func, args, num_args, "");
   LLVMSetInstructionCallConv(ret, LLVMGetFunctionCallConv(func));
   return ret;
}

/* Fill in the MRTZ export (depth/stencil/samplemask) in *args, choosing
 * the packed 16-bit or the plain 32-bit layout based on the SPI shader Z
 * format. Any of depth/stencil/samplemask may be NULL (at least one must
 * be set). */
void ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, LLVMValueRef stencil,
                     LLVMValueRef samplemask, struct ac_export_args *args)
{
   unsigned mask = 0;
   unsigned format = ac_get_spi_shader_z_format(depth != NULL, stencil != NULL, samplemask != NULL);

   assert(depth || stencil || samplemask);

   memset(args, 0, sizeof(*args));

   args->valid_mask = 1; /* whether the EXEC mask is valid */
   args->done = 1;       /* DONE bit */

   /* Specify the target we are exporting */
   args->target = V_008DFC_SQ_EXP_MRTZ;

   args->compr = 0;                       /* COMP flag */
   args->out[0] = LLVMGetUndef(ctx->f32); /* R, depth */
   args->out[1] = LLVMGetUndef(ctx->f32); /* G, stencil test val[0:7], stencil op val[8:15] */
   args->out[2] = LLVMGetUndef(ctx->f32); /* B, sample mask */
   args->out[3] = LLVMGetUndef(ctx->f32); /* A, alpha to mask */

   if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
      assert(!depth);
      args->compr = 1; /* COMPR flag */

      if (stencil) {
         /* Stencil should be in X[23:16]. */
         stencil = ac_to_integer(ctx, stencil);
         stencil = LLVMBuildShl(ctx->builder, stencil, LLVMConstInt(ctx->i32, 16, 0), "");
         args->out[0] = ac_to_float(ctx, stencil);
         mask |= 0x3;
      }
      if (samplemask) {
         /* SampleMask should be in Y[15:0]. */
         args->out[1] = samplemask;
         mask |= 0xc;
      }
   } else {
      if (depth) {
         args->out[0] = depth;
         mask |= 0x1;
      }
      if (stencil) {
         args->out[1] = stencil;
         mask |= 0x2;
      }
      if (samplemask) {
         args->out[2] = samplemask;
         mask |= 0x4;
      }
   }

   /* GFX6 (except OLAND and HAINAN) has a bug that it only looks
    * at the X writemask component. */
   if (ctx->chip_class == GFX6 && ctx->family != CHIP_OLAND && ctx->family != CHIP_HAINAN)
      mask |= 0x1;

   /* Specify which components to enable */
   args->enabled_channels = mask;
}

/* Send GS Alloc Req message from the first wave of the group to SPI.
 * Message payload is:
 * - bits 0..10: vertices in group
 * - bits 12..22: primitives in group
 */
void ac_build_sendmsg_gs_alloc_req(struct ac_llvm_context *ctx, LLVMValueRef wave_id,
                                   LLVMValueRef vtx_cnt, LLVMValueRef prim_cnt)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef tmp;
   bool export_dummy_prim = false;

   /* HW workaround for a GPU hang with 100% culling.
    * We always have to export at least 1 primitive.
    * Export a degenerate triangle using vertex 0 for all 3 vertices.
    */
   if (prim_cnt == ctx->i32_0 && ctx->chip_class == GFX10) {
      assert(vtx_cnt == ctx->i32_0);
      prim_cnt = ctx->i32_1;
      vtx_cnt = ctx->i32_1;
      export_dummy_prim = true;
   }

   ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, wave_id, ctx->i32_0, ""), 5020);

   tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->i32, 12, false), "");
   tmp = LLVMBuildOr(builder, tmp, vtx_cnt, "");
   ac_build_sendmsg(ctx, AC_SENDMSG_GS_ALLOC_REQ, tmp);

   if (export_dummy_prim) {
      struct ac_ngg_prim prim = {0};
      /* The vertex indices are 0,0,0. */
      prim.passthrough = ctx->i32_0;

      struct ac_export_args pos = {0};
      /* The hw culls primitives with NaN. */
      pos.out[0] = pos.out[1] = pos.out[2] = pos.out[3] = LLVMConstReal(ctx->f32, NAN);
      pos.target = V_008DFC_SQ_EXP_POS;
      pos.enabled_channels = 0xf;
      pos.done = true;

      ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(ctx), ctx->i32_0, ""),
                    5021);
      ac_build_export_prim(ctx, &prim);
      ac_build_export(ctx, &pos);
      ac_build_endif(ctx, 5021);
   }

   ac_build_endif(ctx, 5020);
}


LLVMValueRef ac_pack_edgeflags_for_export(struct ac_llvm_context *ctx,
                                          const struct ac_shader_args *args)
{
   /* Use the following trick to extract the edge flags:
    * extracted = v_and_b32 gs_invocation_id, 0x700 ; get edge flags at bits 8, 9, 10
    * shifted = v_mul_u32_u24 extracted, 0x80402u ; shift the bits: 8->9, 9->19, 10->29
    * result = v_and_b32 shifted, 0x20080200 ; remove garbage
    */
   LLVMValueRef tmp = LLVMBuildAnd(ctx->builder,
                                   ac_get_arg(ctx, args->gs_invocation_id),
                                   LLVMConstInt(ctx->i32, 0x700, 0), "");
   tmp = LLVMBuildMul(ctx->builder, tmp, LLVMConstInt(ctx->i32, 0x80402u, 0), "");
   return LLVMBuildAnd(ctx->builder, tmp, LLVMConstInt(ctx->i32, 0x20080200, 0), "");
}

LLVMValueRef ac_pack_prim_export(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim)
{
   /* The prim export format is:
    * - bits 0..8: index 0
    * - bit 9: edge flag 0
    * - bits 10..18: index 1
    * - bit 19: edge flag 1
    * - bits 20..28: index 2
    * - bit 29: edge flag 2
    * - bit 31: null primitive (skip)
    */
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef tmp = LLVMBuildZExt(builder, prim->isnull, ctx->i32, "");
   LLVMValueRef result = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 31, false), "");
   result = LLVMBuildOr(ctx->builder, result, prim->edgeflags, "");

   for (unsigned i = 0; i < prim->num_vertices; ++i) {
      tmp = LLVMBuildShl(builder, prim->index[i], LLVMConstInt(ctx->i32, 10 *
i, false), ""); 4637 result = LLVMBuildOr(builder, result, tmp, ""); 4638 } 4639 return result; 4640} 4641 4642void ac_build_export_prim(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim) 4643{ 4644 struct ac_export_args args; 4645 4646 if (prim->passthrough) { 4647 args.out[0] = prim->passthrough; 4648 } else { 4649 args.out[0] = ac_pack_prim_export(ctx, prim); 4650 } 4651 4652 args.out[0] = LLVMBuildBitCast(ctx->builder, args.out[0], ctx->f32, ""); 4653 args.out[1] = LLVMGetUndef(ctx->f32); 4654 args.out[2] = LLVMGetUndef(ctx->f32); 4655 args.out[3] = LLVMGetUndef(ctx->f32); 4656 4657 args.target = V_008DFC_SQ_EXP_PRIM; 4658 args.enabled_channels = 1; 4659 args.done = true; 4660 args.valid_mask = false; 4661 args.compr = false; 4662 4663 ac_build_export(ctx, &args); 4664} 4665 4666static LLVMTypeRef arg_llvm_type(enum ac_arg_type type, unsigned size, struct ac_llvm_context *ctx) 4667{ 4668 if (type == AC_ARG_FLOAT) { 4669 return size == 1 ? ctx->f32 : LLVMVectorType(ctx->f32, size); 4670 } else if (type == AC_ARG_INT) { 4671 return size == 1 ? 
ctx->i32 : LLVMVectorType(ctx->i32, size); 4672 } else { 4673 LLVMTypeRef ptr_type; 4674 switch (type) { 4675 case AC_ARG_CONST_PTR: 4676 ptr_type = ctx->i8; 4677 break; 4678 case AC_ARG_CONST_FLOAT_PTR: 4679 ptr_type = ctx->f32; 4680 break; 4681 case AC_ARG_CONST_PTR_PTR: 4682 ptr_type = ac_array_in_const32_addr_space(ctx->i8); 4683 break; 4684 case AC_ARG_CONST_DESC_PTR: 4685 ptr_type = ctx->v4i32; 4686 break; 4687 case AC_ARG_CONST_IMAGE_PTR: 4688 ptr_type = ctx->v8i32; 4689 break; 4690 default: 4691 unreachable("unknown arg type"); 4692 } 4693 if (size == 1) { 4694 return ac_array_in_const32_addr_space(ptr_type); 4695 } else { 4696 assert(size == 2); 4697 return ac_array_in_const_addr_space(ptr_type); 4698 } 4699 } 4700} 4701 4702LLVMValueRef ac_build_main(const struct ac_shader_args *args, struct ac_llvm_context *ctx, 4703 enum ac_llvm_calling_convention convention, const char *name, 4704 LLVMTypeRef ret_type, LLVMModuleRef module) 4705{ 4706 LLVMTypeRef arg_types[AC_MAX_ARGS]; 4707 4708 for (unsigned i = 0; i < args->arg_count; i++) { 4709 arg_types[i] = arg_llvm_type(args->args[i].type, args->args[i].size, ctx); 4710 } 4711 4712 LLVMTypeRef main_function_type = LLVMFunctionType(ret_type, arg_types, args->arg_count, 0); 4713 4714 LLVMValueRef main_function = LLVMAddFunction(module, name, main_function_type); 4715 LLVMBasicBlockRef main_function_body = 4716 LLVMAppendBasicBlockInContext(ctx->context, main_function, "main_body"); 4717 LLVMPositionBuilderAtEnd(ctx->builder, main_function_body); 4718 4719 LLVMSetFunctionCallConv(main_function, convention); 4720 for (unsigned i = 0; i < args->arg_count; ++i) { 4721 LLVMValueRef P = LLVMGetParam(main_function, i); 4722 4723 if (args->args[i].file != AC_ARG_SGPR) 4724 continue; 4725 4726 ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_INREG); 4727 4728 if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) { 4729 ac_add_function_attr(ctx->context, main_function, i + 1, 
AC_FUNC_ATTR_NOALIAS); 4730 ac_add_attr_dereferenceable(P, UINT64_MAX); 4731 ac_add_attr_alignment(P, 4); 4732 } 4733 } 4734 4735 ctx->main_function = main_function; 4736 4737 /* Enable denormals for FP16 and FP64: */ 4738 LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math", "ieee,ieee"); 4739 /* Disable denormals for FP32: */ 4740 LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math-f32", 4741 "preserve-sign,preserve-sign"); 4742 return main_function; 4743} 4744 4745void ac_build_s_endpgm(struct ac_llvm_context *ctx) 4746{ 4747 LLVMTypeRef calltype = LLVMFunctionType(ctx->voidt, NULL, 0, false); 4748 LLVMValueRef code = LLVMConstInlineAsm(calltype, "s_endpgm", "", true, false); 4749 LLVMBuildCall(ctx->builder, code, NULL, 0, ""); 4750} 4751 4752/** 4753 * Convert triangle strip indices to triangle indices. This is used to decompose 4754 * triangle strips into triangles. 4755 */ 4756void ac_build_triangle_strip_indices_to_triangle(struct ac_llvm_context *ctx, LLVMValueRef is_odd, 4757 LLVMValueRef flatshade_first, 4758 LLVMValueRef index[3]) 4759{ 4760 LLVMBuilderRef builder = ctx->builder; 4761 LLVMValueRef out[3]; 4762 4763 /* We need to change the vertex order for odd triangles to get correct 4764 * front/back facing by swapping 2 vertex indices, but we also have to 4765 * keep the provoking vertex in the same place. 4766 * 4767 * If the first vertex is provoking, swap index 1 and 2. 4768 * If the last vertex is provoking, swap index 0 and 1. 
4769 */ 4770 out[0] = LLVMBuildSelect(builder, flatshade_first, index[0], 4771 LLVMBuildSelect(builder, is_odd, index[1], index[0], ""), ""); 4772 out[1] = LLVMBuildSelect(builder, flatshade_first, 4773 LLVMBuildSelect(builder, is_odd, index[2], index[1], ""), 4774 LLVMBuildSelect(builder, is_odd, index[0], index[1], ""), ""); 4775 out[2] = LLVMBuildSelect(builder, flatshade_first, 4776 LLVMBuildSelect(builder, is_odd, index[1], index[2], ""), index[2], ""); 4777 memcpy(index, out, sizeof(out)); 4778} 4779