ac_llvm_build.c revision b8e80941
/*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 */
/* based on pieces from si_pipe.c and radeon_llvm_emit.c */
#include "ac_llvm_build.h"

#include <llvm-c/Core.h>

#include "c11/threads.h"

#include <assert.h>
#include <stdio.h>

#include "ac_llvm_util.h"
#include "ac_exp_param.h"
#include "util/bitscan.h"
#include "util/macros.h"
#include "util/u_atomic.h"
#include "util/u_math.h"
#include "sid.h"

#include "shader_enums.h"

/* NOTE(review): presumably the initial capacity of the ctx->flow stack that
 * ac_llvm_context_dispose() frees; the user of this constant is not in this
 * part of the file -- confirm.
 */
#define AC_LLVM_INITIAL_CF_DEPTH 4

/* Data for if/else/endif and bgnloop/endloop control flow structures.
 */
struct ac_llvm_flow {
	/* Loop exit or next part of if/else/endif. */
	LLVMBasicBlockRef next_block;
	/* NOTE(review): presumably the block a loop branches back to, and
	 * unused for if/else/endif -- confirm against the control-flow
	 * helpers that fill this struct.
	 */
	LLVMBasicBlockRef loop_entry_block;
};

/* Initialize module-independent parts of the context.
56 * 57 * The caller is responsible for initializing ctx::module and ctx::builder. 58 */ 59void 60ac_llvm_context_init(struct ac_llvm_context *ctx, 61 enum chip_class chip_class, enum radeon_family family) 62{ 63 LLVMValueRef args[1]; 64 65 ctx->context = LLVMContextCreate(); 66 67 ctx->chip_class = chip_class; 68 ctx->family = family; 69 ctx->module = NULL; 70 ctx->builder = NULL; 71 72 ctx->voidt = LLVMVoidTypeInContext(ctx->context); 73 ctx->i1 = LLVMInt1TypeInContext(ctx->context); 74 ctx->i8 = LLVMInt8TypeInContext(ctx->context); 75 ctx->i16 = LLVMIntTypeInContext(ctx->context, 16); 76 ctx->i32 = LLVMIntTypeInContext(ctx->context, 32); 77 ctx->i64 = LLVMIntTypeInContext(ctx->context, 64); 78 ctx->intptr = ctx->i32; 79 ctx->f16 = LLVMHalfTypeInContext(ctx->context); 80 ctx->f32 = LLVMFloatTypeInContext(ctx->context); 81 ctx->f64 = LLVMDoubleTypeInContext(ctx->context); 82 ctx->v2i16 = LLVMVectorType(ctx->i16, 2); 83 ctx->v2i32 = LLVMVectorType(ctx->i32, 2); 84 ctx->v3i32 = LLVMVectorType(ctx->i32, 3); 85 ctx->v4i32 = LLVMVectorType(ctx->i32, 4); 86 ctx->v2f32 = LLVMVectorType(ctx->f32, 2); 87 ctx->v4f32 = LLVMVectorType(ctx->f32, 4); 88 ctx->v8i32 = LLVMVectorType(ctx->i32, 8); 89 90 ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false); 91 ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false); 92 ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false); 93 ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false); 94 ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false); 95 ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false); 96 ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false); 97 ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false); 98 ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0); 99 ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0); 100 ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0); 101 ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0); 102 ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0); 103 ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0); 104 105 ctx->i1false = LLVMConstInt(ctx->i1, 0, false); 106 ctx->i1true = LLVMConstInt(ctx->i1, 1, false); 107 108 
ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context, 109 "range", 5); 110 111 ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context, 112 "invariant.load", 14); 113 114 ctx->fpmath_md_kind = LLVMGetMDKindIDInContext(ctx->context, "fpmath", 6); 115 116 args[0] = LLVMConstReal(ctx->f32, 2.5); 117 ctx->fpmath_md_2p5_ulp = LLVMMDNodeInContext(ctx->context, args, 1); 118 119 ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context, 120 "amdgpu.uniform", 14); 121 122 ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0); 123} 124 125void 126ac_llvm_context_dispose(struct ac_llvm_context *ctx) 127{ 128 free(ctx->flow); 129 ctx->flow = NULL; 130 ctx->flow_depth_max = 0; 131} 132 133int 134ac_get_llvm_num_components(LLVMValueRef value) 135{ 136 LLVMTypeRef type = LLVMTypeOf(value); 137 unsigned num_components = LLVMGetTypeKind(type) == LLVMVectorTypeKind 138 ? LLVMGetVectorSize(type) 139 : 1; 140 return num_components; 141} 142 143LLVMValueRef 144ac_llvm_extract_elem(struct ac_llvm_context *ac, 145 LLVMValueRef value, 146 int index) 147{ 148 if (LLVMGetTypeKind(LLVMTypeOf(value)) != LLVMVectorTypeKind) { 149 assert(index == 0); 150 return value; 151 } 152 153 return LLVMBuildExtractElement(ac->builder, value, 154 LLVMConstInt(ac->i32, index, false), ""); 155} 156 157int 158ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type) 159{ 160 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) 161 type = LLVMGetElementType(type); 162 163 if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind) 164 return LLVMGetIntTypeWidth(type); 165 166 if (type == ctx->f16) 167 return 16; 168 if (type == ctx->f32) 169 return 32; 170 if (type == ctx->f64) 171 return 64; 172 173 unreachable("Unhandled type kind in get_elem_bits"); 174} 175 176unsigned 177ac_get_type_size(LLVMTypeRef type) 178{ 179 LLVMTypeKind kind = LLVMGetTypeKind(type); 180 181 switch (kind) { 182 case LLVMIntegerTypeKind: 183 return LLVMGetIntTypeWidth(type) / 8; 184 case LLVMHalfTypeKind: 185 
return 2; 186 case LLVMFloatTypeKind: 187 return 4; 188 case LLVMDoubleTypeKind: 189 return 8; 190 case LLVMPointerTypeKind: 191 if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT) 192 return 4; 193 return 8; 194 case LLVMVectorTypeKind: 195 return LLVMGetVectorSize(type) * 196 ac_get_type_size(LLVMGetElementType(type)); 197 case LLVMArrayTypeKind: 198 return LLVMGetArrayLength(type) * 199 ac_get_type_size(LLVMGetElementType(type)); 200 default: 201 assert(0); 202 return 0; 203 } 204} 205 206static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t) 207{ 208 if (t == ctx->i8) 209 return ctx->i8; 210 else if (t == ctx->f16 || t == ctx->i16) 211 return ctx->i16; 212 else if (t == ctx->f32 || t == ctx->i32) 213 return ctx->i32; 214 else if (t == ctx->f64 || t == ctx->i64) 215 return ctx->i64; 216 else 217 unreachable("Unhandled integer size"); 218} 219 220LLVMTypeRef 221ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t) 222{ 223 if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) { 224 LLVMTypeRef elem_type = LLVMGetElementType(t); 225 return LLVMVectorType(to_integer_type_scalar(ctx, elem_type), 226 LLVMGetVectorSize(t)); 227 } 228 if (LLVMGetTypeKind(t) == LLVMPointerTypeKind) { 229 switch (LLVMGetPointerAddressSpace(t)) { 230 case AC_ADDR_SPACE_GLOBAL: 231 return ctx->i64; 232 case AC_ADDR_SPACE_LDS: 233 return ctx->i32; 234 default: 235 unreachable("unhandled address space"); 236 } 237 } 238 return to_integer_type_scalar(ctx, t); 239} 240 241LLVMValueRef 242ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v) 243{ 244 LLVMTypeRef type = LLVMTypeOf(v); 245 if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) { 246 return LLVMBuildPtrToInt(ctx->builder, v, ac_to_integer_type(ctx, type), ""); 247 } 248 return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), ""); 249} 250 251LLVMValueRef 252ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v) 253{ 254 LLVMTypeRef type = LLVMTypeOf(v); 
255 if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) 256 return v; 257 return ac_to_integer(ctx, v); 258} 259 260static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t) 261{ 262 if (t == ctx->i8) 263 return ctx->i8; 264 else if (t == ctx->i16 || t == ctx->f16) 265 return ctx->f16; 266 else if (t == ctx->i32 || t == ctx->f32) 267 return ctx->f32; 268 else if (t == ctx->i64 || t == ctx->f64) 269 return ctx->f64; 270 else 271 unreachable("Unhandled float size"); 272} 273 274LLVMTypeRef 275ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t) 276{ 277 if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) { 278 LLVMTypeRef elem_type = LLVMGetElementType(t); 279 return LLVMVectorType(to_float_type_scalar(ctx, elem_type), 280 LLVMGetVectorSize(t)); 281 } 282 return to_float_type_scalar(ctx, t); 283} 284 285LLVMValueRef 286ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v) 287{ 288 LLVMTypeRef type = LLVMTypeOf(v); 289 return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), ""); 290} 291 292 293LLVMValueRef 294ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name, 295 LLVMTypeRef return_type, LLVMValueRef *params, 296 unsigned param_count, unsigned attrib_mask) 297{ 298 LLVMValueRef function, call; 299 bool set_callsite_attrs = !(attrib_mask & AC_FUNC_ATTR_LEGACY); 300 301 function = LLVMGetNamedFunction(ctx->module, name); 302 if (!function) { 303 LLVMTypeRef param_types[32], function_type; 304 unsigned i; 305 306 assert(param_count <= 32); 307 308 for (i = 0; i < param_count; ++i) { 309 assert(params[i]); 310 param_types[i] = LLVMTypeOf(params[i]); 311 } 312 function_type = 313 LLVMFunctionType(return_type, param_types, param_count, 0); 314 function = LLVMAddFunction(ctx->module, name, function_type); 315 316 LLVMSetFunctionCallConv(function, LLVMCCallConv); 317 LLVMSetLinkage(function, LLVMExternalLinkage); 318 319 if (!set_callsite_attrs) 320 ac_add_func_attributes(ctx->context, function, attrib_mask); 321 } 
322 323 call = LLVMBuildCall(ctx->builder, function, params, param_count, ""); 324 if (set_callsite_attrs) 325 ac_add_func_attributes(ctx->context, call, attrib_mask); 326 return call; 327} 328 329/** 330 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with 331 * intrinsic names). 332 */ 333void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize) 334{ 335 LLVMTypeRef elem_type = type; 336 337 assert(bufsize >= 8); 338 339 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) { 340 int ret = snprintf(buf, bufsize, "v%u", 341 LLVMGetVectorSize(type)); 342 if (ret < 0) { 343 char *type_name = LLVMPrintTypeToString(type); 344 fprintf(stderr, "Error building type name for: %s\n", 345 type_name); 346 return; 347 } 348 elem_type = LLVMGetElementType(type); 349 buf += ret; 350 bufsize -= ret; 351 } 352 switch (LLVMGetTypeKind(elem_type)) { 353 default: break; 354 case LLVMIntegerTypeKind: 355 snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type)); 356 break; 357 case LLVMHalfTypeKind: 358 snprintf(buf, bufsize, "f16"); 359 break; 360 case LLVMFloatTypeKind: 361 snprintf(buf, bufsize, "f32"); 362 break; 363 case LLVMDoubleTypeKind: 364 snprintf(buf, bufsize, "f64"); 365 break; 366 } 367} 368 369/** 370 * Helper function that builds an LLVM IR PHI node and immediately adds 371 * incoming edges. 
372 */ 373LLVMValueRef 374ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, 375 unsigned count_incoming, LLVMValueRef *values, 376 LLVMBasicBlockRef *blocks) 377{ 378 LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, ""); 379 LLVMAddIncoming(phi, values, blocks, count_incoming); 380 return phi; 381} 382 383void ac_build_s_barrier(struct ac_llvm_context *ctx) 384{ 385 ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL, 386 0, AC_FUNC_ATTR_CONVERGENT); 387} 388 389/* Prevent optimizations (at least of memory accesses) across the current 390 * point in the program by emitting empty inline assembly that is marked as 391 * having side effects. 392 * 393 * Optionally, a value can be passed through the inline assembly to prevent 394 * LLVM from hoisting calls to ReadNone functions. 395 */ 396void 397ac_build_optimization_barrier(struct ac_llvm_context *ctx, 398 LLVMValueRef *pvgpr) 399{ 400 static int counter = 0; 401 402 LLVMBuilderRef builder = ctx->builder; 403 char code[16]; 404 405 snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter)); 406 407 if (!pvgpr) { 408 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false); 409 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false); 410 LLVMBuildCall(builder, inlineasm, NULL, 0, ""); 411 } else { 412 LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false); 413 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false); 414 LLVMValueRef vgpr = *pvgpr; 415 LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr); 416 unsigned vgpr_size = ac_get_type_size(vgpr_type); 417 LLVMValueRef vgpr0; 418 419 assert(vgpr_size % 4 == 0); 420 421 vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), ""); 422 vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, ""); 423 vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, ""); 424 vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, ""); 425 vgpr = 
LLVMBuildBitCast(builder, vgpr, vgpr_type, ""); 426 427 *pvgpr = vgpr; 428 } 429} 430 431LLVMValueRef 432ac_build_shader_clock(struct ac_llvm_context *ctx) 433{ 434 LLVMValueRef tmp = ac_build_intrinsic(ctx, "llvm.readcyclecounter", 435 ctx->i64, NULL, 0, 0); 436 return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, ""); 437} 438 439LLVMValueRef 440ac_build_ballot(struct ac_llvm_context *ctx, 441 LLVMValueRef value) 442{ 443 LLVMValueRef args[3] = { 444 value, 445 ctx->i32_0, 446 LLVMConstInt(ctx->i32, LLVMIntNE, 0) 447 }; 448 449 /* We currently have no other way to prevent LLVM from lifting the icmp 450 * calls to a dominating basic block. 451 */ 452 ac_build_optimization_barrier(ctx, &args[0]); 453 454 args[0] = ac_to_integer(ctx, args[0]); 455 456 return ac_build_intrinsic(ctx, 457 "llvm.amdgcn.icmp.i32", 458 ctx->i64, args, 3, 459 AC_FUNC_ATTR_NOUNWIND | 460 AC_FUNC_ATTR_READNONE | 461 AC_FUNC_ATTR_CONVERGENT); 462} 463 464LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx, 465 LLVMValueRef value) 466{ 467 LLVMValueRef args[3] = { 468 value, 469 ctx->i1false, 470 LLVMConstInt(ctx->i32, LLVMIntNE, 0), 471 }; 472 473 assert(HAVE_LLVM >= 0x0800); 474 return ac_build_intrinsic(ctx, "llvm.amdgcn.icmp.i1", ctx->i64, args, 3, 475 AC_FUNC_ATTR_NOUNWIND | 476 AC_FUNC_ATTR_READNONE | 477 AC_FUNC_ATTR_CONVERGENT); 478} 479 480LLVMValueRef 481ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value) 482{ 483 LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1); 484 LLVMValueRef vote_set = ac_build_ballot(ctx, value); 485 return LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, ""); 486} 487 488LLVMValueRef 489ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value) 490{ 491 LLVMValueRef vote_set = ac_build_ballot(ctx, value); 492 return LLVMBuildICmp(ctx->builder, LLVMIntNE, vote_set, 493 LLVMConstInt(ctx->i64, 0, 0), ""); 494} 495 496LLVMValueRef 497ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value) 498{ 499 
LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1); 500 LLVMValueRef vote_set = ac_build_ballot(ctx, value); 501 502 LLVMValueRef all = LLVMBuildICmp(ctx->builder, LLVMIntEQ, 503 vote_set, active_set, ""); 504 LLVMValueRef none = LLVMBuildICmp(ctx->builder, LLVMIntEQ, 505 vote_set, 506 LLVMConstInt(ctx->i64, 0, 0), ""); 507 return LLVMBuildOr(ctx->builder, all, none, ""); 508} 509 510LLVMValueRef 511ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values, 512 unsigned value_count, unsigned component) 513{ 514 LLVMValueRef vec = NULL; 515 516 if (value_count == 1) { 517 return values[component]; 518 } else if (!value_count) 519 unreachable("value_count is 0"); 520 521 for (unsigned i = component; i < value_count + component; i++) { 522 LLVMValueRef value = values[i]; 523 524 if (i == component) 525 vec = LLVMGetUndef( LLVMVectorType(LLVMTypeOf(value), value_count)); 526 LLVMValueRef index = LLVMConstInt(ctx->i32, i - component, false); 527 vec = LLVMBuildInsertElement(ctx->builder, vec, value, index, ""); 528 } 529 return vec; 530} 531 532LLVMValueRef 533ac_build_gather_values_extended(struct ac_llvm_context *ctx, 534 LLVMValueRef *values, 535 unsigned value_count, 536 unsigned value_stride, 537 bool load, 538 bool always_vector) 539{ 540 LLVMBuilderRef builder = ctx->builder; 541 LLVMValueRef vec = NULL; 542 unsigned i; 543 544 if (value_count == 1 && !always_vector) { 545 if (load) 546 return LLVMBuildLoad(builder, values[0], ""); 547 return values[0]; 548 } else if (!value_count) 549 unreachable("value_count is 0"); 550 551 for (i = 0; i < value_count; i++) { 552 LLVMValueRef value = values[i * value_stride]; 553 if (load) 554 value = LLVMBuildLoad(builder, value, ""); 555 556 if (!i) 557 vec = LLVMGetUndef( LLVMVectorType(LLVMTypeOf(value), value_count)); 558 LLVMValueRef index = LLVMConstInt(ctx->i32, i, false); 559 vec = LLVMBuildInsertElement(builder, vec, value, index, ""); 560 } 561 return vec; 562} 563 564LLVMValueRef 
565ac_build_gather_values(struct ac_llvm_context *ctx, 566 LLVMValueRef *values, 567 unsigned value_count) 568{ 569 return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false); 570} 571 572/* Expand a scalar or vector to <dst_channels x type> by filling the remaining 573 * channels with undef. Extract at most src_channels components from the input. 574 */ 575static LLVMValueRef 576ac_build_expand(struct ac_llvm_context *ctx, 577 LLVMValueRef value, 578 unsigned src_channels, 579 unsigned dst_channels) 580{ 581 LLVMTypeRef elemtype; 582 LLVMValueRef chan[dst_channels]; 583 584 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) { 585 unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value)); 586 587 if (src_channels == dst_channels && vec_size == dst_channels) 588 return value; 589 590 src_channels = MIN2(src_channels, vec_size); 591 592 for (unsigned i = 0; i < src_channels; i++) 593 chan[i] = ac_llvm_extract_elem(ctx, value, i); 594 595 elemtype = LLVMGetElementType(LLVMTypeOf(value)); 596 } else { 597 if (src_channels) { 598 assert(src_channels == 1); 599 chan[0] = value; 600 } 601 elemtype = LLVMTypeOf(value); 602 } 603 604 for (unsigned i = src_channels; i < dst_channels; i++) 605 chan[i] = LLVMGetUndef(elemtype); 606 607 return ac_build_gather_values(ctx, chan, dst_channels); 608} 609 610/* Expand a scalar or vector to <4 x type> by filling the remaining channels 611 * with undef. Extract at most num_channels components from the input. 
612 */ 613LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, 614 LLVMValueRef value, 615 unsigned num_channels) 616{ 617 return ac_build_expand(ctx, value, num_channels, 4); 618} 619 620LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value) 621{ 622 unsigned type_size = ac_get_type_size(LLVMTypeOf(value)); 623 const char *name; 624 625 if (type_size == 2) 626 name = "llvm.rint.f16"; 627 else if (type_size == 4) 628 name = "llvm.rint.f32"; 629 else 630 name = "llvm.rint.f64"; 631 632 return ac_build_intrinsic(ctx, name, LLVMTypeOf(value), &value, 1, 633 AC_FUNC_ATTR_READNONE); 634} 635 636LLVMValueRef 637ac_build_fdiv(struct ac_llvm_context *ctx, 638 LLVMValueRef num, 639 LLVMValueRef den) 640{ 641 /* If we do (num / den), LLVM >= 7.0 does: 642 * return num * v_rcp_f32(den * (fabs(den) > 0x1.0p+96f ? 0x1.0p-32f : 1.0f)); 643 * 644 * If we do (num * (1 / den)), LLVM does: 645 * return num * v_rcp_f32(den); 646 */ 647 LLVMValueRef one = LLVMConstReal(LLVMTypeOf(num), 1.0); 648 LLVMValueRef rcp = LLVMBuildFDiv(ctx->builder, one, den, ""); 649 LLVMValueRef ret = LLVMBuildFMul(ctx->builder, num, rcp, ""); 650 651 /* Use v_rcp_f32 instead of precise division. */ 652 if (!LLVMIsConstant(ret)) 653 LLVMSetMetadata(ret, ctx->fpmath_md_kind, ctx->fpmath_md_2p5_ulp); 654 return ret; 655} 656 657/* See fast_idiv_by_const.h. */ 658/* Set: increment = util_fast_udiv_info::increment ? 
multiplier : 0; */ 659LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx, 660 LLVMValueRef num, 661 LLVMValueRef multiplier, 662 LLVMValueRef pre_shift, 663 LLVMValueRef post_shift, 664 LLVMValueRef increment) 665{ 666 LLVMBuilderRef builder = ctx->builder; 667 668 num = LLVMBuildLShr(builder, num, pre_shift, ""); 669 num = LLVMBuildMul(builder, 670 LLVMBuildZExt(builder, num, ctx->i64, ""), 671 LLVMBuildZExt(builder, multiplier, ctx->i64, ""), ""); 672 num = LLVMBuildAdd(builder, num, 673 LLVMBuildZExt(builder, increment, ctx->i64, ""), ""); 674 num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), ""); 675 num = LLVMBuildTrunc(builder, num, ctx->i32, ""); 676 return LLVMBuildLShr(builder, num, post_shift, ""); 677} 678 679/* See fast_idiv_by_const.h. */ 680/* If num != UINT_MAX, this more efficient version can be used. */ 681/* Set: increment = util_fast_udiv_info::increment; */ 682LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx, 683 LLVMValueRef num, 684 LLVMValueRef multiplier, 685 LLVMValueRef pre_shift, 686 LLVMValueRef post_shift, 687 LLVMValueRef increment) 688{ 689 LLVMBuilderRef builder = ctx->builder; 690 691 num = LLVMBuildLShr(builder, num, pre_shift, ""); 692 num = LLVMBuildNUWAdd(builder, num, increment, ""); 693 num = LLVMBuildMul(builder, 694 LLVMBuildZExt(builder, num, ctx->i64, ""), 695 LLVMBuildZExt(builder, multiplier, ctx->i64, ""), ""); 696 num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), ""); 697 num = LLVMBuildTrunc(builder, num, ctx->i32, ""); 698 return LLVMBuildLShr(builder, num, post_shift, ""); 699} 700 701/* See fast_idiv_by_const.h. */ 702/* Both operands must fit in 31 bits and the divisor must not be 1. 
*/ 703LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx, 704 LLVMValueRef num, 705 LLVMValueRef multiplier, 706 LLVMValueRef post_shift) 707{ 708 LLVMBuilderRef builder = ctx->builder; 709 710 num = LLVMBuildMul(builder, 711 LLVMBuildZExt(builder, num, ctx->i64, ""), 712 LLVMBuildZExt(builder, multiplier, ctx->i64, ""), ""); 713 num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), ""); 714 num = LLVMBuildTrunc(builder, num, ctx->i32, ""); 715 return LLVMBuildLShr(builder, num, post_shift, ""); 716} 717 718/* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27 719 * of the OpenGL 4.5 (Compatibility Profile) specification, except ma is 720 * already multiplied by two. id is the cube face number. 721 */ 722struct cube_selection_coords { 723 LLVMValueRef stc[2]; 724 LLVMValueRef ma; 725 LLVMValueRef id; 726}; 727 728static void 729build_cube_intrinsic(struct ac_llvm_context *ctx, 730 LLVMValueRef in[3], 731 struct cube_selection_coords *out) 732{ 733 LLVMTypeRef f32 = ctx->f32; 734 735 out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc", 736 f32, in, 3, AC_FUNC_ATTR_READNONE); 737 out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc", 738 f32, in, 3, AC_FUNC_ATTR_READNONE); 739 out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema", 740 f32, in, 3, AC_FUNC_ATTR_READNONE); 741 out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid", 742 f32, in, 3, AC_FUNC_ATTR_READNONE); 743} 744 745/** 746 * Build a manual selection sequence for cube face sc/tc coordinates and 747 * major axis vector (multiplied by 2 for consistency) for the given 748 * vec3 \p coords, for the face implied by \p selcoords. 749 * 750 * For the major axis, we always adjust the sign to be in the direction of 751 * selcoords.ma; i.e., a positive out_ma means that coords is pointed towards 752 * the selcoords major axis. 
753 */ 754static void build_cube_select(struct ac_llvm_context *ctx, 755 const struct cube_selection_coords *selcoords, 756 const LLVMValueRef *coords, 757 LLVMValueRef *out_st, 758 LLVMValueRef *out_ma) 759{ 760 LLVMBuilderRef builder = ctx->builder; 761 LLVMTypeRef f32 = LLVMTypeOf(coords[0]); 762 LLVMValueRef is_ma_positive; 763 LLVMValueRef sgn_ma; 764 LLVMValueRef is_ma_z, is_not_ma_z; 765 LLVMValueRef is_ma_y; 766 LLVMValueRef is_ma_x; 767 LLVMValueRef sgn; 768 LLVMValueRef tmp; 769 770 is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE, 771 selcoords->ma, LLVMConstReal(f32, 0.0), ""); 772 sgn_ma = LLVMBuildSelect(builder, is_ma_positive, 773 LLVMConstReal(f32, 1.0), LLVMConstReal(f32, -1.0), ""); 774 775 is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), ""); 776 is_not_ma_z = LLVMBuildNot(builder, is_ma_z, ""); 777 is_ma_y = LLVMBuildAnd(builder, is_not_ma_z, 778 LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), ""); 779 is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), ""); 780 781 /* Select sc */ 782 tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], ""); 783 sgn = LLVMBuildSelect(builder, is_ma_y, LLVMConstReal(f32, 1.0), 784 LLVMBuildSelect(builder, is_ma_z, sgn_ma, 785 LLVMBuildFNeg(builder, sgn_ma, ""), ""), ""); 786 out_st[0] = LLVMBuildFMul(builder, tmp, sgn, ""); 787 788 /* Select tc */ 789 tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], ""); 790 sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma, 791 LLVMConstReal(f32, -1.0), ""); 792 out_st[1] = LLVMBuildFMul(builder, tmp, sgn, ""); 793 794 /* Select ma */ 795 tmp = LLVMBuildSelect(builder, is_ma_z, coords[2], 796 LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), ""); 797 tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32", 798 ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE); 799 *out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), ""); 800} 801 802void 
803ac_prepare_cube_coords(struct ac_llvm_context *ctx, 804 bool is_deriv, bool is_array, bool is_lod, 805 LLVMValueRef *coords_arg, 806 LLVMValueRef *derivs_arg) 807{ 808 809 LLVMBuilderRef builder = ctx->builder; 810 struct cube_selection_coords selcoords; 811 LLVMValueRef coords[3]; 812 LLVMValueRef invma; 813 814 if (is_array && !is_lod) { 815 LLVMValueRef tmp = ac_build_round(ctx, coords_arg[3]); 816 817 /* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says: 818 * 819 * "For Array forms, the array layer used will be 820 * 821 * max(0, min(d−1, floor(layer+0.5))) 822 * 823 * where d is the depth of the texture array and layer 824 * comes from the component indicated in the tables below. 825 * Workaroudn for an issue where the layer is taken from a 826 * helper invocation which happens to fall on a different 827 * layer due to extrapolation." 828 * 829 * VI and earlier attempt to implement this in hardware by 830 * clamping the value of coords[2] = (8 * layer) + face. 831 * Unfortunately, this means that the we end up with the wrong 832 * face when clamping occurs. 833 * 834 * Clamp the layer earlier to work around the issue. 835 */ 836 if (ctx->chip_class <= VI) { 837 LLVMValueRef ge0; 838 ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, ""); 839 tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, ""); 840 } 841 842 coords_arg[3] = tmp; 843 } 844 845 build_cube_intrinsic(ctx, coords_arg, &selcoords); 846 847 invma = ac_build_intrinsic(ctx, "llvm.fabs.f32", 848 ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE); 849 invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma); 850 851 for (int i = 0; i < 2; ++i) 852 coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, ""); 853 854 coords[2] = selcoords.id; 855 856 if (is_deriv && derivs_arg) { 857 LLVMValueRef derivs[4]; 858 int axis; 859 860 /* Convert cube derivatives to 2D derivatives. 
*/ 861 for (axis = 0; axis < 2; axis++) { 862 LLVMValueRef deriv_st[2]; 863 LLVMValueRef deriv_ma; 864 865 /* Transform the derivative alongside the texture 866 * coordinate. Mathematically, the correct formula is 867 * as follows. Assume we're projecting onto the +Z face 868 * and denote by dx/dh the derivative of the (original) 869 * X texture coordinate with respect to horizontal 870 * window coordinates. The projection onto the +Z face 871 * plane is: 872 * 873 * f(x,z) = x/z 874 * 875 * Then df/dh = df/dx * dx/dh + df/dz * dz/dh 876 * = 1/z * dx/dh - x/z * 1/z * dz/dh. 877 * 878 * This motivatives the implementation below. 879 * 880 * Whether this actually gives the expected results for 881 * apps that might feed in derivatives obtained via 882 * finite differences is anyone's guess. The OpenGL spec 883 * seems awfully quiet about how textureGrad for cube 884 * maps should be handled. 885 */ 886 build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3], 887 deriv_st, &deriv_ma); 888 889 deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, ""); 890 891 for (int i = 0; i < 2; ++i) 892 derivs[axis * 2 + i] = 893 LLVMBuildFSub(builder, 894 LLVMBuildFMul(builder, deriv_st[i], invma, ""), 895 LLVMBuildFMul(builder, deriv_ma, coords[i], ""), ""); 896 } 897 898 memcpy(derivs_arg, derivs, sizeof(derivs)); 899 } 900 901 /* Shift the texture coordinate. This must be applied after the 902 * derivative calculation. 
903 */ 904 for (int i = 0; i < 2; ++i) 905 coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), ""); 906 907 if (is_array) { 908 /* for cube arrays coord.z = coord.w(array_index) * 8 + face */ 909 /* coords_arg.w component - array_index for cube arrays */ 910 coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]); 911 } 912 913 memcpy(coords_arg, coords, sizeof(coords)); 914} 915 916 917LLVMValueRef 918ac_build_fs_interp(struct ac_llvm_context *ctx, 919 LLVMValueRef llvm_chan, 920 LLVMValueRef attr_number, 921 LLVMValueRef params, 922 LLVMValueRef i, 923 LLVMValueRef j) 924{ 925 LLVMValueRef args[5]; 926 LLVMValueRef p1; 927 928 args[0] = i; 929 args[1] = llvm_chan; 930 args[2] = attr_number; 931 args[3] = params; 932 933 p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1", 934 ctx->f32, args, 4, AC_FUNC_ATTR_READNONE); 935 936 args[0] = p1; 937 args[1] = j; 938 args[2] = llvm_chan; 939 args[3] = attr_number; 940 args[4] = params; 941 942 return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2", 943 ctx->f32, args, 5, AC_FUNC_ATTR_READNONE); 944} 945 946LLVMValueRef 947ac_build_fs_interp_f16(struct ac_llvm_context *ctx, 948 LLVMValueRef llvm_chan, 949 LLVMValueRef attr_number, 950 LLVMValueRef params, 951 LLVMValueRef i, 952 LLVMValueRef j) 953{ 954 LLVMValueRef args[6]; 955 LLVMValueRef p1; 956 957 args[0] = i; 958 args[1] = llvm_chan; 959 args[2] = attr_number; 960 args[3] = ctx->i1false; 961 args[4] = params; 962 963 p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", 964 ctx->f32, args, 5, AC_FUNC_ATTR_READNONE); 965 966 args[0] = p1; 967 args[1] = j; 968 args[2] = llvm_chan; 969 args[3] = attr_number; 970 args[4] = ctx->i1false; 971 args[5] = params; 972 973 return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", 974 ctx->f16, args, 6, AC_FUNC_ATTR_READNONE); 975} 976 977LLVMValueRef 978ac_build_fs_interp_mov(struct ac_llvm_context *ctx, 979 LLVMValueRef parameter, 980 LLVMValueRef llvm_chan, 981 
LLVMValueRef attr_number, 982 LLVMValueRef params) 983{ 984 LLVMValueRef args[4]; 985 986 args[0] = parameter; 987 args[1] = llvm_chan; 988 args[2] = attr_number; 989 args[3] = params; 990 991 return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov", 992 ctx->f32, args, 4, AC_FUNC_ATTR_READNONE); 993} 994 995LLVMValueRef 996ac_build_gep_ptr(struct ac_llvm_context *ctx, 997 LLVMValueRef base_ptr, 998 LLVMValueRef index) 999{ 1000 return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, ""); 1001} 1002 1003LLVMValueRef 1004ac_build_gep0(struct ac_llvm_context *ctx, 1005 LLVMValueRef base_ptr, 1006 LLVMValueRef index) 1007{ 1008 LLVMValueRef indices[2] = { 1009 ctx->i32_0, 1010 index, 1011 }; 1012 return LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, ""); 1013} 1014 1015LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr, 1016 LLVMValueRef index) 1017{ 1018 return LLVMBuildPointerCast(ctx->builder, 1019 ac_build_gep0(ctx, ptr, index), 1020 LLVMTypeOf(ptr), ""); 1021} 1022 1023void 1024ac_build_indexed_store(struct ac_llvm_context *ctx, 1025 LLVMValueRef base_ptr, LLVMValueRef index, 1026 LLVMValueRef value) 1027{ 1028 LLVMBuildStore(ctx->builder, value, 1029 ac_build_gep0(ctx, base_ptr, index)); 1030} 1031 1032/** 1033 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad. 1034 * It's equivalent to doing a load from &base_ptr[index]. 1035 * 1036 * \param base_ptr Where the array starts. 1037 * \param index The element index into the array. 1038 * \param uniform Whether the base_ptr and index can be assumed to be 1039 * dynamically uniform (i.e. 
load to an SGPR) 1040 * \param invariant Whether the load is invariant (no other opcodes affect it) 1041 * \param no_unsigned_wraparound 1042 * For all possible re-associations and re-distributions of an expression 1043 * "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs 1044 * without inbounds in base_ptr), this parameter is true if "addr + offset" 1045 * does not result in an unsigned integer wraparound. This is used for 1046 * optimal code generation of 32-bit pointer arithmetic. 1047 * 1048 * For example, a 32-bit immediate offset that causes a 32-bit unsigned 1049 * integer wraparound can't be an imm offset in s_load_dword, because 1050 * the instruction performs "addr + offset" in 64 bits. 1051 * 1052 * Expected usage for bindless textures by chaining GEPs: 1053 * // possible unsigned wraparound, don't use InBounds: 1054 * ptr1 = LLVMBuildGEP(base_ptr, index); 1055 * image = load(ptr1); // becomes "s_load ptr1, 0" 1056 * 1057 * ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize); 1058 * sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds 1059 */ 1060static LLVMValueRef 1061ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, 1062 LLVMValueRef index, bool uniform, bool invariant, 1063 bool no_unsigned_wraparound) 1064{ 1065 LLVMValueRef pointer, result; 1066 LLVMValueRef indices[2] = {ctx->i32_0, index}; 1067 1068 if (no_unsigned_wraparound && 1069 LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT) 1070 pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, indices, 2, ""); 1071 else 1072 pointer = LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, ""); 1073 1074 if (uniform) 1075 LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md); 1076 result = LLVMBuildLoad(ctx->builder, pointer, ""); 1077 if (invariant) 1078 LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md); 1079 return result; 1080} 1081 1082LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, 
LLVMValueRef base_ptr, 1083 LLVMValueRef index) 1084{ 1085 return ac_build_load_custom(ctx, base_ptr, index, false, false, false); 1086} 1087 1088LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, 1089 LLVMValueRef base_ptr, LLVMValueRef index) 1090{ 1091 return ac_build_load_custom(ctx, base_ptr, index, false, true, false); 1092} 1093 1094/* This assumes that there is no unsigned integer wraparound during the address 1095 * computation, excluding all GEPs within base_ptr. */ 1096LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, 1097 LLVMValueRef base_ptr, LLVMValueRef index) 1098{ 1099 return ac_build_load_custom(ctx, base_ptr, index, true, true, true); 1100} 1101 1102/* See ac_build_load_custom() documentation. */ 1103LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx, 1104 LLVMValueRef base_ptr, LLVMValueRef index) 1105{ 1106 return ac_build_load_custom(ctx, base_ptr, index, true, true, false); 1107} 1108 1109static void 1110ac_build_buffer_store_common(struct ac_llvm_context *ctx, 1111 LLVMValueRef rsrc, 1112 LLVMValueRef data, 1113 LLVMValueRef vindex, 1114 LLVMValueRef voffset, 1115 unsigned num_channels, 1116 bool glc, 1117 bool slc, 1118 bool writeonly_memory, 1119 bool use_format) 1120{ 1121 LLVMValueRef args[] = { 1122 data, 1123 LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""), 1124 vindex ? 
vindex : ctx->i32_0, 1125 voffset, 1126 LLVMConstInt(ctx->i1, glc, 0), 1127 LLVMConstInt(ctx->i1, slc, 0) 1128 }; 1129 unsigned func = CLAMP(num_channels, 1, 3) - 1; 1130 1131 const char *type_names[] = {"f32", "v2f32", "v4f32"}; 1132 char name[256]; 1133 1134 if (use_format) { 1135 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.format.%s", 1136 type_names[func]); 1137 } else { 1138 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.%s", 1139 type_names[func]); 1140 } 1141 1142 ac_build_intrinsic(ctx, name, ctx->voidt, args, ARRAY_SIZE(args), 1143 ac_get_store_intr_attribs(writeonly_memory)); 1144} 1145 1146static void 1147ac_build_llvm8_buffer_store_common(struct ac_llvm_context *ctx, 1148 LLVMValueRef rsrc, 1149 LLVMValueRef data, 1150 LLVMValueRef vindex, 1151 LLVMValueRef voffset, 1152 LLVMValueRef soffset, 1153 unsigned num_channels, 1154 LLVMTypeRef return_channel_type, 1155 bool glc, 1156 bool slc, 1157 bool writeonly_memory, 1158 bool use_format, 1159 bool structurized) 1160{ 1161 LLVMValueRef args[6]; 1162 int idx = 0; 1163 args[idx++] = data; 1164 args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""); 1165 if (structurized) 1166 args[idx++] = vindex ? vindex : ctx->i32_0; 1167 args[idx++] = voffset ? voffset : ctx->i32_0; 1168 args[idx++] = soffset ? soffset : ctx->i32_0; 1169 args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0); 1170 unsigned func = num_channels == 3 ? 4 : num_channels; 1171 const char *indexing_kind = structurized ? "struct" : "raw"; 1172 char name[256], type_name[8]; 1173 1174 LLVMTypeRef type = func > 1 ? 
LLVMVectorType(return_channel_type, func) : return_channel_type; 1175 ac_build_type_name_for_intr(type, type_name, sizeof(type_name)); 1176 1177 if (use_format) { 1178 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s", 1179 indexing_kind, type_name); 1180 } else { 1181 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s", 1182 indexing_kind, type_name); 1183 } 1184 1185 ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, 1186 ac_get_store_intr_attribs(writeonly_memory)); 1187} 1188 1189void 1190ac_build_buffer_store_format(struct ac_llvm_context *ctx, 1191 LLVMValueRef rsrc, 1192 LLVMValueRef data, 1193 LLVMValueRef vindex, 1194 LLVMValueRef voffset, 1195 unsigned num_channels, 1196 bool glc, 1197 bool writeonly_memory) 1198{ 1199 if (HAVE_LLVM >= 0x800) { 1200 ac_build_llvm8_buffer_store_common(ctx, rsrc, data, vindex, 1201 voffset, NULL, num_channels, 1202 ctx->f32, glc, false, 1203 writeonly_memory, true, true); 1204 } else { 1205 ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset, 1206 num_channels, glc, false, 1207 writeonly_memory, true); 1208 } 1209} 1210 1211/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4. 1212 * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2), 1213 * or v4i32 (num_channels=3,4). 1214 */ 1215void 1216ac_build_buffer_store_dword(struct ac_llvm_context *ctx, 1217 LLVMValueRef rsrc, 1218 LLVMValueRef vdata, 1219 unsigned num_channels, 1220 LLVMValueRef voffset, 1221 LLVMValueRef soffset, 1222 unsigned inst_offset, 1223 bool glc, 1224 bool slc, 1225 bool writeonly_memory, 1226 bool swizzle_enable_hint) 1227{ 1228 /* Split 3 channel stores, becase LLVM doesn't support 3-channel 1229 * intrinsics. 
*/ 1230 if (num_channels == 3) { 1231 LLVMValueRef v[3], v01; 1232 1233 for (int i = 0; i < 3; i++) { 1234 v[i] = LLVMBuildExtractElement(ctx->builder, vdata, 1235 LLVMConstInt(ctx->i32, i, 0), ""); 1236 } 1237 v01 = ac_build_gather_values(ctx, v, 2); 1238 1239 ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset, 1240 soffset, inst_offset, glc, slc, 1241 writeonly_memory, swizzle_enable_hint); 1242 ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset, 1243 soffset, inst_offset + 8, 1244 glc, slc, 1245 writeonly_memory, swizzle_enable_hint); 1246 return; 1247 } 1248 1249 /* SWIZZLE_ENABLE requires that soffset isn't folded into voffset 1250 * (voffset is swizzled, but soffset isn't swizzled). 1251 * llvm.amdgcn.buffer.store doesn't have a separate soffset parameter. 1252 */ 1253 if (!swizzle_enable_hint) { 1254 LLVMValueRef offset = soffset; 1255 1256 if (inst_offset) 1257 offset = LLVMBuildAdd(ctx->builder, offset, 1258 LLVMConstInt(ctx->i32, inst_offset, 0), ""); 1259 1260 if (HAVE_LLVM >= 0x800) { 1261 ac_build_llvm8_buffer_store_common(ctx, rsrc, 1262 ac_to_float(ctx, vdata), 1263 ctx->i32_0, 1264 voffset, offset, 1265 num_channels, 1266 ctx->f32, 1267 glc, slc, 1268 writeonly_memory, 1269 false, false); 1270 } else { 1271 if (voffset) 1272 offset = LLVMBuildAdd(ctx->builder, offset, voffset, ""); 1273 1274 ac_build_buffer_store_common(ctx, rsrc, 1275 ac_to_float(ctx, vdata), 1276 ctx->i32_0, offset, 1277 num_channels, glc, slc, 1278 writeonly_memory, false); 1279 } 1280 return; 1281 } 1282 1283 static const unsigned dfmts[] = { 1284 V_008F0C_BUF_DATA_FORMAT_32, 1285 V_008F0C_BUF_DATA_FORMAT_32_32, 1286 V_008F0C_BUF_DATA_FORMAT_32_32_32, 1287 V_008F0C_BUF_DATA_FORMAT_32_32_32_32 1288 }; 1289 unsigned dfmt = dfmts[num_channels - 1]; 1290 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; 1291 LLVMValueRef immoffset = LLVMConstInt(ctx->i32, inst_offset, 0); 1292 1293 ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, 1294 immoffset, num_channels, 
dfmt, nfmt, glc, 1295 slc, writeonly_memory); 1296} 1297 1298static LLVMValueRef 1299ac_build_buffer_load_common(struct ac_llvm_context *ctx, 1300 LLVMValueRef rsrc, 1301 LLVMValueRef vindex, 1302 LLVMValueRef voffset, 1303 unsigned num_channels, 1304 bool glc, 1305 bool slc, 1306 bool can_speculate, 1307 bool use_format) 1308{ 1309 LLVMValueRef args[] = { 1310 LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""), 1311 vindex ? vindex : ctx->i32_0, 1312 voffset, 1313 LLVMConstInt(ctx->i1, glc, 0), 1314 LLVMConstInt(ctx->i1, slc, 0) 1315 }; 1316 unsigned func = CLAMP(num_channels, 1, 3) - 1; 1317 1318 LLVMTypeRef types[] = {ctx->f32, ctx->v2f32, ctx->v4f32}; 1319 const char *type_names[] = {"f32", "v2f32", "v4f32"}; 1320 char name[256]; 1321 1322 if (use_format) { 1323 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.format.%s", 1324 type_names[func]); 1325 } else { 1326 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s", 1327 type_names[func]); 1328 } 1329 1330 return ac_build_intrinsic(ctx, name, types[func], args, 1331 ARRAY_SIZE(args), 1332 ac_get_load_intr_attribs(can_speculate)); 1333} 1334 1335static LLVMValueRef 1336ac_build_llvm8_buffer_load_common(struct ac_llvm_context *ctx, 1337 LLVMValueRef rsrc, 1338 LLVMValueRef vindex, 1339 LLVMValueRef voffset, 1340 LLVMValueRef soffset, 1341 unsigned num_channels, 1342 LLVMTypeRef channel_type, 1343 bool glc, 1344 bool slc, 1345 bool can_speculate, 1346 bool use_format, 1347 bool structurized) 1348{ 1349 LLVMValueRef args[5]; 1350 int idx = 0; 1351 args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""); 1352 if (structurized) 1353 args[idx++] = vindex ? vindex : ctx->i32_0; 1354 args[idx++] = voffset ? voffset : ctx->i32_0; 1355 args[idx++] = soffset ? soffset : ctx->i32_0; 1356 args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0); 1357 unsigned func = num_channels == 3 ? 4 : num_channels; 1358 const char *indexing_kind = structurized ? 
"struct" : "raw"; 1359 char name[256], type_name[8]; 1360 1361 LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type; 1362 ac_build_type_name_for_intr(type, type_name, sizeof(type_name)); 1363 1364 if (use_format) { 1365 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s", 1366 indexing_kind, type_name); 1367 } else { 1368 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s", 1369 indexing_kind, type_name); 1370 } 1371 1372 return ac_build_intrinsic(ctx, name, type, args, idx, 1373 ac_get_load_intr_attribs(can_speculate)); 1374} 1375 1376LLVMValueRef 1377ac_build_buffer_load(struct ac_llvm_context *ctx, 1378 LLVMValueRef rsrc, 1379 int num_channels, 1380 LLVMValueRef vindex, 1381 LLVMValueRef voffset, 1382 LLVMValueRef soffset, 1383 unsigned inst_offset, 1384 unsigned glc, 1385 unsigned slc, 1386 bool can_speculate, 1387 bool allow_smem) 1388{ 1389 LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0); 1390 if (voffset) 1391 offset = LLVMBuildAdd(ctx->builder, offset, voffset, ""); 1392 if (soffset) 1393 offset = LLVMBuildAdd(ctx->builder, offset, soffset, ""); 1394 1395 if (allow_smem && !slc && 1396 (!glc || (HAVE_LLVM >= 0x0800 && ctx->chip_class >= VI))) { 1397 assert(vindex == NULL); 1398 1399 LLVMValueRef result[8]; 1400 1401 for (int i = 0; i < num_channels; i++) { 1402 if (i) { 1403 offset = LLVMBuildAdd(ctx->builder, offset, 1404 LLVMConstInt(ctx->i32, 4, 0), ""); 1405 } 1406 const char *intrname = 1407 HAVE_LLVM >= 0x0800 ? "llvm.amdgcn.s.buffer.load.f32" 1408 : "llvm.SI.load.const.v4i32"; 1409 unsigned num_args = HAVE_LLVM >= 0x0800 ? 3 : 2; 1410 LLVMValueRef args[3] = { 1411 rsrc, 1412 offset, 1413 glc ? ctx->i32_1 : ctx->i32_0, 1414 }; 1415 result[i] = ac_build_intrinsic(ctx, intrname, 1416 ctx->f32, args, num_args, 1417 AC_FUNC_ATTR_READNONE | 1418 (HAVE_LLVM < 0x0800 ? 
AC_FUNC_ATTR_LEGACY : 0)); 1419 } 1420 if (num_channels == 1) 1421 return result[0]; 1422 1423 if (num_channels == 3) 1424 result[num_channels++] = LLVMGetUndef(ctx->f32); 1425 return ac_build_gather_values(ctx, result, num_channels); 1426 } 1427 1428 if (HAVE_LLVM >= 0x0800) { 1429 return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex, 1430 offset, ctx->i32_0, 1431 num_channels, ctx->f32, 1432 glc, slc, 1433 can_speculate, false, 1434 false); 1435 } 1436 1437 return ac_build_buffer_load_common(ctx, rsrc, vindex, offset, 1438 num_channels, glc, slc, 1439 can_speculate, false); 1440} 1441 1442LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, 1443 LLVMValueRef rsrc, 1444 LLVMValueRef vindex, 1445 LLVMValueRef voffset, 1446 unsigned num_channels, 1447 bool glc, 1448 bool can_speculate) 1449{ 1450 if (HAVE_LLVM >= 0x800) { 1451 return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0, 1452 num_channels, ctx->f32, 1453 glc, false, 1454 can_speculate, true, true); 1455 } 1456 return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, 1457 num_channels, glc, false, 1458 can_speculate, true); 1459} 1460 1461LLVMValueRef ac_build_buffer_load_format_gfx9_safe(struct ac_llvm_context *ctx, 1462 LLVMValueRef rsrc, 1463 LLVMValueRef vindex, 1464 LLVMValueRef voffset, 1465 unsigned num_channels, 1466 bool glc, 1467 bool can_speculate) 1468{ 1469 if (HAVE_LLVM >= 0x800) { 1470 return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0, 1471 num_channels, ctx->f32, 1472 glc, false, 1473 can_speculate, true, true); 1474 } 1475 1476 LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 2, 0), ""); 1477 LLVMValueRef stride = LLVMBuildExtractElement(ctx->builder, rsrc, ctx->i32_1, ""); 1478 stride = LLVMBuildLShr(ctx->builder, stride, LLVMConstInt(ctx->i32, 16, 0), ""); 1479 1480 LLVMValueRef new_elem_count = LLVMBuildSelect(ctx->builder, 1481 LLVMBuildICmp(ctx->builder, 
LLVMIntUGT, elem_count, stride, ""), 1482 elem_count, stride, ""); 1483 1484 LLVMValueRef new_rsrc = LLVMBuildInsertElement(ctx->builder, rsrc, new_elem_count, 1485 LLVMConstInt(ctx->i32, 2, 0), ""); 1486 1487 return ac_build_buffer_load_common(ctx, new_rsrc, vindex, voffset, 1488 num_channels, glc, false, 1489 can_speculate, true); 1490} 1491 1492static LLVMValueRef 1493ac_build_llvm8_tbuffer_load(struct ac_llvm_context *ctx, 1494 LLVMValueRef rsrc, 1495 LLVMValueRef vindex, 1496 LLVMValueRef voffset, 1497 LLVMValueRef soffset, 1498 unsigned num_channels, 1499 unsigned dfmt, 1500 unsigned nfmt, 1501 bool glc, 1502 bool slc, 1503 bool can_speculate, 1504 bool structurized) 1505{ 1506 LLVMValueRef args[6]; 1507 int idx = 0; 1508 args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""); 1509 if (structurized) 1510 args[idx++] = vindex ? vindex : ctx->i32_0; 1511 args[idx++] = voffset ? voffset : ctx->i32_0; 1512 args[idx++] = soffset ? soffset : ctx->i32_0; 1513 args[idx++] = LLVMConstInt(ctx->i32, dfmt | (nfmt << 4), 0); 1514 args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0); 1515 unsigned func = num_channels == 3 ? 4 : num_channels; 1516 const char *indexing_kind = structurized ? "struct" : "raw"; 1517 char name[256], type_name[8]; 1518 1519 LLVMTypeRef type = func > 1 ? 
LLVMVectorType(ctx->i32, func) : ctx->i32; 1520 ac_build_type_name_for_intr(type, type_name, sizeof(type_name)); 1521 1522 snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s", 1523 indexing_kind, type_name); 1524 1525 return ac_build_intrinsic(ctx, name, type, args, idx, 1526 ac_get_load_intr_attribs(can_speculate)); 1527} 1528 1529static LLVMValueRef 1530ac_build_tbuffer_load(struct ac_llvm_context *ctx, 1531 LLVMValueRef rsrc, 1532 LLVMValueRef vindex, 1533 LLVMValueRef voffset, 1534 LLVMValueRef soffset, 1535 LLVMValueRef immoffset, 1536 unsigned num_channels, 1537 unsigned dfmt, 1538 unsigned nfmt, 1539 bool glc, 1540 bool slc, 1541 bool can_speculate, 1542 bool structurized) /* only matters for LLVM 8+ */ 1543{ 1544 if (HAVE_LLVM >= 0x800) { 1545 voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, ""); 1546 1547 return ac_build_llvm8_tbuffer_load(ctx, rsrc, vindex, voffset, 1548 soffset, num_channels, 1549 dfmt, nfmt, glc, slc, 1550 can_speculate, structurized); 1551 } 1552 1553 LLVMValueRef args[] = { 1554 rsrc, 1555 vindex ? 
vindex : ctx->i32_0, 1556 voffset, 1557 soffset, 1558 immoffset, 1559 LLVMConstInt(ctx->i32, dfmt, false), 1560 LLVMConstInt(ctx->i32, nfmt, false), 1561 LLVMConstInt(ctx->i1, glc, false), 1562 LLVMConstInt(ctx->i1, slc, false), 1563 }; 1564 unsigned func = CLAMP(num_channels, 1, 3) - 1; 1565 LLVMTypeRef types[] = {ctx->i32, ctx->v2i32, ctx->v4i32}; 1566 const char *type_names[] = {"i32", "v2i32", "v4i32"}; 1567 char name[256]; 1568 1569 snprintf(name, sizeof(name), "llvm.amdgcn.tbuffer.load.%s", 1570 type_names[func]); 1571 1572 return ac_build_intrinsic(ctx, name, types[func], args, 9, 1573 ac_get_load_intr_attribs(can_speculate)); 1574} 1575 1576LLVMValueRef 1577ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx, 1578 LLVMValueRef rsrc, 1579 LLVMValueRef vindex, 1580 LLVMValueRef voffset, 1581 LLVMValueRef soffset, 1582 LLVMValueRef immoffset, 1583 unsigned num_channels, 1584 unsigned dfmt, 1585 unsigned nfmt, 1586 bool glc, 1587 bool slc, 1588 bool can_speculate) 1589{ 1590 return ac_build_tbuffer_load(ctx, rsrc, vindex, voffset, soffset, 1591 immoffset, num_channels, dfmt, nfmt, glc, 1592 slc, can_speculate, true); 1593} 1594 1595LLVMValueRef 1596ac_build_raw_tbuffer_load(struct ac_llvm_context *ctx, 1597 LLVMValueRef rsrc, 1598 LLVMValueRef voffset, 1599 LLVMValueRef soffset, 1600 LLVMValueRef immoffset, 1601 unsigned num_channels, 1602 unsigned dfmt, 1603 unsigned nfmt, 1604 bool glc, 1605 bool slc, 1606 bool can_speculate) 1607{ 1608 return ac_build_tbuffer_load(ctx, rsrc, NULL, voffset, soffset, 1609 immoffset, num_channels, dfmt, nfmt, glc, 1610 slc, can_speculate, false); 1611} 1612 1613LLVMValueRef 1614ac_build_tbuffer_load_short(struct ac_llvm_context *ctx, 1615 LLVMValueRef rsrc, 1616 LLVMValueRef voffset, 1617 LLVMValueRef soffset, 1618 LLVMValueRef immoffset, 1619 bool glc) 1620{ 1621 LLVMValueRef res; 1622 1623 if (HAVE_LLVM >= 0x900) { 1624 voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, ""); 1625 1626 /* LLVM 9+ supports i8/i16 
with struct/raw intrinsics. */ 1627 res = ac_build_llvm8_buffer_load_common(ctx, rsrc, NULL, 1628 voffset, soffset, 1629 1, ctx->i16, glc, false, 1630 false, false, false); 1631 } else { 1632 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16; 1633 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; 1634 1635 res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset, 1636 immoffset, 1, dfmt, nfmt, glc, false, 1637 false); 1638 1639 res = LLVMBuildTrunc(ctx->builder, res, ctx->i16, ""); 1640 } 1641 1642 return res; 1643} 1644 1645LLVMValueRef 1646ac_build_tbuffer_load_byte(struct ac_llvm_context *ctx, 1647 LLVMValueRef rsrc, 1648 LLVMValueRef voffset, 1649 LLVMValueRef soffset, 1650 LLVMValueRef immoffset, 1651 bool glc) 1652{ 1653 LLVMValueRef res; 1654 1655 if (HAVE_LLVM >= 0x900) { 1656 voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, ""); 1657 1658 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */ 1659 res = ac_build_llvm8_buffer_load_common(ctx, rsrc, NULL, 1660 voffset, soffset, 1661 1, ctx->i8, glc, false, 1662 false, false, false); 1663 } else { 1664 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8; 1665 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; 1666 1667 res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset, 1668 immoffset, 1, dfmt, nfmt, glc, false, 1669 false); 1670 1671 res = LLVMBuildTrunc(ctx->builder, res, ctx->i8, ""); 1672 } 1673 1674 return res; 1675} 1676static void 1677ac_build_llvm8_tbuffer_store(struct ac_llvm_context *ctx, 1678 LLVMValueRef rsrc, 1679 LLVMValueRef vdata, 1680 LLVMValueRef vindex, 1681 LLVMValueRef voffset, 1682 LLVMValueRef soffset, 1683 unsigned num_channels, 1684 unsigned dfmt, 1685 unsigned nfmt, 1686 bool glc, 1687 bool slc, 1688 bool writeonly_memory, 1689 bool structurized) 1690{ 1691 LLVMValueRef args[7]; 1692 int idx = 0; 1693 args[idx++] = vdata; 1694 args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""); 1695 if (structurized) 1696 args[idx++] = vindex ? 
vindex : ctx->i32_0; 1697 args[idx++] = voffset ? voffset : ctx->i32_0; 1698 args[idx++] = soffset ? soffset : ctx->i32_0; 1699 args[idx++] = LLVMConstInt(ctx->i32, dfmt | (nfmt << 4), 0); 1700 args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0); 1701 unsigned func = num_channels == 3 ? 4 : num_channels; 1702 const char *indexing_kind = structurized ? "struct" : "raw"; 1703 char name[256], type_name[8]; 1704 1705 LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32; 1706 ac_build_type_name_for_intr(type, type_name, sizeof(type_name)); 1707 1708 snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.store.%s", 1709 indexing_kind, type_name); 1710 1711 ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, 1712 ac_get_store_intr_attribs(writeonly_memory)); 1713} 1714 1715static void 1716ac_build_tbuffer_store(struct ac_llvm_context *ctx, 1717 LLVMValueRef rsrc, 1718 LLVMValueRef vdata, 1719 LLVMValueRef vindex, 1720 LLVMValueRef voffset, 1721 LLVMValueRef soffset, 1722 LLVMValueRef immoffset, 1723 unsigned num_channels, 1724 unsigned dfmt, 1725 unsigned nfmt, 1726 bool glc, 1727 bool slc, 1728 bool writeonly_memory, 1729 bool structurized) /* only matters for LLVM 8+ */ 1730{ 1731 if (HAVE_LLVM >= 0x800) { 1732 voffset = LLVMBuildAdd(ctx->builder, 1733 voffset ? voffset : ctx->i32_0, 1734 immoffset, ""); 1735 1736 ac_build_llvm8_tbuffer_store(ctx, rsrc, vdata, vindex, voffset, 1737 soffset, num_channels, dfmt, nfmt, 1738 glc, slc, writeonly_memory, 1739 structurized); 1740 } else { 1741 LLVMValueRef params[] = { 1742 vdata, 1743 rsrc, 1744 vindex ? vindex : ctx->i32_0, 1745 voffset ? voffset : ctx->i32_0, 1746 soffset ? 
soffset : ctx->i32_0, 1747 immoffset, 1748 LLVMConstInt(ctx->i32, dfmt, false), 1749 LLVMConstInt(ctx->i32, nfmt, false), 1750 LLVMConstInt(ctx->i1, glc, false), 1751 LLVMConstInt(ctx->i1, slc, false), 1752 }; 1753 unsigned func = CLAMP(num_channels, 1, 3) - 1; 1754 const char *type_names[] = {"i32", "v2i32", "v4i32"}; 1755 char name[256]; 1756 1757 snprintf(name, sizeof(name), "llvm.amdgcn.tbuffer.store.%s", 1758 type_names[func]); 1759 1760 ac_build_intrinsic(ctx, name, ctx->voidt, params, 10, 1761 ac_get_store_intr_attribs(writeonly_memory)); 1762 } 1763} 1764 1765void 1766ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx, 1767 LLVMValueRef rsrc, 1768 LLVMValueRef vdata, 1769 LLVMValueRef vindex, 1770 LLVMValueRef voffset, 1771 LLVMValueRef soffset, 1772 LLVMValueRef immoffset, 1773 unsigned num_channels, 1774 unsigned dfmt, 1775 unsigned nfmt, 1776 bool glc, 1777 bool slc, 1778 bool writeonly_memory) 1779{ 1780 ac_build_tbuffer_store(ctx, rsrc, vdata, vindex, voffset, soffset, 1781 immoffset, num_channels, dfmt, nfmt, glc, slc, 1782 writeonly_memory, true); 1783} 1784 1785void 1786ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx, 1787 LLVMValueRef rsrc, 1788 LLVMValueRef vdata, 1789 LLVMValueRef voffset, 1790 LLVMValueRef soffset, 1791 LLVMValueRef immoffset, 1792 unsigned num_channels, 1793 unsigned dfmt, 1794 unsigned nfmt, 1795 bool glc, 1796 bool slc, 1797 bool writeonly_memory) 1798{ 1799 ac_build_tbuffer_store(ctx, rsrc, vdata, NULL, voffset, soffset, 1800 immoffset, num_channels, dfmt, nfmt, glc, slc, 1801 writeonly_memory, false); 1802} 1803 1804void 1805ac_build_tbuffer_store_short(struct ac_llvm_context *ctx, 1806 LLVMValueRef rsrc, 1807 LLVMValueRef vdata, 1808 LLVMValueRef voffset, 1809 LLVMValueRef soffset, 1810 bool glc, 1811 bool writeonly_memory) 1812{ 1813 vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, ""); 1814 1815 if (HAVE_LLVM >= 0x900) { 1816 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. 
*/ 1817 ac_build_llvm8_buffer_store_common(ctx, rsrc, vdata, NULL, 1818 voffset, soffset, 1, 1819 ctx->i16, glc, false, 1820 writeonly_memory, false, 1821 false); 1822 } else { 1823 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16; 1824 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; 1825 1826 vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, ""); 1827 1828 ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, 1829 ctx->i32_0, 1, dfmt, nfmt, glc, false, 1830 writeonly_memory); 1831 } 1832} 1833 1834void 1835ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx, 1836 LLVMValueRef rsrc, 1837 LLVMValueRef vdata, 1838 LLVMValueRef voffset, 1839 LLVMValueRef soffset, 1840 bool glc, 1841 bool writeonly_memory) 1842{ 1843 vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, ""); 1844 1845 if (HAVE_LLVM >= 0x900) { 1846 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */ 1847 ac_build_llvm8_buffer_store_common(ctx, rsrc, vdata, NULL, 1848 voffset, soffset, 1, 1849 ctx->i8, glc, false, 1850 writeonly_memory, false, 1851 false); 1852 } else { 1853 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8; 1854 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; 1855 1856 vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, ""); 1857 1858 ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, 1859 ctx->i32_0, 1, dfmt, nfmt, glc, false, 1860 writeonly_memory); 1861 } 1862} 1863/** 1864 * Set range metadata on an instruction. This can only be used on load and 1865 * call instructions. If you know an instruction can only produce the values 1866 * 0, 1, 2, you would do set_range_metadata(value, 0, 3); 1867 * \p lo is the minimum value inclusive. 1868 * \p hi is the maximum value exclusive. 
1869 */ 1870static void set_range_metadata(struct ac_llvm_context *ctx, 1871 LLVMValueRef value, unsigned lo, unsigned hi) 1872{ 1873 LLVMValueRef range_md, md_args[2]; 1874 LLVMTypeRef type = LLVMTypeOf(value); 1875 LLVMContextRef context = LLVMGetTypeContext(type); 1876 1877 md_args[0] = LLVMConstInt(type, lo, false); 1878 md_args[1] = LLVMConstInt(type, hi, false); 1879 range_md = LLVMMDNodeInContext(context, md_args, 2); 1880 LLVMSetMetadata(value, ctx->range_md_kind, range_md); 1881} 1882 1883LLVMValueRef 1884ac_get_thread_id(struct ac_llvm_context *ctx) 1885{ 1886 LLVMValueRef tid; 1887 1888 LLVMValueRef tid_args[2]; 1889 tid_args[0] = LLVMConstInt(ctx->i32, 0xffffffff, false); 1890 tid_args[1] = ctx->i32_0; 1891 tid_args[1] = ac_build_intrinsic(ctx, 1892 "llvm.amdgcn.mbcnt.lo", ctx->i32, 1893 tid_args, 2, AC_FUNC_ATTR_READNONE); 1894 1895 tid = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", 1896 ctx->i32, tid_args, 1897 2, AC_FUNC_ATTR_READNONE); 1898 set_range_metadata(ctx, tid, 0, 64); 1899 return tid; 1900} 1901 1902/* 1903 * SI implements derivatives using the local data store (LDS) 1904 * All writes to the LDS happen in all executing threads at 1905 * the same time. TID is the Thread ID for the current 1906 * thread and is a value between 0 and 63, representing 1907 * the thread's position in the wavefront. 1908 * 1909 * For the pixel shader threads are grouped into quads of four pixels. 1910 * The TIDs of the pixels of a quad are: 1911 * 1912 * +------+------+ 1913 * |4n + 0|4n + 1| 1914 * +------+------+ 1915 * |4n + 2|4n + 3| 1916 * +------+------+ 1917 * 1918 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel 1919 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of 1920 * the current pixel's column, and masking with 0xfffffffe yields the TID 1921 * of the left pixel of the current pixel's row. 
1922 * 1923 * Adding 1 yields the TID of the pixel to the right of the left pixel, and 1924 * adding 2 yields the TID of the pixel below the top pixel. 1925 */ 1926LLVMValueRef 1927ac_build_ddxy(struct ac_llvm_context *ctx, 1928 uint32_t mask, 1929 int idx, 1930 LLVMValueRef val) 1931{ 1932 unsigned tl_lanes[4], trbl_lanes[4]; 1933 char name[32], type[8]; 1934 LLVMValueRef tl, trbl; 1935 LLVMTypeRef result_type; 1936 LLVMValueRef result; 1937 1938 result_type = ac_to_float_type(ctx, LLVMTypeOf(val)); 1939 1940 if (result_type == ctx->f16) 1941 val = LLVMBuildZExt(ctx->builder, val, ctx->i32, ""); 1942 1943 for (unsigned i = 0; i < 4; ++i) { 1944 tl_lanes[i] = i & mask; 1945 trbl_lanes[i] = (i & mask) + idx; 1946 } 1947 1948 tl = ac_build_quad_swizzle(ctx, val, 1949 tl_lanes[0], tl_lanes[1], 1950 tl_lanes[2], tl_lanes[3]); 1951 trbl = ac_build_quad_swizzle(ctx, val, 1952 trbl_lanes[0], trbl_lanes[1], 1953 trbl_lanes[2], trbl_lanes[3]); 1954 1955 if (result_type == ctx->f16) { 1956 tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, ""); 1957 trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, ""); 1958 } 1959 1960 tl = LLVMBuildBitCast(ctx->builder, tl, result_type, ""); 1961 trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, ""); 1962 result = LLVMBuildFSub(ctx->builder, trbl, tl, ""); 1963 1964 ac_build_type_name_for_intr(result_type, type, sizeof(type)); 1965 snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type); 1966 1967 return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0); 1968} 1969 1970void 1971ac_build_sendmsg(struct ac_llvm_context *ctx, 1972 uint32_t msg, 1973 LLVMValueRef wave_id) 1974{ 1975 LLVMValueRef args[2]; 1976 args[0] = LLVMConstInt(ctx->i32, msg, false); 1977 args[1] = wave_id; 1978 ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0); 1979} 1980 1981LLVMValueRef 1982ac_build_imsb(struct ac_llvm_context *ctx, 1983 LLVMValueRef arg, 1984 LLVMTypeRef dst_type) 1985{ 1986 LLVMValueRef msb = 
ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32", 1987 dst_type, &arg, 1, 1988 AC_FUNC_ATTR_READNONE); 1989 1990 /* The HW returns the last bit index from MSB, but NIR/TGSI wants 1991 * the index from LSB. Invert it by doing "31 - msb". */ 1992 msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false), 1993 msb, ""); 1994 1995 LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true); 1996 LLVMValueRef cond = LLVMBuildOr(ctx->builder, 1997 LLVMBuildICmp(ctx->builder, LLVMIntEQ, 1998 arg, ctx->i32_0, ""), 1999 LLVMBuildICmp(ctx->builder, LLVMIntEQ, 2000 arg, all_ones, ""), ""); 2001 2002 return LLVMBuildSelect(ctx->builder, cond, all_ones, msb, ""); 2003} 2004 2005LLVMValueRef 2006ac_build_umsb(struct ac_llvm_context *ctx, 2007 LLVMValueRef arg, 2008 LLVMTypeRef dst_type) 2009{ 2010 const char *intrin_name; 2011 LLVMTypeRef type; 2012 LLVMValueRef highest_bit; 2013 LLVMValueRef zero; 2014 unsigned bitsize; 2015 2016 bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg)); 2017 switch (bitsize) { 2018 case 64: 2019 intrin_name = "llvm.ctlz.i64"; 2020 type = ctx->i64; 2021 highest_bit = LLVMConstInt(ctx->i64, 63, false); 2022 zero = ctx->i64_0; 2023 break; 2024 case 32: 2025 intrin_name = "llvm.ctlz.i32"; 2026 type = ctx->i32; 2027 highest_bit = LLVMConstInt(ctx->i32, 31, false); 2028 zero = ctx->i32_0; 2029 break; 2030 case 16: 2031 intrin_name = "llvm.ctlz.i16"; 2032 type = ctx->i16; 2033 highest_bit = LLVMConstInt(ctx->i16, 15, false); 2034 zero = ctx->i16_0; 2035 break; 2036 case 8: 2037 intrin_name = "llvm.ctlz.i8"; 2038 type = ctx->i8; 2039 highest_bit = LLVMConstInt(ctx->i8, 7, false); 2040 zero = ctx->i8_0; 2041 break; 2042 default: 2043 unreachable(!"invalid bitsize"); 2044 break; 2045 } 2046 2047 LLVMValueRef params[2] = { 2048 arg, 2049 ctx->i1true, 2050 }; 2051 2052 LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type, 2053 params, 2, 2054 AC_FUNC_ATTR_READNONE); 2055 2056 /* The HW returns the last bit index from MSB, but TGSI/NIR wants 2057 * the 
index from LSB. Invert it by doing "31 - msb". */ 2058 msb = LLVMBuildSub(ctx->builder, highest_bit, msb, ""); 2059 2060 if (bitsize == 64) { 2061 msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, ""); 2062 } else if (bitsize < 32) { 2063 msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, ""); 2064 } 2065 2066 /* check for zero */ 2067 return LLVMBuildSelect(ctx->builder, 2068 LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""), 2069 LLVMConstInt(ctx->i32, -1, true), msb, ""); 2070} 2071 2072LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, 2073 LLVMValueRef b) 2074{ 2075 char name[64]; 2076 snprintf(name, sizeof(name), "llvm.minnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a))); 2077 LLVMValueRef args[2] = {a, b}; 2078 return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, 2079 AC_FUNC_ATTR_READNONE); 2080} 2081 2082LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, 2083 LLVMValueRef b) 2084{ 2085 char name[64]; 2086 snprintf(name, sizeof(name), "llvm.maxnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a))); 2087 LLVMValueRef args[2] = {a, b}; 2088 return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, 2089 AC_FUNC_ATTR_READNONE); 2090} 2091 2092LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, 2093 LLVMValueRef b) 2094{ 2095 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, ""); 2096 return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); 2097} 2098 2099LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, 2100 LLVMValueRef b) 2101{ 2102 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, ""); 2103 return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); 2104} 2105 2106LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, 2107 LLVMValueRef b) 2108{ 2109 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, ""); 2110 return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); 2111} 2112 2113LLVMValueRef ac_build_umax(struct 
ac_llvm_context *ctx, LLVMValueRef a, 2114 LLVMValueRef b) 2115{ 2116 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, a, b, ""); 2117 return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); 2118} 2119 2120LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value) 2121{ 2122 LLVMTypeRef t = LLVMTypeOf(value); 2123 return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 0.0)), 2124 LLVMConstReal(t, 1.0)); 2125} 2126 2127void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a) 2128{ 2129 LLVMValueRef args[9]; 2130 2131 args[0] = LLVMConstInt(ctx->i32, a->target, 0); 2132 args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0); 2133 2134 if (a->compr) { 2135 LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context); 2136 LLVMTypeRef v2i16 = LLVMVectorType(i16, 2); 2137 2138 args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], 2139 v2i16, ""); 2140 args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], 2141 v2i16, ""); 2142 args[4] = LLVMConstInt(ctx->i1, a->done, 0); 2143 args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0); 2144 2145 ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16", 2146 ctx->voidt, args, 6, 0); 2147 } else { 2148 args[2] = a->out[0]; 2149 args[3] = a->out[1]; 2150 args[4] = a->out[2]; 2151 args[5] = a->out[3]; 2152 args[6] = LLVMConstInt(ctx->i1, a->done, 0); 2153 args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0); 2154 2155 ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32", 2156 ctx->voidt, args, 8, 0); 2157 } 2158} 2159 2160void ac_build_export_null(struct ac_llvm_context *ctx) 2161{ 2162 struct ac_export_args args; 2163 2164 args.enabled_channels = 0x0; /* enabled channels */ 2165 args.valid_mask = 1; /* whether the EXEC mask is valid */ 2166 args.done = 1; /* DONE bit */ 2167 args.target = V_008DFC_SQ_EXP_NULL; 2168 args.compr = 0; /* COMPR flag (0 = 32-bit export) */ 2169 args.out[0] = LLVMGetUndef(ctx->f32); /* R */ 2170 args.out[1] = LLVMGetUndef(ctx->f32); /* G */ 2171 args.out[2] = 
/* Build a call to an llvm.amdgcn.image.* intrinsic described by "a".
 *
 * Both the operand list and the intrinsic name are assembled from the
 * fields of "a" (opcode, dim, optional offset/bias/compare/derivs/lod,
 * level_zero). Returns the call result; when the call returns v4f32 for a
 * non-sample opcode, the result is bitcast to v4i32 first.
 */
LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
				   struct ac_image_args *a)
{
	const char *overload[3] = { "", "", "" };
	unsigned num_overloads = 0;
	LLVMValueRef args[18];
	unsigned num_args = 0;
	enum ac_image_dim dim = a->dim;

	/* Sanity checks on which optional fields may be combined with
	 * which opcode. */
	assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 ||
	       !a->level_zero);
	assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
		a->opcode != ac_image_store_mip) ||
	       a->lod);
	assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
	       (!a->compare && !a->offset));
	assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
		a->opcode == ac_image_get_lod) ||
	       !a->bias);
	/* At most one of bias, lod, level_zero and derivatives. */
	assert((a->bias ? 1 : 0) +
	       (a->lod ? 1 : 0) +
	       (a->level_zero ? 1 : 0) +
	       (a->derivs[0] ? 1 : 0) <= 1);

	/* For get_lod, array and cube dimensions are replaced by the
	 * corresponding non-array 1D/2D dimension. */
	if (a->opcode == ac_image_get_lod) {
		switch (dim) {
		case ac_image_1darray:
			dim = ac_image_1d;
			break;
		case ac_image_2darray:
		case ac_image_cube:
			dim = ac_image_2d;
			break;
		default:
			break;
		}
	}

	bool sample = a->opcode == ac_image_sample ||
		      a->opcode == ac_image_gather4 ||
		      a->opcode == ac_image_get_lod;
	bool atomic = a->opcode == ac_image_atomic ||
		      a->opcode == ac_image_atomic_cmpswap;
	/* Sample-type opcodes take float coordinates, the rest integer. */
	LLVMTypeRef coord_type = sample ? ctx->f32 : ctx->i32;

	/* Data operands come first for atomics and stores. */
	if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
		args[num_args++] = a->data[0];
		if (a->opcode == ac_image_atomic_cmpswap)
			args[num_args++] = a->data[1];
	}

	if (!atomic)
		args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, false);

	/* Optional operands, each of which is also reflected in the
	 * intrinsic name built below. */
	if (a->offset)
		args[num_args++] = ac_to_integer(ctx, a->offset);
	if (a->bias) {
		args[num_args++] = ac_to_float(ctx, a->bias);
		overload[num_overloads++] = ".f32";
	}
	if (a->compare)
		args[num_args++] = ac_to_float(ctx, a->compare);
	if (a->derivs[0]) {
		unsigned count = ac_num_derivs(dim);
		for (unsigned i = 0; i < count; ++i)
			args[num_args++] = ac_to_float(ctx, a->derivs[i]);
		overload[num_overloads++] = ".f32";
	}
	/* get_resinfo takes no coordinates. */
	unsigned num_coords =
		a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
	for (unsigned i = 0; i < num_coords; ++i)
		args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
	if (a->lod)
		args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
	/* The coordinate type overload is always the last one. */
	overload[num_overloads++] = sample ? ".f32" : ".i32";

	args[num_args++] = a->resource;
	if (sample) {
		args[num_args++] = a->sampler;
		args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
	}

	args[num_args++] = ctx->i32_0; /* texfailctrl */
	args[num_args++] = LLVMConstInt(ctx->i32, a->cache_policy, false);

	/* Base name of the intrinsic; atomics append the operation name. */
	const char *name;
	const char *atomic_subop = "";
	switch (a->opcode) {
	case ac_image_sample: name = "sample"; break;
	case ac_image_gather4: name = "gather4"; break;
	case ac_image_load: name = "load"; break;
	case ac_image_load_mip: name = "load.mip"; break;
	case ac_image_store: name = "store"; break;
	case ac_image_store_mip: name = "store.mip"; break;
	case ac_image_atomic:
		name = "atomic.";
		atomic_subop = get_atomic_name(a->atomic);
		break;
	case ac_image_atomic_cmpswap:
		name = "atomic.";
		atomic_subop = "cmpswap";
		break;
	case ac_image_get_lod: name = "getlod"; break;
	case ac_image_get_resinfo: name = "getresinfo"; break;
	default: unreachable("invalid image opcode");
	}

	const char *dimname;
	switch (dim) {
	case ac_image_1d: dimname = "1d"; break;
	case ac_image_2d: dimname = "2d"; break;
	case ac_image_3d: dimname = "3d"; break;
	case ac_image_cube: dimname = "cube"; break;
	case ac_image_1darray: dimname = "1darray"; break;
	case ac_image_2darray: dimname = "2darray"; break;
	case ac_image_2dmsaa: dimname = "2dmsaa"; break;
	case ac_image_2darraymsaa: dimname = "2darraymsaa"; break;
	default: unreachable("invalid dim");
	}

	/* The ".l" modifier is only used for sample and gather4. */
	bool lod_suffix =
		a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
	char intr_name[96];
	snprintf(intr_name, sizeof(intr_name),
		 "llvm.amdgcn.image.%s%s" /* base name */
		 "%s%s%s" /* sample/gather modifiers */
		 ".%s.%s%s%s%s", /* dimension and type overloads */
		 name, atomic_subop,
		 a->compare ? ".c" : "",
		 a->bias ? ".b" :
		 lod_suffix ? ".l" :
		 a->derivs[0] ? ".d" :
		 a->level_zero ? ".lz" : "",
		 a->offset ? ".o" : "",
		 dimname,
		 atomic ? "i32" : "v4f32",
		 overload[0], overload[1], overload[2]);

	/* Atomics return i32, stores return nothing, everything else
	 * returns v4f32. */
	LLVMTypeRef retty;
	if (atomic)
		retty = ctx->i32;
	else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
		retty = ctx->voidt;
	else
		retty = ctx->v4f32;

	LLVMValueRef result =
		ac_build_intrinsic(ctx, intr_name, retty, args, num_args,
				   a->attributes);
	if (!sample && retty == ctx->v4f32) {
		result = LLVMBuildBitCast(ctx->builder, result,
					  ctx->v4i32, "");
	}
	return result;
}
*/ 2420LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, 2421 LLVMValueRef args[2], unsigned bits, bool hi) 2422{ 2423 assert(bits == 8 || bits == 10 || bits == 16); 2424 2425 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, 2426 bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0); 2427 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32, 2428 bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0); 2429 LLVMValueRef max_alpha = 2430 bits != 10 ? max_rgb : ctx->i32_1; 2431 LLVMValueRef min_alpha = 2432 bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0); 2433 2434 /* Clamp. */ 2435 if (bits != 16) { 2436 for (int i = 0; i < 2; i++) { 2437 bool alpha = hi && i == 1; 2438 args[i] = ac_build_imin(ctx, args[i], 2439 alpha ? max_alpha : max_rgb); 2440 args[i] = ac_build_imax(ctx, args[i], 2441 alpha ? min_alpha : min_rgb); 2442 } 2443 } 2444 2445 LLVMValueRef res = 2446 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16", 2447 ctx->v2i16, args, 2, 2448 AC_FUNC_ATTR_READNONE); 2449 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, ""); 2450} 2451 2452/* The 8-bit and 10-bit clamping is for HW workarounds. */ 2453LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, 2454 LLVMValueRef args[2], unsigned bits, bool hi) 2455{ 2456 assert(bits == 8 || bits == 10 || bits == 16); 2457 2458 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, 2459 bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0); 2460 LLVMValueRef max_alpha = 2461 bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0); 2462 2463 /* Clamp. */ 2464 if (bits != 16) { 2465 for (int i = 0; i < 2; i++) { 2466 bool alpha = hi && i == 1; 2467 args[i] = ac_build_umin(ctx, args[i], 2468 alpha ? 
max_alpha : max_rgb); 2469 } 2470 } 2471 2472 LLVMValueRef res = 2473 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16", 2474 ctx->v2i16, args, 2, 2475 AC_FUNC_ATTR_READNONE); 2476 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, ""); 2477} 2478 2479LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1) 2480{ 2481 return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1, 2482 &i1, 1, AC_FUNC_ATTR_READNONE); 2483} 2484 2485void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1) 2486{ 2487 ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt, 2488 &i1, 1, 0); 2489} 2490 2491LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, 2492 LLVMValueRef offset, LLVMValueRef width, 2493 bool is_signed) 2494{ 2495 LLVMValueRef args[] = { 2496 input, 2497 offset, 2498 width, 2499 }; 2500 2501 return ac_build_intrinsic(ctx, 2502 is_signed ? "llvm.amdgcn.sbfe.i32" : 2503 "llvm.amdgcn.ubfe.i32", 2504 ctx->i32, args, 3, 2505 AC_FUNC_ATTR_READNONE); 2506} 2507 2508LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, 2509 LLVMValueRef s1, LLVMValueRef s2) 2510{ 2511 return LLVMBuildAdd(ctx->builder, 2512 LLVMBuildMul(ctx->builder, s0, s1, ""), s2, ""); 2513} 2514 2515LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, 2516 LLVMValueRef s1, LLVMValueRef s2) 2517{ 2518 return LLVMBuildFAdd(ctx->builder, 2519 LLVMBuildFMul(ctx->builder, s0, s1, ""), s2, ""); 2520} 2521 2522void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned simm16) 2523{ 2524 LLVMValueRef args[1] = { 2525 LLVMConstInt(ctx->i32, simm16, false), 2526 }; 2527 ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", 2528 ctx->voidt, args, 1, 0); 2529} 2530 2531LLVMValueRef ac_build_fmed3(struct ac_llvm_context *ctx, LLVMValueRef src0, 2532 LLVMValueRef src1, LLVMValueRef src2, 2533 unsigned bitsize) 2534{ 2535 LLVMTypeRef type; 2536 char *intr; 2537 2538 if (bitsize == 16) { 2539 intr = "llvm.amdgcn.fmed3.f16"; 
2540 type = ctx->f16; 2541 } else if (bitsize == 32) { 2542 intr = "llvm.amdgcn.fmed3.f32"; 2543 type = ctx->f32; 2544 } else { 2545 intr = "llvm.amdgcn.fmed3.f64"; 2546 type = ctx->f64; 2547 } 2548 2549 LLVMValueRef params[] = { 2550 src0, 2551 src1, 2552 src2, 2553 }; 2554 return ac_build_intrinsic(ctx, intr, type, params, 3, 2555 AC_FUNC_ATTR_READNONE); 2556} 2557 2558LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, 2559 unsigned bitsize) 2560{ 2561 LLVMTypeRef type; 2562 char *intr; 2563 2564 if (bitsize == 16) { 2565 intr = "llvm.amdgcn.fract.f16"; 2566 type = ctx->f16; 2567 } else if (bitsize == 32) { 2568 intr = "llvm.amdgcn.fract.f32"; 2569 type = ctx->f32; 2570 } else { 2571 intr = "llvm.amdgcn.fract.f64"; 2572 type = ctx->f64; 2573 } 2574 2575 LLVMValueRef params[] = { 2576 src0, 2577 }; 2578 return ac_build_intrinsic(ctx, intr, type, params, 1, 2579 AC_FUNC_ATTR_READNONE); 2580} 2581 2582LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0, 2583 unsigned bitsize) 2584{ 2585 LLVMTypeRef type = LLVMIntTypeInContext(ctx->context, bitsize); 2586 LLVMValueRef zero = LLVMConstInt(type, 0, false); 2587 LLVMValueRef one = LLVMConstInt(type, 1, false); 2588 2589 LLVMValueRef cmp, val; 2590 cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, zero, ""); 2591 val = LLVMBuildSelect(ctx->builder, cmp, one, src0, ""); 2592 cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, zero, ""); 2593 val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstInt(type, -1, true), ""); 2594 return val; 2595} 2596 2597LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0, 2598 unsigned bitsize) 2599{ 2600 LLVMValueRef cmp, val, zero, one; 2601 LLVMTypeRef type; 2602 2603 if (bitsize == 16) { 2604 type = ctx->f16; 2605 zero = ctx->f16_0; 2606 one = ctx->f16_1; 2607 } else if (bitsize == 32) { 2608 type = ctx->f32; 2609 zero = ctx->f32_0; 2610 one = ctx->f32_1; 2611 } else { 2612 type = ctx->f64; 2613 zero = ctx->f64_0; 
2614 one = ctx->f64_1; 2615 } 2616 2617 cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src0, zero, ""); 2618 val = LLVMBuildSelect(ctx->builder, cmp, one, src0, ""); 2619 cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGE, val, zero, ""); 2620 val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstReal(type, -1.0), ""); 2621 return val; 2622} 2623 2624LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0) 2625{ 2626 LLVMValueRef result; 2627 unsigned bitsize; 2628 2629 bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); 2630 2631 switch (bitsize) { 2632 case 64: 2633 result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64, 2634 (LLVMValueRef []) { src0 }, 1, 2635 AC_FUNC_ATTR_READNONE); 2636 2637 result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, ""); 2638 break; 2639 case 32: 2640 result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32, 2641 (LLVMValueRef []) { src0 }, 1, 2642 AC_FUNC_ATTR_READNONE); 2643 break; 2644 case 16: 2645 result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16, 2646 (LLVMValueRef []) { src0 }, 1, 2647 AC_FUNC_ATTR_READNONE); 2648 2649 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, ""); 2650 break; 2651 case 8: 2652 result = ac_build_intrinsic(ctx, "llvm.ctpop.i8", ctx->i8, 2653 (LLVMValueRef []) { src0 }, 1, 2654 AC_FUNC_ATTR_READNONE); 2655 2656 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, ""); 2657 break; 2658 default: 2659 unreachable(!"invalid bitsize"); 2660 break; 2661 } 2662 2663 return result; 2664} 2665 2666LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, 2667 LLVMValueRef src0) 2668{ 2669 LLVMValueRef result; 2670 unsigned bitsize; 2671 2672 bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); 2673 2674 switch (bitsize) { 2675 case 64: 2676 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i64", ctx->i64, 2677 (LLVMValueRef []) { src0 }, 1, 2678 AC_FUNC_ATTR_READNONE); 2679 2680 result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, ""); 2681 break; 2682 
/* Operand indices of an export call instruction, as used with
 * LLVMGetOperand()/LLVMSetOperand() by the VS output optimization below. */
#define AC_EXP_TARGET 0
#define AC_EXP_ENABLED_CHANNELS 1
#define AC_EXP_OUT0 2

/* Classification of one exported channel value. */
enum ac_ir_type {
	AC_IR_UNDEF, /* the operand is undef */
	AC_IR_CONST, /* the operand is a floating-point constant */
	AC_IR_VALUE, /* anything else */
};

/* One channel of a parsed export instruction. */
struct ac_vs_exp_chan
{
	LLVMValueRef value;   /* the export operand itself */
	float const_float;    /* constant value; set only for AC_IR_CONST */
	enum ac_ir_type type;
};

/* A parsed PARAM export instruction. */
struct ac_vs_exp_inst {
	unsigned offset;      /* export target minus V_008DFC_SQ_EXP_PARAM */
	LLVMValueRef inst;    /* the export call instruction */
	struct ac_vs_exp_chan chan[4];
};

/* List of the PARAM exports that were kept so far. */
struct ac_vs_exports {
	unsigned num;         /* number of valid entries in exp[] */
	struct ac_vs_exp_inst exp[VARYING_SLOT_MAX];
};
*/ 2747 if (exp->chan[i].type == AC_IR_UNDEF) { 2748 is_zero[i] = true; 2749 is_one[i] = true; 2750 } else if (exp->chan[i].type == AC_IR_CONST) { 2751 if (exp->chan[i].const_float == 0) 2752 is_zero[i] = true; 2753 else if (exp->chan[i].const_float == 1) 2754 is_one[i] = true; 2755 else 2756 return false; /* other constant */ 2757 } else 2758 return false; 2759 } 2760 2761 /* Only certain combinations of 0 and 1 can be eliminated. */ 2762 if (is_zero[0] && is_zero[1] && is_zero[2]) 2763 default_val = is_zero[3] ? 0 : 1; 2764 else if (is_one[0] && is_one[1] && is_one[2]) 2765 default_val = is_zero[3] ? 2 : 3; 2766 else 2767 return false; 2768 2769 /* The PARAM export can be represented as DEFAULT_VAL. Kill it. */ 2770 LLVMInstructionEraseFromParent(exp->inst); 2771 2772 /* Change OFFSET to DEFAULT_VAL. */ 2773 for (i = 0; i < num_outputs; i++) { 2774 if (vs_output_param_offset[i] == exp->offset) { 2775 vs_output_param_offset[i] = 2776 AC_EXP_PARAM_DEFAULT_VAL_0000 + default_val; 2777 break; 2778 } 2779 } 2780 return true; 2781} 2782 2783static bool ac_eliminate_duplicated_output(struct ac_llvm_context *ctx, 2784 uint8_t *vs_output_param_offset, 2785 uint32_t num_outputs, 2786 struct ac_vs_exports *processed, 2787 struct ac_vs_exp_inst *exp) 2788{ 2789 unsigned p, copy_back_channels = 0; 2790 2791 /* See if the output is already in the list of processed outputs. 2792 * The LLVMValueRef comparison relies on SSA. 2793 */ 2794 for (p = 0; p < processed->num; p++) { 2795 bool different = false; 2796 2797 for (unsigned j = 0; j < 4; j++) { 2798 struct ac_vs_exp_chan *c1 = &processed->exp[p].chan[j]; 2799 struct ac_vs_exp_chan *c2 = &exp->chan[j]; 2800 2801 /* Treat undef as a match. */ 2802 if (c2->type == AC_IR_UNDEF) 2803 continue; 2804 2805 /* If c1 is undef but c2 isn't, we can copy c2 to c1 2806 * and consider the instruction duplicated. 
2807 */ 2808 if (c1->type == AC_IR_UNDEF) { 2809 copy_back_channels |= 1 << j; 2810 continue; 2811 } 2812 2813 /* Test whether the channels are not equal. */ 2814 if (c1->type != c2->type || 2815 (c1->type == AC_IR_CONST && 2816 c1->const_float != c2->const_float) || 2817 (c1->type == AC_IR_VALUE && 2818 c1->value != c2->value)) { 2819 different = true; 2820 break; 2821 } 2822 } 2823 if (!different) 2824 break; 2825 2826 copy_back_channels = 0; 2827 } 2828 if (p == processed->num) 2829 return false; 2830 2831 /* If a match was found, but the matching export has undef where the new 2832 * one has a normal value, copy the normal value to the undef channel. 2833 */ 2834 struct ac_vs_exp_inst *match = &processed->exp[p]; 2835 2836 /* Get current enabled channels mask. */ 2837 LLVMValueRef arg = LLVMGetOperand(match->inst, AC_EXP_ENABLED_CHANNELS); 2838 unsigned enabled_channels = LLVMConstIntGetZExtValue(arg); 2839 2840 while (copy_back_channels) { 2841 unsigned chan = u_bit_scan(©_back_channels); 2842 2843 assert(match->chan[chan].type == AC_IR_UNDEF); 2844 LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan, 2845 exp->chan[chan].value); 2846 match->chan[chan] = exp->chan[chan]; 2847 2848 /* Update number of enabled channels because the original mask 2849 * is not always 0xf. 2850 */ 2851 enabled_channels |= (1 << chan); 2852 LLVMSetOperand(match->inst, AC_EXP_ENABLED_CHANNELS, 2853 LLVMConstInt(ctx->i32, enabled_channels, 0)); 2854 } 2855 2856 /* The PARAM export is duplicated. Kill it. */ 2857 LLVMInstructionEraseFromParent(exp->inst); 2858 2859 /* Change OFFSET to the matching export. 
*/ 2860 for (unsigned i = 0; i < num_outputs; i++) { 2861 if (vs_output_param_offset[i] == exp->offset) { 2862 vs_output_param_offset[i] = match->offset; 2863 break; 2864 } 2865 } 2866 return true; 2867} 2868 2869void ac_optimize_vs_outputs(struct ac_llvm_context *ctx, 2870 LLVMValueRef main_fn, 2871 uint8_t *vs_output_param_offset, 2872 uint32_t num_outputs, 2873 uint8_t *num_param_exports) 2874{ 2875 LLVMBasicBlockRef bb; 2876 bool removed_any = false; 2877 struct ac_vs_exports exports; 2878 2879 exports.num = 0; 2880 2881 /* Process all LLVM instructions. */ 2882 bb = LLVMGetFirstBasicBlock(main_fn); 2883 while (bb) { 2884 LLVMValueRef inst = LLVMGetFirstInstruction(bb); 2885 2886 while (inst) { 2887 LLVMValueRef cur = inst; 2888 inst = LLVMGetNextInstruction(inst); 2889 struct ac_vs_exp_inst exp; 2890 2891 if (LLVMGetInstructionOpcode(cur) != LLVMCall) 2892 continue; 2893 2894 LLVMValueRef callee = ac_llvm_get_called_value(cur); 2895 2896 if (!ac_llvm_is_function(callee)) 2897 continue; 2898 2899 const char *name = LLVMGetValueName(callee); 2900 unsigned num_args = LLVMCountParams(callee); 2901 2902 /* Check if this is an export instruction. */ 2903 if ((num_args != 9 && num_args != 8) || 2904 (strcmp(name, "llvm.SI.export") && 2905 strcmp(name, "llvm.amdgcn.exp.f32"))) 2906 continue; 2907 2908 LLVMValueRef arg = LLVMGetOperand(cur, AC_EXP_TARGET); 2909 unsigned target = LLVMConstIntGetZExtValue(arg); 2910 2911 if (target < V_008DFC_SQ_EXP_PARAM) 2912 continue; 2913 2914 target -= V_008DFC_SQ_EXP_PARAM; 2915 2916 /* Parse the instruction. 
*/ 2917 memset(&exp, 0, sizeof(exp)); 2918 exp.offset = target; 2919 exp.inst = cur; 2920 2921 for (unsigned i = 0; i < 4; i++) { 2922 LLVMValueRef v = LLVMGetOperand(cur, AC_EXP_OUT0 + i); 2923 2924 exp.chan[i].value = v; 2925 2926 if (LLVMIsUndef(v)) { 2927 exp.chan[i].type = AC_IR_UNDEF; 2928 } else if (LLVMIsAConstantFP(v)) { 2929 LLVMBool loses_info; 2930 exp.chan[i].type = AC_IR_CONST; 2931 exp.chan[i].const_float = 2932 LLVMConstRealGetDouble(v, &loses_info); 2933 } else { 2934 exp.chan[i].type = AC_IR_VALUE; 2935 } 2936 } 2937 2938 /* Eliminate constant and duplicated PARAM exports. */ 2939 if (ac_eliminate_const_output(vs_output_param_offset, 2940 num_outputs, &exp) || 2941 ac_eliminate_duplicated_output(ctx, 2942 vs_output_param_offset, 2943 num_outputs, &exports, 2944 &exp)) { 2945 removed_any = true; 2946 } else { 2947 exports.exp[exports.num++] = exp; 2948 } 2949 } 2950 bb = LLVMGetNextBasicBlock(bb); 2951 } 2952 2953 /* Remove holes in export memory due to removed PARAM exports. 2954 * This is done by renumbering all PARAM exports. 2955 */ 2956 if (removed_any) { 2957 uint8_t old_offset[VARYING_SLOT_MAX]; 2958 unsigned out, i; 2959 2960 /* Make a copy of the offsets. We need the old version while 2961 * we are modifying some of them. */ 2962 memcpy(old_offset, vs_output_param_offset, 2963 sizeof(old_offset)); 2964 2965 for (i = 0; i < exports.num; i++) { 2966 unsigned offset = exports.exp[i].offset; 2967 2968 /* Update vs_output_param_offset. Multiple outputs can 2969 * have the same offset. 2970 */ 2971 for (out = 0; out < num_outputs; out++) { 2972 if (old_offset[out] == offset) 2973 vs_output_param_offset[out] = i; 2974 } 2975 2976 /* Change the PARAM offset in the instruction. 
*/ 2977 LLVMSetOperand(exports.exp[i].inst, AC_EXP_TARGET, 2978 LLVMConstInt(ctx->i32, 2979 V_008DFC_SQ_EXP_PARAM + i, 0)); 2980 } 2981 *num_param_exports = exports.num; 2982 } 2983} 2984 2985void ac_init_exec_full_mask(struct ac_llvm_context *ctx) 2986{ 2987 LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0); 2988 ac_build_intrinsic(ctx, 2989 "llvm.amdgcn.init.exec", ctx->voidt, 2990 &full_mask, 1, AC_FUNC_ATTR_CONVERGENT); 2991} 2992 2993void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx) 2994{ 2995 unsigned lds_size = ctx->chip_class >= CIK ? 65536 : 32768; 2996 ctx->lds = LLVMBuildIntToPtr(ctx->builder, ctx->i32_0, 2997 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS), 2998 "lds"); 2999} 3000 3001LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx, 3002 LLVMValueRef dw_addr) 3003{ 3004 return ac_build_load(ctx, ctx->lds, dw_addr); 3005} 3006 3007void ac_lds_store(struct ac_llvm_context *ctx, 3008 LLVMValueRef dw_addr, 3009 LLVMValueRef value) 3010{ 3011 value = ac_to_integer(ctx, value); 3012 ac_build_indexed_store(ctx, ctx->lds, 3013 dw_addr, value); 3014} 3015 3016LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, 3017 LLVMTypeRef dst_type, 3018 LLVMValueRef src0) 3019{ 3020 unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); 3021 const char *intrin_name; 3022 LLVMTypeRef type; 3023 LLVMValueRef zero; 3024 3025 switch (src0_bitsize) { 3026 case 64: 3027 intrin_name = "llvm.cttz.i64"; 3028 type = ctx->i64; 3029 zero = ctx->i64_0; 3030 break; 3031 case 32: 3032 intrin_name = "llvm.cttz.i32"; 3033 type = ctx->i32; 3034 zero = ctx->i32_0; 3035 break; 3036 case 16: 3037 intrin_name = "llvm.cttz.i16"; 3038 type = ctx->i16; 3039 zero = ctx->i16_0; 3040 break; 3041 case 8: 3042 intrin_name = "llvm.cttz.i8"; 3043 type = ctx->i8; 3044 zero = ctx->i8_0; 3045 break; 3046 default: 3047 unreachable(!"invalid bitsize"); 3048 } 3049 3050 LLVMValueRef params[2] = { 3051 src0, 3052 3053 /* The value of 1 means that 
ffs(x=0) = undef, so LLVM won't 3054 * add special code to check for x=0. The reason is that 3055 * the LLVM behavior for x=0 is different from what we 3056 * need here. However, LLVM also assumes that ffs(x) is 3057 * in [0, 31], but GLSL expects that ffs(0) = -1, so 3058 * a conditional assignment to handle 0 is still required. 3059 * 3060 * The hardware already implements the correct behavior. 3061 */ 3062 ctx->i1true, 3063 }; 3064 3065 LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type, 3066 params, 2, 3067 AC_FUNC_ATTR_READNONE); 3068 3069 if (src0_bitsize == 64) { 3070 lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, ""); 3071 } else if (src0_bitsize < 32) { 3072 lsb = LLVMBuildSExt(ctx->builder, lsb, ctx->i32, ""); 3073 } 3074 3075 /* TODO: We need an intrinsic to skip this conditional. */ 3076 /* Check for zero: */ 3077 return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, 3078 LLVMIntEQ, src0, 3079 zero, ""), 3080 LLVMConstInt(ctx->i32, -1, 0), lsb, ""); 3081} 3082 3083LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type) 3084{ 3085 return LLVMPointerType(LLVMArrayType(elem_type, 0), 3086 AC_ADDR_SPACE_CONST); 3087} 3088 3089LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type) 3090{ 3091 return LLVMPointerType(LLVMArrayType(elem_type, 0), 3092 AC_ADDR_SPACE_CONST_32BIT); 3093} 3094 3095static struct ac_llvm_flow * 3096get_current_flow(struct ac_llvm_context *ctx) 3097{ 3098 if (ctx->flow_depth > 0) 3099 return &ctx->flow[ctx->flow_depth - 1]; 3100 return NULL; 3101} 3102 3103static struct ac_llvm_flow * 3104get_innermost_loop(struct ac_llvm_context *ctx) 3105{ 3106 for (unsigned i = ctx->flow_depth; i > 0; --i) { 3107 if (ctx->flow[i - 1].loop_entry_block) 3108 return &ctx->flow[i - 1]; 3109 } 3110 return NULL; 3111} 3112 3113static struct ac_llvm_flow * 3114push_flow(struct ac_llvm_context *ctx) 3115{ 3116 struct ac_llvm_flow *flow; 3117 3118 if (ctx->flow_depth >= ctx->flow_depth_max) { 3119 unsigned new_max = 
MAX2(ctx->flow_depth << 1, 3120 AC_LLVM_INITIAL_CF_DEPTH); 3121 3122 ctx->flow = realloc(ctx->flow, new_max * sizeof(*ctx->flow)); 3123 ctx->flow_depth_max = new_max; 3124 } 3125 3126 flow = &ctx->flow[ctx->flow_depth]; 3127 ctx->flow_depth++; 3128 3129 flow->next_block = NULL; 3130 flow->loop_entry_block = NULL; 3131 return flow; 3132} 3133 3134static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base, 3135 int label_id) 3136{ 3137 char buf[32]; 3138 snprintf(buf, sizeof(buf), "%s%d", base, label_id); 3139 LLVMSetValueName(LLVMBasicBlockAsValue(bb), buf); 3140} 3141 3142/* Append a basic block at the level of the parent flow. 3143 */ 3144static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx, 3145 const char *name) 3146{ 3147 assert(ctx->flow_depth >= 1); 3148 3149 if (ctx->flow_depth >= 2) { 3150 struct ac_llvm_flow *flow = &ctx->flow[ctx->flow_depth - 2]; 3151 3152 return LLVMInsertBasicBlockInContext(ctx->context, 3153 flow->next_block, name); 3154 } 3155 3156 LLVMValueRef main_fn = 3157 LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder)); 3158 return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name); 3159} 3160 3161/* Emit a branch to the given default target for the current block if 3162 * applicable -- that is, if the current block does not already contain a 3163 * branch from a break or continue. 
3164 */ 3165static void emit_default_branch(LLVMBuilderRef builder, 3166 LLVMBasicBlockRef target) 3167{ 3168 if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder))) 3169 LLVMBuildBr(builder, target); 3170} 3171 3172void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id) 3173{ 3174 struct ac_llvm_flow *flow = push_flow(ctx); 3175 flow->loop_entry_block = append_basic_block(ctx, "LOOP"); 3176 flow->next_block = append_basic_block(ctx, "ENDLOOP"); 3177 set_basicblock_name(flow->loop_entry_block, "loop", label_id); 3178 LLVMBuildBr(ctx->builder, flow->loop_entry_block); 3179 LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block); 3180} 3181 3182void ac_build_break(struct ac_llvm_context *ctx) 3183{ 3184 struct ac_llvm_flow *flow = get_innermost_loop(ctx); 3185 LLVMBuildBr(ctx->builder, flow->next_block); 3186} 3187 3188void ac_build_continue(struct ac_llvm_context *ctx) 3189{ 3190 struct ac_llvm_flow *flow = get_innermost_loop(ctx); 3191 LLVMBuildBr(ctx->builder, flow->loop_entry_block); 3192} 3193 3194void ac_build_else(struct ac_llvm_context *ctx, int label_id) 3195{ 3196 struct ac_llvm_flow *current_branch = get_current_flow(ctx); 3197 LLVMBasicBlockRef endif_block; 3198 3199 assert(!current_branch->loop_entry_block); 3200 3201 endif_block = append_basic_block(ctx, "ENDIF"); 3202 emit_default_branch(ctx->builder, endif_block); 3203 3204 LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block); 3205 set_basicblock_name(current_branch->next_block, "else", label_id); 3206 3207 current_branch->next_block = endif_block; 3208} 3209 3210void ac_build_endif(struct ac_llvm_context *ctx, int label_id) 3211{ 3212 struct ac_llvm_flow *current_branch = get_current_flow(ctx); 3213 3214 assert(!current_branch->loop_entry_block); 3215 3216 emit_default_branch(ctx->builder, current_branch->next_block); 3217 LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block); 3218 set_basicblock_name(current_branch->next_block, "endif", label_id); 
3219 3220 ctx->flow_depth--; 3221} 3222 3223void ac_build_endloop(struct ac_llvm_context *ctx, int label_id) 3224{ 3225 struct ac_llvm_flow *current_loop = get_current_flow(ctx); 3226 3227 assert(current_loop->loop_entry_block); 3228 3229 emit_default_branch(ctx->builder, current_loop->loop_entry_block); 3230 3231 LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block); 3232 set_basicblock_name(current_loop->next_block, "endloop", label_id); 3233 ctx->flow_depth--; 3234} 3235 3236void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id) 3237{ 3238 struct ac_llvm_flow *flow = push_flow(ctx); 3239 LLVMBasicBlockRef if_block; 3240 3241 if_block = append_basic_block(ctx, "IF"); 3242 flow->next_block = append_basic_block(ctx, "ELSE"); 3243 set_basicblock_name(if_block, "if", label_id); 3244 LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block); 3245 LLVMPositionBuilderAtEnd(ctx->builder, if_block); 3246} 3247 3248void ac_build_if(struct ac_llvm_context *ctx, LLVMValueRef value, 3249 int label_id) 3250{ 3251 LLVMValueRef cond = LLVMBuildFCmp(ctx->builder, LLVMRealUNE, 3252 value, ctx->f32_0, ""); 3253 ac_build_ifcc(ctx, cond, label_id); 3254} 3255 3256void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value, 3257 int label_id) 3258{ 3259 LLVMValueRef cond = LLVMBuildICmp(ctx->builder, LLVMIntNE, 3260 ac_to_integer(ctx, value), 3261 ctx->i32_0, ""); 3262 ac_build_ifcc(ctx, cond, label_id); 3263} 3264 3265LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type, 3266 const char *name) 3267{ 3268 LLVMBuilderRef builder = ac->builder; 3269 LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder); 3270 LLVMValueRef function = LLVMGetBasicBlockParent(current_block); 3271 LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function); 3272 LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block); 3273 LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context); 3274 LLVMValueRef 
res; 3275 3276 if (first_instr) { 3277 LLVMPositionBuilderBefore(first_builder, first_instr); 3278 } else { 3279 LLVMPositionBuilderAtEnd(first_builder, first_block); 3280 } 3281 3282 res = LLVMBuildAlloca(first_builder, type, name); 3283 LLVMDisposeBuilder(first_builder); 3284 return res; 3285} 3286 3287LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, 3288 LLVMTypeRef type, const char *name) 3289{ 3290 LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name); 3291 LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr); 3292 return ptr; 3293} 3294 3295LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr, 3296 LLVMTypeRef type) 3297{ 3298 int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); 3299 return LLVMBuildBitCast(ctx->builder, ptr, 3300 LLVMPointerType(type, addr_space), ""); 3301} 3302 3303LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, 3304 unsigned count) 3305{ 3306 unsigned num_components = ac_get_llvm_num_components(value); 3307 if (count == num_components) 3308 return value; 3309 3310 LLVMValueRef masks[MAX2(count, 2)]; 3311 masks[0] = ctx->i32_0; 3312 masks[1] = ctx->i32_1; 3313 for (unsigned i = 2; i < count; i++) 3314 masks[i] = LLVMConstInt(ctx->i32, i, false); 3315 3316 if (count == 1) 3317 return LLVMBuildExtractElement(ctx->builder, value, masks[0], 3318 ""); 3319 3320 LLVMValueRef swizzle = LLVMConstVector(masks, count); 3321 return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, ""); 3322} 3323 3324LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, 3325 unsigned rshift, unsigned bitwidth) 3326{ 3327 LLVMValueRef value = param; 3328 if (rshift) 3329 value = LLVMBuildLShr(ctx->builder, value, 3330 LLVMConstInt(ctx->i32, rshift, false), ""); 3331 3332 if (rshift + bitwidth < 32) { 3333 unsigned mask = (1 << bitwidth) - 1; 3334 value = LLVMBuildAnd(ctx->builder, value, 3335 LLVMConstInt(ctx->i32, mask, false), ""); 3336 } 3337 return value; 3338} 
3339 3340/* Adjust the sample index according to FMASK. 3341 * 3342 * For uncompressed MSAA surfaces, FMASK should return 0x76543210, 3343 * which is the identity mapping. Each nibble says which physical sample 3344 * should be fetched to get that sample. 3345 * 3346 * For example, 0x11111100 means there are only 2 samples stored and 3347 * the second sample covers 3/4 of the pixel. When reading samples 0 3348 * and 1, return physical sample 0 (determined by the first two 0s 3349 * in FMASK), otherwise return physical sample 1. 3350 * 3351 * The sample index should be adjusted as follows: 3352 * addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF; 3353 */ 3354void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask, 3355 LLVMValueRef *addr, bool is_array_tex) 3356{ 3357 struct ac_image_args fmask_load = {}; 3358 fmask_load.opcode = ac_image_load; 3359 fmask_load.resource = fmask; 3360 fmask_load.dmask = 0xf; 3361 fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d; 3362 fmask_load.attributes = AC_FUNC_ATTR_READNONE; 3363 3364 fmask_load.coords[0] = addr[0]; 3365 fmask_load.coords[1] = addr[1]; 3366 if (is_array_tex) 3367 fmask_load.coords[2] = addr[2]; 3368 3369 LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load); 3370 fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value, 3371 ac->i32_0, ""); 3372 3373 /* Apply the formula. */ 3374 unsigned sample_chan = is_array_tex ? 3 : 2; 3375 LLVMValueRef final_sample; 3376 final_sample = LLVMBuildMul(ac->builder, addr[sample_chan], 3377 LLVMConstInt(ac->i32, 4, 0), ""); 3378 final_sample = LLVMBuildLShr(ac->builder, fmask_value, final_sample, ""); 3379 /* Mask the sample index by 0x7, because 0x8 means an unknown value 3380 * with EQAA, so those will map to 0. 
*/ 3381 final_sample = LLVMBuildAnd(ac->builder, final_sample, 3382 LLVMConstInt(ac->i32, 0x7, 0), ""); 3383 3384 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK 3385 * resource descriptor is 0 (invalid). 3386 */ 3387 LLVMValueRef tmp; 3388 tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, ""); 3389 tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, ""); 3390 tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, ""); 3391 3392 /* Replace the MSAA sample index. */ 3393 addr[sample_chan] = LLVMBuildSelect(ac->builder, tmp, final_sample, 3394 addr[sample_chan], ""); 3395} 3396 3397static LLVMValueRef 3398_ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane) 3399{ 3400 ac_build_optimization_barrier(ctx, &src); 3401 return ac_build_intrinsic(ctx, 3402 lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane", 3403 LLVMTypeOf(src), (LLVMValueRef []) { 3404 src, lane }, 3405 lane == NULL ? 1 : 2, 3406 AC_FUNC_ATTR_READNONE | 3407 AC_FUNC_ATTR_CONVERGENT); 3408} 3409 3410/** 3411 * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic. 
3412 * @param ctx 3413 * @param src 3414 * @param lane - id of the lane or NULL for the first active lane 3415 * @return value of the lane 3416 */ 3417LLVMValueRef 3418ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane) 3419{ 3420 LLVMTypeRef src_type = LLVMTypeOf(src); 3421 src = ac_to_integer(ctx, src); 3422 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); 3423 LLVMValueRef ret; 3424 3425 if (bits == 32) { 3426 ret = _ac_build_readlane(ctx, src, lane); 3427 } else { 3428 assert(bits % 32 == 0); 3429 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); 3430 LLVMValueRef src_vector = 3431 LLVMBuildBitCast(ctx->builder, src, vec_type, ""); 3432 ret = LLVMGetUndef(vec_type); 3433 for (unsigned i = 0; i < bits / 32; i++) { 3434 src = LLVMBuildExtractElement(ctx->builder, src_vector, 3435 LLVMConstInt(ctx->i32, i, 0), ""); 3436 LLVMValueRef ret_comp = _ac_build_readlane(ctx, src, lane); 3437 ret = LLVMBuildInsertElement(ctx->builder, ret, ret_comp, 3438 LLVMConstInt(ctx->i32, i, 0), ""); 3439 } 3440 } 3441 if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind) 3442 return LLVMBuildIntToPtr(ctx->builder, ret, src_type, ""); 3443 return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); 3444} 3445 3446LLVMValueRef 3447ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value, LLVMValueRef lane) 3448{ 3449 /* TODO: Use the actual instruction when LLVM adds an intrinsic for it. 
3450 */ 3451 LLVMValueRef pred = LLVMBuildICmp(ctx->builder, LLVMIntEQ, lane, 3452 ac_get_thread_id(ctx), ""); 3453 return LLVMBuildSelect(ctx->builder, pred, value, src, ""); 3454} 3455 3456LLVMValueRef 3457ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask) 3458{ 3459 LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask, 3460 LLVMVectorType(ctx->i32, 2), 3461 ""); 3462 LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec, 3463 ctx->i32_0, ""); 3464 LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec, 3465 ctx->i32_1, ""); 3466 LLVMValueRef val = 3467 ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32, 3468 (LLVMValueRef []) { mask_lo, ctx->i32_0 }, 3469 2, AC_FUNC_ATTR_READNONE); 3470 val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32, 3471 (LLVMValueRef []) { mask_hi, val }, 3472 2, AC_FUNC_ATTR_READNONE); 3473 return val; 3474} 3475 3476enum dpp_ctrl { 3477 _dpp_quad_perm = 0x000, 3478 _dpp_row_sl = 0x100, 3479 _dpp_row_sr = 0x110, 3480 _dpp_row_rr = 0x120, 3481 dpp_wf_sl1 = 0x130, 3482 dpp_wf_rl1 = 0x134, 3483 dpp_wf_sr1 = 0x138, 3484 dpp_wf_rr1 = 0x13C, 3485 dpp_row_mirror = 0x140, 3486 dpp_row_half_mirror = 0x141, 3487 dpp_row_bcast15 = 0x142, 3488 dpp_row_bcast31 = 0x143 3489}; 3490 3491static inline enum dpp_ctrl 3492dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3) 3493{ 3494 assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4); 3495 return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6); 3496} 3497 3498static inline enum dpp_ctrl 3499dpp_row_sl(unsigned amount) 3500{ 3501 assert(amount > 0 && amount < 16); 3502 return _dpp_row_sl | amount; 3503} 3504 3505static inline enum dpp_ctrl 3506dpp_row_sr(unsigned amount) 3507{ 3508 assert(amount > 0 && amount < 16); 3509 return _dpp_row_sr | amount; 3510} 3511 3512static LLVMValueRef 3513_ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src, 3514 enum dpp_ctrl 
dpp_ctrl, unsigned row_mask, unsigned bank_mask, 3515 bool bound_ctrl) 3516{ 3517 return ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32", 3518 LLVMTypeOf(old), 3519 (LLVMValueRef[]) { 3520 old, src, 3521 LLVMConstInt(ctx->i32, dpp_ctrl, 0), 3522 LLVMConstInt(ctx->i32, row_mask, 0), 3523 LLVMConstInt(ctx->i32, bank_mask, 0), 3524 LLVMConstInt(ctx->i1, bound_ctrl, 0) }, 3525 6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); 3526} 3527 3528static LLVMValueRef 3529ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src, 3530 enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask, 3531 bool bound_ctrl) 3532{ 3533 LLVMTypeRef src_type = LLVMTypeOf(src); 3534 src = ac_to_integer(ctx, src); 3535 old = ac_to_integer(ctx, old); 3536 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); 3537 LLVMValueRef ret; 3538 if (bits == 32) { 3539 ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, 3540 bank_mask, bound_ctrl); 3541 } else { 3542 assert(bits % 32 == 0); 3543 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); 3544 LLVMValueRef src_vector = 3545 LLVMBuildBitCast(ctx->builder, src, vec_type, ""); 3546 LLVMValueRef old_vector = 3547 LLVMBuildBitCast(ctx->builder, old, vec_type, ""); 3548 ret = LLVMGetUndef(vec_type); 3549 for (unsigned i = 0; i < bits / 32; i++) { 3550 src = LLVMBuildExtractElement(ctx->builder, src_vector, 3551 LLVMConstInt(ctx->i32, i, 3552 0), ""); 3553 old = LLVMBuildExtractElement(ctx->builder, old_vector, 3554 LLVMConstInt(ctx->i32, i, 3555 0), ""); 3556 LLVMValueRef ret_comp = _ac_build_dpp(ctx, old, src, 3557 dpp_ctrl, 3558 row_mask, 3559 bank_mask, 3560 bound_ctrl); 3561 ret = LLVMBuildInsertElement(ctx->builder, ret, 3562 ret_comp, 3563 LLVMConstInt(ctx->i32, i, 3564 0), ""); 3565 } 3566 } 3567 return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); 3568} 3569 3570static inline unsigned 3571ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask) 3572{ 3573 assert(and_mask < 32 && 
or_mask < 32 && xor_mask < 32); 3574 return and_mask | (or_mask << 5) | (xor_mask << 10); 3575} 3576 3577static LLVMValueRef 3578_ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask) 3579{ 3580 return ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle", 3581 LLVMTypeOf(src), (LLVMValueRef []) { 3582 src, LLVMConstInt(ctx->i32, mask, 0) }, 3583 2, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); 3584} 3585 3586LLVMValueRef 3587ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask) 3588{ 3589 LLVMTypeRef src_type = LLVMTypeOf(src); 3590 src = ac_to_integer(ctx, src); 3591 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); 3592 LLVMValueRef ret; 3593 if (bits == 32) { 3594 ret = _ac_build_ds_swizzle(ctx, src, mask); 3595 } else { 3596 assert(bits % 32 == 0); 3597 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); 3598 LLVMValueRef src_vector = 3599 LLVMBuildBitCast(ctx->builder, src, vec_type, ""); 3600 ret = LLVMGetUndef(vec_type); 3601 for (unsigned i = 0; i < bits / 32; i++) { 3602 src = LLVMBuildExtractElement(ctx->builder, src_vector, 3603 LLVMConstInt(ctx->i32, i, 3604 0), ""); 3605 LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src, 3606 mask); 3607 ret = LLVMBuildInsertElement(ctx->builder, ret, 3608 ret_comp, 3609 LLVMConstInt(ctx->i32, i, 3610 0), ""); 3611 } 3612 } 3613 return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); 3614} 3615 3616static LLVMValueRef 3617ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src) 3618{ 3619 char name[32], type[8]; 3620 ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type)); 3621 snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type); 3622 return ac_build_intrinsic(ctx, name, LLVMTypeOf(src), 3623 (LLVMValueRef []) { src }, 1, 3624 AC_FUNC_ATTR_READNONE); 3625} 3626 3627static LLVMValueRef 3628ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src, 3629 LLVMValueRef inactive) 3630{ 3631 char name[33], type[8]; 3632 
LLVMTypeRef src_type = LLVMTypeOf(src); 3633 src = ac_to_integer(ctx, src); 3634 inactive = ac_to_integer(ctx, inactive); 3635 ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type)); 3636 snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type); 3637 LLVMValueRef ret = 3638 ac_build_intrinsic(ctx, name, 3639 LLVMTypeOf(src), (LLVMValueRef []) { 3640 src, inactive }, 2, 3641 AC_FUNC_ATTR_READNONE | 3642 AC_FUNC_ATTR_CONVERGENT); 3643 return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); 3644} 3645 3646static LLVMValueRef 3647get_reduction_identity(struct ac_llvm_context *ctx, nir_op op, unsigned type_size) 3648{ 3649 if (type_size == 4) { 3650 switch (op) { 3651 case nir_op_iadd: return ctx->i32_0; 3652 case nir_op_fadd: return ctx->f32_0; 3653 case nir_op_imul: return ctx->i32_1; 3654 case nir_op_fmul: return ctx->f32_1; 3655 case nir_op_imin: return LLVMConstInt(ctx->i32, INT32_MAX, 0); 3656 case nir_op_umin: return LLVMConstInt(ctx->i32, UINT32_MAX, 0); 3657 case nir_op_fmin: return LLVMConstReal(ctx->f32, INFINITY); 3658 case nir_op_imax: return LLVMConstInt(ctx->i32, INT32_MIN, 0); 3659 case nir_op_umax: return ctx->i32_0; 3660 case nir_op_fmax: return LLVMConstReal(ctx->f32, -INFINITY); 3661 case nir_op_iand: return LLVMConstInt(ctx->i32, -1, 0); 3662 case nir_op_ior: return ctx->i32_0; 3663 case nir_op_ixor: return ctx->i32_0; 3664 default: 3665 unreachable("bad reduction intrinsic"); 3666 } 3667 } else { /* type_size == 64bit */ 3668 switch (op) { 3669 case nir_op_iadd: return ctx->i64_0; 3670 case nir_op_fadd: return ctx->f64_0; 3671 case nir_op_imul: return ctx->i64_1; 3672 case nir_op_fmul: return ctx->f64_1; 3673 case nir_op_imin: return LLVMConstInt(ctx->i64, INT64_MAX, 0); 3674 case nir_op_umin: return LLVMConstInt(ctx->i64, UINT64_MAX, 0); 3675 case nir_op_fmin: return LLVMConstReal(ctx->f64, INFINITY); 3676 case nir_op_imax: return LLVMConstInt(ctx->i64, INT64_MIN, 0); 3677 case nir_op_umax: return ctx->i64_0; 3678 case 
nir_op_fmax: return LLVMConstReal(ctx->f64, -INFINITY); 3679 case nir_op_iand: return LLVMConstInt(ctx->i64, -1, 0); 3680 case nir_op_ior: return ctx->i64_0; 3681 case nir_op_ixor: return ctx->i64_0; 3682 default: 3683 unreachable("bad reduction intrinsic"); 3684 } 3685 } 3686} 3687 3688static LLVMValueRef 3689ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs, nir_op op) 3690{ 3691 bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8; 3692 switch (op) { 3693 case nir_op_iadd: return LLVMBuildAdd(ctx->builder, lhs, rhs, ""); 3694 case nir_op_fadd: return LLVMBuildFAdd(ctx->builder, lhs, rhs, ""); 3695 case nir_op_imul: return LLVMBuildMul(ctx->builder, lhs, rhs, ""); 3696 case nir_op_fmul: return LLVMBuildFMul(ctx->builder, lhs, rhs, ""); 3697 case nir_op_imin: return LLVMBuildSelect(ctx->builder, 3698 LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""), 3699 lhs, rhs, ""); 3700 case nir_op_umin: return LLVMBuildSelect(ctx->builder, 3701 LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""), 3702 lhs, rhs, ""); 3703 case nir_op_fmin: return ac_build_intrinsic(ctx, 3704 _64bit ? "llvm.minnum.f64" : "llvm.minnum.f32", 3705 _64bit ? ctx->f64 : ctx->f32, 3706 (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE); 3707 case nir_op_imax: return LLVMBuildSelect(ctx->builder, 3708 LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""), 3709 lhs, rhs, ""); 3710 case nir_op_umax: return LLVMBuildSelect(ctx->builder, 3711 LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""), 3712 lhs, rhs, ""); 3713 case nir_op_fmax: return ac_build_intrinsic(ctx, 3714 _64bit ? "llvm.maxnum.f64" : "llvm.maxnum.f32", 3715 _64bit ? 
ctx->f64 : ctx->f32, 3716 (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE); 3717 case nir_op_iand: return LLVMBuildAnd(ctx->builder, lhs, rhs, ""); 3718 case nir_op_ior: return LLVMBuildOr(ctx->builder, lhs, rhs, ""); 3719 case nir_op_ixor: return LLVMBuildXor(ctx->builder, lhs, rhs, ""); 3720 default: 3721 unreachable("bad reduction intrinsic"); 3722 } 3723} 3724 3725/** 3726 * \param maxprefix specifies that the result only needs to be correct for a 3727 * prefix of this many threads 3728 * 3729 * TODO: add inclusive and excluse scan functions for SI chip class. 3730 */ 3731static LLVMValueRef 3732ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity, 3733 unsigned maxprefix) 3734{ 3735 LLVMValueRef result, tmp; 3736 result = src; 3737 if (maxprefix <= 1) 3738 return result; 3739 tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false); 3740 result = ac_build_alu_op(ctx, result, tmp, op); 3741 if (maxprefix <= 2) 3742 return result; 3743 tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false); 3744 result = ac_build_alu_op(ctx, result, tmp, op); 3745 if (maxprefix <= 3) 3746 return result; 3747 tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false); 3748 result = ac_build_alu_op(ctx, result, tmp, op); 3749 if (maxprefix <= 4) 3750 return result; 3751 tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false); 3752 result = ac_build_alu_op(ctx, result, tmp, op); 3753 if (maxprefix <= 8) 3754 return result; 3755 tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false); 3756 result = ac_build_alu_op(ctx, result, tmp, op); 3757 if (maxprefix <= 16) 3758 return result; 3759 tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false); 3760 result = ac_build_alu_op(ctx, result, tmp, op); 3761 if (maxprefix <= 32) 3762 return result; 3763 tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false); 3764 result = 
ac_build_alu_op(ctx, result, tmp, op); 3765 return result; 3766} 3767 3768LLVMValueRef 3769ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op) 3770{ 3771 LLVMValueRef result; 3772 3773 if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) { 3774 LLVMBuilderRef builder = ctx->builder; 3775 src = LLVMBuildZExt(builder, src, ctx->i32, ""); 3776 result = ac_build_ballot(ctx, src); 3777 result = ac_build_mbcnt(ctx, result); 3778 result = LLVMBuildAdd(builder, result, src, ""); 3779 return result; 3780 } 3781 3782 ac_build_optimization_barrier(ctx, &src); 3783 3784 LLVMValueRef identity = 3785 get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src))); 3786 result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity), 3787 LLVMTypeOf(identity), ""); 3788 result = ac_build_scan(ctx, op, result, identity, 64); 3789 3790 return ac_build_wwm(ctx, result); 3791} 3792 3793LLVMValueRef 3794ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op) 3795{ 3796 LLVMValueRef result; 3797 3798 if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) { 3799 LLVMBuilderRef builder = ctx->builder; 3800 src = LLVMBuildZExt(builder, src, ctx->i32, ""); 3801 result = ac_build_ballot(ctx, src); 3802 result = ac_build_mbcnt(ctx, result); 3803 return result; 3804 } 3805 3806 ac_build_optimization_barrier(ctx, &src); 3807 3808 LLVMValueRef identity = 3809 get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src))); 3810 result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity), 3811 LLVMTypeOf(identity), ""); 3812 result = ac_build_dpp(ctx, identity, result, dpp_wf_sr1, 0xf, 0xf, false); 3813 result = ac_build_scan(ctx, op, result, identity, 64); 3814 3815 return ac_build_wwm(ctx, result); 3816} 3817 3818LLVMValueRef 3819ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size) 3820{ 3821 if (cluster_size == 1) return src; 3822 
ac_build_optimization_barrier(ctx, &src); 3823 LLVMValueRef result, swap; 3824 LLVMValueRef identity = get_reduction_identity(ctx, op, 3825 ac_get_type_size(LLVMTypeOf(src))); 3826 result = LLVMBuildBitCast(ctx->builder, 3827 ac_build_set_inactive(ctx, src, identity), 3828 LLVMTypeOf(identity), ""); 3829 swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2); 3830 result = ac_build_alu_op(ctx, result, swap, op); 3831 if (cluster_size == 2) return ac_build_wwm(ctx, result); 3832 3833 swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1); 3834 result = ac_build_alu_op(ctx, result, swap, op); 3835 if (cluster_size == 4) return ac_build_wwm(ctx, result); 3836 3837 if (ctx->chip_class >= VI) 3838 swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false); 3839 else 3840 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04)); 3841 result = ac_build_alu_op(ctx, result, swap, op); 3842 if (cluster_size == 8) return ac_build_wwm(ctx, result); 3843 3844 if (ctx->chip_class >= VI) 3845 swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false); 3846 else 3847 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08)); 3848 result = ac_build_alu_op(ctx, result, swap, op); 3849 if (cluster_size == 16) return ac_build_wwm(ctx, result); 3850 3851 if (ctx->chip_class >= VI && cluster_size != 32) 3852 swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false); 3853 else 3854 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10)); 3855 result = ac_build_alu_op(ctx, result, swap, op); 3856 if (cluster_size == 32) return ac_build_wwm(ctx, result); 3857 3858 if (ctx->chip_class >= VI) { 3859 swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false); 3860 result = ac_build_alu_op(ctx, result, swap, op); 3861 result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0)); 3862 return ac_build_wwm(ctx, result); 3863 } else { 3864 swap = 
ac_build_readlane(ctx, result, ctx->i32_0); 3865 result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0)); 3866 result = ac_build_alu_op(ctx, result, swap, op); 3867 return ac_build_wwm(ctx, result); 3868 } 3869} 3870 3871/** 3872 * "Top half" of a scan that reduces per-wave values across an entire 3873 * workgroup. 3874 * 3875 * The source value must be present in the highest lane of the wave, and the 3876 * highest lane must be live. 3877 */ 3878void 3879ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) 3880{ 3881 if (ws->maxwaves <= 1) 3882 return; 3883 3884 const LLVMValueRef i32_63 = LLVMConstInt(ctx->i32, 63, false); 3885 LLVMBuilderRef builder = ctx->builder; 3886 LLVMValueRef tid = ac_get_thread_id(ctx); 3887 LLVMValueRef tmp; 3888 3889 tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, i32_63, ""); 3890 ac_build_ifcc(ctx, tmp, 1000); 3891 LLVMBuildStore(builder, ws->src, LLVMBuildGEP(builder, ws->scratch, &ws->waveidx, 1, "")); 3892 ac_build_endif(ctx, 1000); 3893} 3894 3895/** 3896 * "Bottom half" of a scan that reduces per-wave values across an entire 3897 * workgroup. 3898 * 3899 * The caller must place a barrier between the top and bottom halves. 
3900 */ 3901void 3902ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) 3903{ 3904 const LLVMTypeRef type = LLVMTypeOf(ws->src); 3905 const LLVMValueRef identity = 3906 get_reduction_identity(ctx, ws->op, ac_get_type_size(type)); 3907 3908 if (ws->maxwaves <= 1) { 3909 ws->result_reduce = ws->src; 3910 ws->result_inclusive = ws->src; 3911 ws->result_exclusive = identity; 3912 return; 3913 } 3914 assert(ws->maxwaves <= 32); 3915 3916 LLVMBuilderRef builder = ctx->builder; 3917 LLVMValueRef tid = ac_get_thread_id(ctx); 3918 LLVMBasicBlockRef bbs[2]; 3919 LLVMValueRef phivalues_scan[2]; 3920 LLVMValueRef tmp, tmp2; 3921 3922 bbs[0] = LLVMGetInsertBlock(builder); 3923 phivalues_scan[0] = LLVMGetUndef(type); 3924 3925 if (ws->enable_reduce) 3926 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, ""); 3927 else if (ws->enable_inclusive) 3928 tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, ""); 3929 else 3930 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, ""); 3931 ac_build_ifcc(ctx, tmp, 1001); 3932 { 3933 tmp = LLVMBuildLoad(builder, LLVMBuildGEP(builder, ws->scratch, &tid, 1, ""), ""); 3934 3935 ac_build_optimization_barrier(ctx, &tmp); 3936 3937 bbs[1] = LLVMGetInsertBlock(builder); 3938 phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves); 3939 } 3940 ac_build_endif(ctx, 1001); 3941 3942 const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs); 3943 3944 if (ws->enable_reduce) { 3945 tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, ""); 3946 ws->result_reduce = ac_build_readlane(ctx, scan, tmp); 3947 } 3948 if (ws->enable_inclusive) 3949 ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx); 3950 if (ws->enable_exclusive) { 3951 tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, ""); 3952 tmp = ac_build_readlane(ctx, scan, tmp); 3953 tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, ""); 3954 ws->result_exclusive = 
LLVMBuildSelect(builder, tmp2, identity, tmp, ""); 3955 } 3956} 3957 3958/** 3959 * Inclusive scan of a per-wave value across an entire workgroup. 3960 * 3961 * This implies an s_barrier instruction. 3962 * 3963 * Unlike ac_build_inclusive_scan, the caller \em must ensure that all threads 3964 * of the workgroup are live. (This requirement cannot easily be relaxed in a 3965 * useful manner because of the barrier in the algorithm.) 3966 */ 3967void 3968ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) 3969{ 3970 ac_build_wg_wavescan_top(ctx, ws); 3971 ac_build_s_barrier(ctx); 3972 ac_build_wg_wavescan_bottom(ctx, ws); 3973} 3974 3975/** 3976 * "Top half" of a scan that reduces per-thread values across an entire 3977 * workgroup. 3978 * 3979 * All lanes must be active when this code runs. 3980 */ 3981void 3982ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) 3983{ 3984 if (ws->enable_exclusive) { 3985 ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op); 3986 if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd) 3987 ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, ""); 3988 ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op); 3989 } else { 3990 ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op); 3991 } 3992 3993 bool enable_inclusive = ws->enable_inclusive; 3994 bool enable_exclusive = ws->enable_exclusive; 3995 ws->enable_inclusive = false; 3996 ws->enable_exclusive = ws->enable_exclusive || enable_inclusive; 3997 ac_build_wg_wavescan_top(ctx, ws); 3998 ws->enable_inclusive = enable_inclusive; 3999 ws->enable_exclusive = enable_exclusive; 4000} 4001 4002/** 4003 * "Bottom half" of a scan that reduces per-thread values across an entire 4004 * workgroup. 4005 * 4006 * The caller must place a barrier between the top and bottom halves. 
4007 */ 4008void 4009ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) 4010{ 4011 bool enable_inclusive = ws->enable_inclusive; 4012 bool enable_exclusive = ws->enable_exclusive; 4013 ws->enable_inclusive = false; 4014 ws->enable_exclusive = ws->enable_exclusive || enable_inclusive; 4015 ac_build_wg_wavescan_bottom(ctx, ws); 4016 ws->enable_inclusive = enable_inclusive; 4017 ws->enable_exclusive = enable_exclusive; 4018 4019 /* ws->result_reduce is already the correct value */ 4020 if (ws->enable_inclusive) 4021 ws->result_inclusive = ac_build_alu_op(ctx, ws->result_inclusive, ws->src, ws->op); 4022 if (ws->enable_exclusive) 4023 ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op); 4024} 4025 4026/** 4027 * A scan that reduces per-thread values across an entire workgroup. 4028 * 4029 * The caller must ensure that all lanes are active when this code runs 4030 * (WWM is insufficient!), because there is an implied barrier. 4031 */ 4032void 4033ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) 4034{ 4035 ac_build_wg_scan_top(ctx, ws); 4036 ac_build_s_barrier(ctx); 4037 ac_build_wg_scan_bottom(ctx, ws); 4038} 4039 4040LLVMValueRef 4041ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, 4042 unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3) 4043{ 4044 unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3); 4045 if (ctx->chip_class >= VI) { 4046 return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false); 4047 } else { 4048 return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask); 4049 } 4050} 4051 4052LLVMValueRef 4053ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index) 4054{ 4055 index = LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), ""); 4056 return ac_build_intrinsic(ctx, 4057 "llvm.amdgcn.ds.bpermute", ctx->i32, 4058 (LLVMValueRef []) {index, src}, 2, 4059 AC_FUNC_ATTR_READNONE | 4060 AC_FUNC_ATTR_CONVERGENT); 
4061} 4062 4063LLVMValueRef 4064ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0, 4065 unsigned bitsize) 4066{ 4067 LLVMTypeRef type; 4068 char *intr; 4069 4070 if (bitsize == 16) { 4071 intr = "llvm.amdgcn.frexp.exp.i16.f16"; 4072 type = ctx->i16; 4073 } else if (bitsize == 32) { 4074 intr = "llvm.amdgcn.frexp.exp.i32.f32"; 4075 type = ctx->i32; 4076 } else { 4077 intr = "llvm.amdgcn.frexp.exp.i32.f64"; 4078 type = ctx->i32; 4079 } 4080 4081 LLVMValueRef params[] = { 4082 src0, 4083 }; 4084 return ac_build_intrinsic(ctx, intr, type, params, 1, 4085 AC_FUNC_ATTR_READNONE); 4086} 4087LLVMValueRef 4088ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0, 4089 unsigned bitsize) 4090{ 4091 LLVMTypeRef type; 4092 char *intr; 4093 4094 if (bitsize == 16) { 4095 intr = "llvm.amdgcn.frexp.mant.f16"; 4096 type = ctx->f16; 4097 } else if (bitsize == 32) { 4098 intr = "llvm.amdgcn.frexp.mant.f32"; 4099 type = ctx->f32; 4100 } else { 4101 intr = "llvm.amdgcn.frexp.mant.f64"; 4102 type = ctx->f64; 4103 } 4104 4105 LLVMValueRef params[] = { 4106 src0, 4107 }; 4108 return ac_build_intrinsic(ctx, intr, type, params, 1, 4109 AC_FUNC_ATTR_READNONE); 4110} 4111 4112/* 4113 * this takes an I,J coordinate pair, 4114 * and works out the X and Y derivatives. 4115 * it returns DDX(I), DDX(J), DDY(I), DDY(J). 
4116 */ 4117LLVMValueRef 4118ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij) 4119{ 4120 LLVMValueRef result[4], a; 4121 unsigned i; 4122 4123 for (i = 0; i < 2; i++) { 4124 a = LLVMBuildExtractElement(ctx->builder, interp_ij, 4125 LLVMConstInt(ctx->i32, i, false), ""); 4126 result[i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, a); 4127 result[2+i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 2, a); 4128 } 4129 return ac_build_gather_values(ctx, result, 4); 4130} 4131 4132LLVMValueRef 4133ac_build_load_helper_invocation(struct ac_llvm_context *ctx) 4134{ 4135 LLVMValueRef result = ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", 4136 ctx->i1, NULL, 0, 4137 AC_FUNC_ATTR_READNONE); 4138 result = LLVMBuildNot(ctx->builder, result, ""); 4139 return LLVMBuildSExt(ctx->builder, result, ctx->i32, ""); 4140} 4141