1/* 2 * Copyright 2016 Advanced Micro Devices, Inc. 3 * All Rights Reserved. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * on the rights to use, copy, modify, merge, publish, distribute, sub 9 * license, and/or sell copies of the Software, and to permit persons to whom 10 * the Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 22 * USE OR OTHER DEALINGS IN THE SOFTWARE. 23 */ 24 25#include "ac_exp_param.h" 26#include "ac_nir_to_llvm.h" 27#include "ac_rtld.h" 28#include "si_pipe.h" 29#include "si_shader_internal.h" 30#include "sid.h" 31#include "tgsi/tgsi_from_mesa.h" 32#include "util/u_memory.h" 33 34struct si_llvm_diagnostics { 35 struct pipe_debug_callback *debug; 36 unsigned retval; 37}; 38 39static void si_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context) 40{ 41 struct si_llvm_diagnostics *diag = (struct si_llvm_diagnostics *)context; 42 LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di); 43 const char *severity_str = NULL; 44 45 switch (severity) { 46 case LLVMDSError: 47 severity_str = "error"; 48 break; 49 case LLVMDSWarning: 50 severity_str = "warning"; 51 break; 52 case LLVMDSRemark: 53 case LLVMDSNote: 54 default: 55 return; 56 } 57 58 char *description = LLVMGetDiagInfoDescription(di); 59 60 pipe_debug_message(diag->debug, SHADER_INFO, "LLVM diagnostic (%s): %s", severity_str, 61 description); 62 63 if (severity == LLVMDSError) { 64 diag->retval = 1; 65 fprintf(stderr, "LLVM triggered Diagnostic Handler: %s\n", description); 66 } 67 68 LLVMDisposeMessage(description); 69} 70 71bool si_compile_llvm(struct si_screen *sscreen, struct si_shader_binary *binary, 72 struct ac_shader_config *conf, struct ac_llvm_compiler *compiler, 73 struct ac_llvm_context *ac, struct pipe_debug_callback *debug, 74 gl_shader_stage stage, const char *name, bool less_optimized) 75{ 76 unsigned count = p_atomic_inc_return(&sscreen->num_compilations); 77 78 if (si_can_dump_shader(sscreen, stage)) { 79 fprintf(stderr, "radeonsi: Compiling shader %d\n", count); 80 81 if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) { 82 fprintf(stderr, "%s LLVM IR:\n\n", name); 83 ac_dump_module(ac->module); 84 fprintf(stderr, "\n"); 85 } 86 } 87 88 if (sscreen->record_llvm_ir) { 89 char *ir = LLVMPrintModuleToString(ac->module); 90 binary->llvm_ir_string = strdup(ir); 91 LLVMDisposeMessage(ir); 92 } 93 94 if (!si_replace_shader(count, binary)) { 95 struct ac_compiler_passes *passes = compiler->passes; 96 97 if (less_optimized && compiler->low_opt_passes) 98 passes = compiler->low_opt_passes; 99 100 struct si_llvm_diagnostics diag = {debug}; 101 LLVMContextSetDiagnosticHandler(ac->context, si_diagnostic_handler, &diag); 102 103 if (!ac_compile_module_to_elf(passes, ac->module, (char **)&binary->elf_buffer, 104 &binary->elf_size)) 105 diag.retval = 1; 106 107 if (diag.retval != 0) { 108 pipe_debug_message(debug, SHADER_INFO, "LLVM compilation failed"); 109 return false; 110 } 111 } 112 113 struct ac_rtld_binary rtld; 114 if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){ 115 .info = &sscreen->info, 116 .shader_type = stage, 117 .wave_size = ac->wave_size, 118 .num_parts = 1, 119 .elf_ptrs = &binary->elf_buffer, 120 .elf_sizes = &binary->elf_size})) 121 return false; 122 123 bool ok = ac_rtld_read_config(&sscreen->info, &rtld, conf); 124 ac_rtld_close(&rtld); 125 return ok; 126} 127 128void si_llvm_context_init(struct si_shader_context *ctx, struct si_screen *sscreen, 129 struct ac_llvm_compiler *compiler, unsigned wave_size) 130{ 131 memset(ctx, 0, sizeof(*ctx)); 132 ctx->screen = sscreen; 133 ctx->compiler = compiler; 134 135 ac_llvm_context_init(&ctx->ac, compiler, sscreen->info.chip_class, sscreen->info.family, 136 &sscreen->info, AC_FLOAT_MODE_DEFAULT_OPENGL, wave_size, 64); 137} 138 139void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTypeRef *return_types, 140 unsigned num_return_elems, unsigned max_workgroup_size) 141{ 142 LLVMTypeRef ret_type; 143 enum ac_llvm_calling_convention call_conv; 144 145 if (num_return_elems) 146 ret_type = LLVMStructTypeInContext(ctx->ac.context, return_types, num_return_elems, true); 147 else 148 ret_type = ctx->ac.voidt; 149 150 gl_shader_stage real_stage = ctx->stage; 151 152 /* LS is merged into HS (TCS), and ES is merged into GS. */ 153 if (ctx->screen->info.chip_class >= GFX9) { 154 if (ctx->shader->key.as_ls) 155 real_stage = MESA_SHADER_TESS_CTRL; 156 else if (ctx->shader->key.as_es || ctx->shader->key.as_ngg) 157 real_stage = MESA_SHADER_GEOMETRY; 158 } 159 160 switch (real_stage) { 161 case MESA_SHADER_VERTEX: 162 case MESA_SHADER_TESS_EVAL: 163 call_conv = AC_LLVM_AMDGPU_VS; 164 break; 165 case MESA_SHADER_TESS_CTRL: 166 call_conv = AC_LLVM_AMDGPU_HS; 167 break; 168 case MESA_SHADER_GEOMETRY: 169 call_conv = AC_LLVM_AMDGPU_GS; 170 break; 171 case MESA_SHADER_FRAGMENT: 172 call_conv = AC_LLVM_AMDGPU_PS; 173 break; 174 case MESA_SHADER_COMPUTE: 175 call_conv = AC_LLVM_AMDGPU_CS; 176 break; 177 default: 178 unreachable("Unhandle shader type"); 179 } 180 181 /* Setup the function */ 182 ctx->return_type = ret_type; 183 ctx->main_fn = ac_build_main(&ctx->args, &ctx->ac, call_conv, name, ret_type, ctx->ac.module); 184 ctx->return_value = LLVMGetUndef(ctx->return_type); 185 186 if (ctx->screen->info.address32_hi) { 187 ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-32bit-address-high-bits", 188 ctx->screen->info.address32_hi); 189 } 190 191 ac_llvm_set_workgroup_size(ctx->main_fn, max_workgroup_size); 192 ac_llvm_set_target_features(ctx->main_fn, &ctx->ac); 193} 194 195void si_llvm_create_main_func(struct si_shader_context *ctx, bool ngg_cull_shader) 196{ 197 struct si_shader *shader = ctx->shader; 198 LLVMTypeRef returns[AC_MAX_ARGS]; 199 unsigned i; 200 201 si_init_shader_args(ctx, ngg_cull_shader); 202 203 for (i = 0; i < ctx->args.num_sgprs_returned; i++) 204 returns[i] = ctx->ac.i32; /* SGPR */ 205 for (; i < ctx->args.return_count; i++) 206 returns[i] = ctx->ac.f32; /* VGPR */ 207 208 si_llvm_create_func(ctx, ngg_cull_shader ? "ngg_cull_main" : "main", returns, 209 ctx->args.return_count, si_get_max_workgroup_size(shader)); 210 211 /* Reserve register locations for VGPR inputs the PS prolog may need. */ 212 if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->shader->is_monolithic) { 213 ac_llvm_add_target_dep_function_attr( 214 ctx->main_fn, "InitialPSInputAddr", 215 S_0286D0_PERSP_SAMPLE_ENA(1) | S_0286D0_PERSP_CENTER_ENA(1) | 216 S_0286D0_PERSP_CENTROID_ENA(1) | S_0286D0_LINEAR_SAMPLE_ENA(1) | 217 S_0286D0_LINEAR_CENTER_ENA(1) | S_0286D0_LINEAR_CENTROID_ENA(1) | 218 S_0286D0_FRONT_FACE_ENA(1) | S_0286D0_ANCILLARY_ENA(1) | S_0286D0_POS_FIXED_PT_ENA(1)); 219 } 220 221 222 if (shader->key.as_ls || ctx->stage == MESA_SHADER_TESS_CTRL) { 223 if (USE_LDS_SYMBOLS) { 224 /* The LSHS size is not known until draw time, so we append it 225 * at the end of whatever LDS use there may be in the rest of 226 * the shader (currently none, unless LLVM decides to do its 227 * own LDS-based lowering). 228 */ 229 ctx->ac.lds = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), 230 "__lds_end", AC_ADDR_SPACE_LDS); 231 LLVMSetAlignment(ctx->ac.lds, 256); 232 } else { 233 ac_declare_lds_as_pointer(&ctx->ac); 234 } 235 } 236 237 /* Unlike radv, we override these arguments in the prolog, so to the 238 * API shader they appear as normal arguments. 239 */ 240 if (ctx->stage == MESA_SHADER_VERTEX) { 241 ctx->abi.vertex_id = ac_get_arg(&ctx->ac, ctx->args.vertex_id); 242 ctx->abi.instance_id = ac_get_arg(&ctx->ac, ctx->args.instance_id); 243 } else if (ctx->stage == MESA_SHADER_FRAGMENT) { 244 ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args.persp_centroid); 245 ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args.linear_centroid); 246 } 247} 248 249void si_llvm_optimize_module(struct si_shader_context *ctx) 250{ 251 /* Dump LLVM IR before any optimization passes */ 252 if (ctx->screen->debug_flags & DBG(PREOPT_IR) && si_can_dump_shader(ctx->screen, ctx->stage)) 253 LLVMDumpModule(ctx->ac.module); 254 255 /* Run the pass */ 256 LLVMRunPassManager(ctx->compiler->passmgr, ctx->ac.module); 257 LLVMDisposeBuilder(ctx->ac.builder); 258} 259 260void si_llvm_dispose(struct si_shader_context *ctx) 261{ 262 LLVMDisposeModule(ctx->ac.module); 263 LLVMContextDispose(ctx->ac.context); 264 ac_llvm_context_dispose(&ctx->ac); 265} 266 267/** 268 * Load a dword from a constant buffer. 269 */ 270LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx, LLVMValueRef resource, 271 LLVMValueRef offset) 272{ 273 return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL, 0, ctx->ac.f32, 274 0, true, true); 275} 276 277void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret) 278{ 279 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) 280 LLVMBuildRetVoid(ctx->ac.builder); 281 else 282 LLVMBuildRet(ctx->ac.builder, ret); 283} 284 285LLVMValueRef si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret, 286 struct ac_arg param, unsigned return_index) 287{ 288 return LLVMBuildInsertValue(ctx->ac.builder, ret, ac_get_arg(&ctx->ac, param), return_index, ""); 289} 290 291LLVMValueRef si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret, 292 struct ac_arg param, unsigned return_index) 293{ 294 LLVMBuilderRef builder = ctx->ac.builder; 295 LLVMValueRef p = ac_get_arg(&ctx->ac, param); 296 297 return LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, p), return_index, ""); 298} 299 300LLVMValueRef si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret, 301 struct ac_arg param, unsigned return_index) 302{ 303 LLVMBuilderRef builder = ctx->ac.builder; 304 LLVMValueRef ptr = ac_get_arg(&ctx->ac, param); 305 ptr = LLVMBuildPtrToInt(builder, ptr, ctx->ac.i32, ""); 306 return LLVMBuildInsertValue(builder, ret, ptr, return_index, ""); 307} 308 309LLVMValueRef si_prolog_get_internal_bindings(struct si_shader_context *ctx) 310{ 311 LLVMValueRef ptr[2], list; 312 bool merged_shader = si_is_merged_shader(ctx->shader); 313 314 ptr[0] = LLVMGetParam(ctx->main_fn, (merged_shader ? 8 : 0) + SI_SGPR_INTERNAL_BINDINGS); 315 list = 316 LLVMBuildIntToPtr(ctx->ac.builder, ptr[0], ac_array_in_const32_addr_space(ctx->ac.v4i32), ""); 317 return list; 318} 319 320void si_llvm_emit_barrier(struct si_shader_context *ctx) 321{ 322 /* GFX6 only (thanks to a hw bug workaround): 323 * The real barrier instruction isn’t needed, because an entire patch 324 * always fits into a single wave. 325 */ 326 if (ctx->screen->info.chip_class == GFX6 && ctx->stage == MESA_SHADER_TESS_CTRL) { 327 ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE); 328 return; 329 } 330 331 ac_build_s_barrier(&ctx->ac); 332} 333 334/* Ensure that the esgs ring is declared. 335 * 336 * We declare it with 64KB alignment as a hint that the 337 * pointer value will always be 0. 338 */ 339void si_llvm_declare_esgs_ring(struct si_shader_context *ctx) 340{ 341 if (ctx->esgs_ring) 342 return; 343 344 assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring")); 345 346 ctx->esgs_ring = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), 347 "esgs_ring", AC_ADDR_SPACE_LDS); 348 LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage); 349 LLVMSetAlignment(ctx->esgs_ring, 64 * 1024); 350} 351 352static void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param, 353 unsigned bitoffset) 354{ 355 LLVMValueRef args[] = { 356 ac_get_arg(&ctx->ac, param), 357 LLVMConstInt(ctx->ac.i32, bitoffset, 0), 358 }; 359 ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.init.exec.from.input", ctx->ac.voidt, args, 2, 360 AC_FUNC_ATTR_CONVERGENT); 361} 362 363/** 364 * Get the value of a shader input parameter and extract a bitfield. 365 */ 366static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx, LLVMValueRef value, 367 unsigned rshift, unsigned bitwidth) 368{ 369 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind) 370 value = ac_to_integer(&ctx->ac, value); 371 372 if (rshift) 373 value = LLVMBuildLShr(ctx->ac.builder, value, LLVMConstInt(ctx->ac.i32, rshift, 0), ""); 374 375 if (rshift + bitwidth < 32) { 376 unsigned mask = (1 << bitwidth) - 1; 377 value = LLVMBuildAnd(ctx->ac.builder, value, LLVMConstInt(ctx->ac.i32, mask, 0), ""); 378 } 379 380 return value; 381} 382 383LLVMValueRef si_unpack_param(struct si_shader_context *ctx, struct ac_arg param, unsigned rshift, 384 unsigned bitwidth) 385{ 386 LLVMValueRef value = ac_get_arg(&ctx->ac, param); 387 388 return unpack_llvm_param(ctx, value, rshift, bitwidth); 389} 390 391LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, unsigned swizzle) 392{ 393 if (swizzle > 0) 394 return ctx->ac.i32_0; 395 396 switch (ctx->stage) { 397 case MESA_SHADER_VERTEX: 398 return ac_get_arg(&ctx->ac, ctx->args.vs_prim_id); 399 case MESA_SHADER_TESS_CTRL: 400 return ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id); 401 case MESA_SHADER_TESS_EVAL: 402 return ac_get_arg(&ctx->ac, ctx->args.tes_patch_id); 403 case MESA_SHADER_GEOMETRY: 404 return ac_get_arg(&ctx->ac, ctx->args.gs_prim_id); 405 default: 406 assert(0); 407 return ctx->ac.i32_0; 408 } 409} 410 411static LLVMValueRef si_llvm_get_block_size(struct ac_shader_abi *abi) 412{ 413 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 414 415 assert(ctx->shader->selector->info.base.workgroup_size_variable && 416 ctx->shader->selector->info.uses_variable_block_size); 417 418 LLVMValueRef chan[3] = { 419 si_unpack_param(ctx, ctx->block_size, 0, 10), 420 si_unpack_param(ctx, ctx->block_size, 10, 10), 421 si_unpack_param(ctx, ctx->block_size, 20, 10), 422 }; 423 return ac_build_gather_values(&ctx->ac, chan, 3); 424} 425 426static void si_llvm_declare_compute_memory(struct si_shader_context *ctx) 427{ 428 struct si_shader_selector *sel = ctx->shader->selector; 429 unsigned lds_size = sel->info.base.shared_size; 430 431 LLVMTypeRef i8p = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS); 432 LLVMValueRef var; 433 434 assert(!ctx->ac.lds); 435 436 var = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i8, lds_size), 437 "compute_lds", AC_ADDR_SPACE_LDS); 438 LLVMSetAlignment(var, 64 * 1024); 439 440 ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, ""); 441} 442 443static bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir) 444{ 445 if (nir->info.stage == MESA_SHADER_FRAGMENT) { 446 unsigned colors_read = ctx->shader->selector->info.colors_read; 447 LLVMValueRef main_fn = ctx->main_fn; 448 449 LLVMValueRef undef = LLVMGetUndef(ctx->ac.f32); 450 451 unsigned offset = SI_PARAM_POS_FIXED_PT + 1; 452 453 if (colors_read & 0x0f) { 454 unsigned mask = colors_read & 0x0f; 455 LLVMValueRef values[4]; 456 values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef; 457 values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef; 458 values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef; 459 values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef; 460 ctx->abi.color0 = ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, 4)); 461 } 462 if (colors_read & 0xf0) { 463 unsigned mask = (colors_read & 0xf0) >> 4; 464 LLVMValueRef values[4]; 465 values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef; 466 values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef; 467 values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef; 468 values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef; 469 ctx->abi.color1 = ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, 4)); 470 } 471 472 ctx->abi.interp_at_sample_force_center = 473 ctx->shader->key.mono.u.ps.interpolate_at_sample_force_center; 474 475 ctx->abi.kill_ps_if_inf_interp = 476 ctx->screen->options.no_infinite_interp && 477 (ctx->shader->selector->info.uses_persp_center || 478 ctx->shader->selector->info.uses_persp_centroid || 479 ctx->shader->selector->info.uses_persp_sample); 480 481 } else if (nir->info.stage == MESA_SHADER_COMPUTE) { 482 if (nir->info.cs.user_data_components_amd) { 483 ctx->abi.user_data = ac_get_arg(&ctx->ac, ctx->cs_user_data); 484 ctx->abi.user_data = ac_build_expand_to_vec4(&ctx->ac, ctx->abi.user_data, 485 nir->info.cs.user_data_components_amd); 486 } 487 488 if (ctx->shader->selector->info.base.shared_size) 489 si_llvm_declare_compute_memory(ctx); 490 } 491 492 ctx->abi.clamp_shadow_reference = true; 493 ctx->abi.robust_buffer_access = true; 494 ctx->abi.convert_undef_to_zero = true; 495 ctx->abi.clamp_div_by_zero = ctx->screen->options.clamp_div_by_zero; 496 ctx->abi.adjust_frag_coord_z = false; 497 498 const struct si_shader_info *info = &ctx->shader->selector->info; 499 for (unsigned i = 0; i < info->num_outputs; i++) { 500 LLVMTypeRef type = ctx->ac.f32; 501 502 /* Only FS uses unpacked f16. Other stages pack 16-bit outputs into low and high bits of f32. */ 503 if (nir->info.stage == MESA_SHADER_FRAGMENT && 504 nir_alu_type_get_type_size(ctx->shader->selector->info.output_type[i]) == 16) 505 type = ctx->ac.f16; 506 507 for (unsigned j = 0; j < 4; j++) 508 ctx->abi.outputs[i * 4 + j] = ac_build_alloca_undef(&ctx->ac, type, ""); 509 } 510 511 ac_nir_translate(&ctx->ac, &ctx->abi, &ctx->args, nir); 512 513 return true; 514} 515 516/** 517 * Given a list of shader part functions, build a wrapper function that 518 * runs them in sequence to form a monolithic shader. 519 */ 520void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts, 521 unsigned num_parts, unsigned main_part, 522 unsigned next_shader_first_part, bool same_thread_count) 523{ 524 LLVMBuilderRef builder = ctx->ac.builder; 525 /* PS epilog has one arg per color component; gfx9 merged shader 526 * prologs need to forward 40 SGPRs. 527 */ 528 LLVMValueRef initial[AC_MAX_ARGS], out[AC_MAX_ARGS]; 529 LLVMTypeRef function_type; 530 unsigned num_first_params; 531 unsigned num_out, initial_num_out; 532 ASSERTED unsigned num_out_sgpr; /* used in debug checks */ 533 ASSERTED unsigned initial_num_out_sgpr; /* used in debug checks */ 534 unsigned num_sgprs, num_vgprs; 535 unsigned gprs; 536 537 memset(&ctx->args, 0, sizeof(ctx->args)); 538 539 for (unsigned i = 0; i < num_parts; ++i) { 540 ac_add_function_attr(ctx->ac.context, parts[i], -1, AC_FUNC_ATTR_ALWAYSINLINE); 541 LLVMSetLinkage(parts[i], LLVMPrivateLinkage); 542 } 543 544 /* The parameters of the wrapper function correspond to those of the 545 * first part in terms of SGPRs and VGPRs, but we use the types of the 546 * main part to get the right types. This is relevant for the 547 * dereferenceable attribute on descriptor table pointers. 548 */ 549 num_sgprs = 0; 550 num_vgprs = 0; 551 552 function_type = LLVMGetElementType(LLVMTypeOf(parts[0])); 553 num_first_params = LLVMCountParamTypes(function_type); 554 555 for (unsigned i = 0; i < num_first_params; ++i) { 556 LLVMValueRef param = LLVMGetParam(parts[0], i); 557 558 if (ac_is_sgpr_param(param)) { 559 assert(num_vgprs == 0); 560 num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; 561 } else { 562 num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; 563 } 564 } 565 566 gprs = 0; 567 while (gprs < num_sgprs + num_vgprs) { 568 LLVMValueRef param = LLVMGetParam(parts[main_part], ctx->args.arg_count); 569 LLVMTypeRef type = LLVMTypeOf(param); 570 unsigned size = ac_get_type_size(type) / 4; 571 572 /* This is going to get casted anyways, so we don't have to 573 * have the exact same type. But we do have to preserve the 574 * pointer-ness so that LLVM knows about it. 575 */ 576 enum ac_arg_type arg_type = AC_ARG_INT; 577 if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) { 578 type = LLVMGetElementType(type); 579 580 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) { 581 if (LLVMGetVectorSize(type) == 4) 582 arg_type = AC_ARG_CONST_DESC_PTR; 583 else if (LLVMGetVectorSize(type) == 8) 584 arg_type = AC_ARG_CONST_IMAGE_PTR; 585 else 586 assert(0); 587 } else if (type == ctx->ac.f32) { 588 arg_type = AC_ARG_CONST_FLOAT_PTR; 589 } else { 590 assert(0); 591 } 592 } 593 594 ac_add_arg(&ctx->args, gprs < num_sgprs ? AC_ARG_SGPR : AC_ARG_VGPR, size, arg_type, NULL); 595 596 assert(ac_is_sgpr_param(param) == (gprs < num_sgprs)); 597 assert(gprs + size <= num_sgprs + num_vgprs && 598 (gprs >= num_sgprs || gprs + size <= num_sgprs)); 599 600 gprs += size; 601 } 602 603 /* Prepare the return type. */ 604 unsigned num_returns = 0; 605 LLVMTypeRef returns[AC_MAX_ARGS], last_func_type, return_type; 606 607 last_func_type = LLVMGetElementType(LLVMTypeOf(parts[num_parts - 1])); 608 return_type = LLVMGetReturnType(last_func_type); 609 610 switch (LLVMGetTypeKind(return_type)) { 611 case LLVMStructTypeKind: 612 num_returns = LLVMCountStructElementTypes(return_type); 613 assert(num_returns <= ARRAY_SIZE(returns)); 614 LLVMGetStructElementTypes(return_type, returns); 615 break; 616 case LLVMVoidTypeKind: 617 break; 618 default: 619 unreachable("unexpected type"); 620 } 621 622 si_llvm_create_func(ctx, "wrapper", returns, num_returns, 623 si_get_max_workgroup_size(ctx->shader)); 624 625 if (si_is_merged_shader(ctx->shader) && !same_thread_count) 626 ac_init_exec_full_mask(&ctx->ac); 627 628 /* Record the arguments of the function as if they were an output of 629 * a previous part. 630 */ 631 num_out = 0; 632 num_out_sgpr = 0; 633 634 for (unsigned i = 0; i < ctx->args.arg_count; ++i) { 635 LLVMValueRef param = LLVMGetParam(ctx->main_fn, i); 636 LLVMTypeRef param_type = LLVMTypeOf(param); 637 LLVMTypeRef out_type = ctx->args.args[i].file == AC_ARG_SGPR ? ctx->ac.i32 : ctx->ac.f32; 638 unsigned size = ac_get_type_size(param_type) / 4; 639 640 if (size == 1) { 641 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { 642 param = LLVMBuildPtrToInt(builder, param, ctx->ac.i32, ""); 643 param_type = ctx->ac.i32; 644 } 645 646 if (param_type != out_type) 647 param = LLVMBuildBitCast(builder, param, out_type, ""); 648 out[num_out++] = param; 649 } else { 650 LLVMTypeRef vector_type = LLVMVectorType(out_type, size); 651 652 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { 653 param = LLVMBuildPtrToInt(builder, param, ctx->ac.i64, ""); 654 param_type = ctx->ac.i64; 655 } 656 657 if (param_type != vector_type) 658 param = LLVMBuildBitCast(builder, param, vector_type, ""); 659 660 for (unsigned j = 0; j < size; ++j) 661 out[num_out++] = 662 LLVMBuildExtractElement(builder, param, LLVMConstInt(ctx->ac.i32, j, 0), ""); 663 } 664 665 if (ctx->args.args[i].file == AC_ARG_SGPR) 666 num_out_sgpr = num_out; 667 } 668 669 memcpy(initial, out, sizeof(out)); 670 initial_num_out = num_out; 671 initial_num_out_sgpr = num_out_sgpr; 672 673 /* Now chain the parts. */ 674 LLVMValueRef ret = NULL; 675 for (unsigned part = 0; part < num_parts; ++part) { 676 LLVMValueRef in[AC_MAX_ARGS]; 677 LLVMTypeRef ret_type; 678 unsigned out_idx = 0; 679 unsigned num_params = LLVMCountParams(parts[part]); 680 681 /* Merged shaders are executed conditionally depending 682 * on the number of enabled threads passed in the input SGPRs. */ 683 if (si_is_multi_part_shader(ctx->shader) && part == 0) { 684 if (same_thread_count) { 685 struct ac_arg arg; 686 arg.arg_index = 3; 687 arg.used = true; 688 689 si_init_exec_from_input(ctx, arg, 0); 690 } else { 691 LLVMValueRef ena, count = initial[3]; 692 693 count = LLVMBuildAnd(builder, count, LLVMConstInt(ctx->ac.i32, 0x7f, 0), ""); 694 ena = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), count, ""); 695 ac_build_ifcc(&ctx->ac, ena, 6506); 696 } 697 } 698 699 /* Derive arguments for the next part from outputs of the 700 * previous one. 701 */ 702 for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) { 703 LLVMValueRef param; 704 LLVMTypeRef param_type; 705 bool is_sgpr; 706 unsigned param_size; 707 LLVMValueRef arg = NULL; 708 709 param = LLVMGetParam(parts[part], param_idx); 710 param_type = LLVMTypeOf(param); 711 param_size = ac_get_type_size(param_type) / 4; 712 is_sgpr = ac_is_sgpr_param(param); 713 714 if (is_sgpr) { 715 ac_add_function_attr(ctx->ac.context, parts[part], param_idx + 1, AC_FUNC_ATTR_INREG); 716 } else if (out_idx < num_out_sgpr) { 717 /* Skip returned SGPRs the current part doesn't 718 * declare on the input. */ 719 out_idx = num_out_sgpr; 720 } 721 722 assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out)); 723 724 if (param_size == 1) 725 arg = out[out_idx]; 726 else 727 arg = ac_build_gather_values(&ctx->ac, &out[out_idx], param_size); 728 729 if (LLVMTypeOf(arg) != param_type) { 730 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { 731 if (LLVMGetPointerAddressSpace(param_type) == AC_ADDR_SPACE_CONST_32BIT) { 732 arg = LLVMBuildBitCast(builder, arg, ctx->ac.i32, ""); 733 arg = LLVMBuildIntToPtr(builder, arg, param_type, ""); 734 } else { 735 arg = LLVMBuildBitCast(builder, arg, ctx->ac.i64, ""); 736 arg = LLVMBuildIntToPtr(builder, arg, param_type, ""); 737 } 738 } else { 739 arg = LLVMBuildBitCast(builder, arg, param_type, ""); 740 } 741 } 742 743 in[param_idx] = arg; 744 out_idx += param_size; 745 } 746 747 ret = ac_build_call(&ctx->ac, parts[part], in, num_params); 748 749 if (!same_thread_count && 750 si_is_multi_part_shader(ctx->shader) && part + 1 == next_shader_first_part) { 751 ac_build_endif(&ctx->ac, 6506); 752 753 /* The second half of the merged shader should use 754 * the inputs from the toplevel (wrapper) function, 755 * not the return value from the last call. 756 * 757 * That's because the last call was executed condi- 758 * tionally, so we can't consume it in the main 759 * block. 760 */ 761 memcpy(out, initial, sizeof(initial)); 762 num_out = initial_num_out; 763 num_out_sgpr = initial_num_out_sgpr; 764 765 /* Execute the second shader conditionally based on the number of 766 * enabled threads there. 767 */ 768 if (ctx->stage == MESA_SHADER_TESS_CTRL) { 769 LLVMValueRef ena, count = initial[3]; 770 771 count = LLVMBuildLShr(builder, count, LLVMConstInt(ctx->ac.i32, 8, 0), ""); 772 count = LLVMBuildAnd(builder, count, LLVMConstInt(ctx->ac.i32, 0x7f, 0), ""); 773 ena = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), count, ""); 774 ac_build_ifcc(&ctx->ac, ena, 6507); 775 } 776 continue; 777 } 778 779 /* Extract the returned GPRs. */ 780 ret_type = LLVMTypeOf(ret); 781 num_out = 0; 782 num_out_sgpr = 0; 783 784 if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) { 785 assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind); 786 787 unsigned ret_size = LLVMCountStructElementTypes(ret_type); 788 789 for (unsigned i = 0; i < ret_size; ++i) { 790 LLVMValueRef val = LLVMBuildExtractValue(builder, ret, i, ""); 791 792 assert(num_out < ARRAY_SIZE(out)); 793 out[num_out++] = val; 794 795 if (LLVMTypeOf(val) == ctx->ac.i32) { 796 assert(num_out_sgpr + 1 == num_out); 797 num_out_sgpr = num_out; 798 } 799 } 800 } 801 } 802 803 /* Close the conditional wrapping the second shader. */ 804 if (ctx->stage == MESA_SHADER_TESS_CTRL && 805 !same_thread_count && si_is_multi_part_shader(ctx->shader)) 806 ac_build_endif(&ctx->ac, 6507); 807 808 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) 809 LLVMBuildRetVoid(builder); 810 else 811 LLVMBuildRet(builder, ret); 812} 813 814bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shader, 815 struct nir_shader *nir, bool free_nir, bool ngg_cull_shader) 816{ 817 struct si_shader_selector *sel = shader->selector; 818 const struct si_shader_info *info = &sel->info; 819 820 ctx->shader = shader; 821 ctx->stage = sel->info.stage; 822 823 ctx->num_const_buffers = info->base.num_ubos; 824 ctx->num_shader_buffers = info->base.num_ssbos; 825 826 ctx->num_samplers = BITSET_LAST_BIT(info->base.textures_used); 827 ctx->num_images = info->base.num_images; 828 829 si_llvm_init_resource_callbacks(ctx); 830 831 switch (ctx->stage) { 832 case MESA_SHADER_VERTEX: 833 si_llvm_init_vs_callbacks(ctx, ngg_cull_shader); 834 break; 835 case MESA_SHADER_TESS_CTRL: 836 si_llvm_init_tcs_callbacks(ctx); 837 break; 838 case MESA_SHADER_TESS_EVAL: 839 si_llvm_init_tes_callbacks(ctx, ngg_cull_shader); 840 break; 841 case MESA_SHADER_GEOMETRY: 842 si_llvm_init_gs_callbacks(ctx); 843 break; 844 case MESA_SHADER_FRAGMENT: 845 si_llvm_init_ps_callbacks(ctx); 846 break; 847 case MESA_SHADER_COMPUTE: 848 ctx->abi.load_local_group_size = si_llvm_get_block_size; 849 break; 850 default: 851 assert(!"Unsupported shader type"); 852 return false; 853 } 854 855 si_llvm_create_main_func(ctx, ngg_cull_shader); 856 857 if (ctx->shader->key.as_es || ctx->stage == MESA_SHADER_GEOMETRY) 858 si_preload_esgs_ring(ctx); 859 860 if (ctx->stage == MESA_SHADER_GEOMETRY) 861 si_preload_gs_rings(ctx); 862 else if (ctx->stage == MESA_SHADER_TESS_EVAL) 863 si_llvm_preload_tes_rings(ctx); 864 865 if (ctx->stage == MESA_SHADER_TESS_CTRL && sel->info.tessfactors_are_def_in_all_invocs) { 866 for (unsigned i = 0; i < 6; i++) { 867 ctx->invoc0_tess_factors[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); 868 } 869 } 870 871 if (ctx->stage == MESA_SHADER_GEOMETRY) { 872 for (unsigned i = 0; i < 4; i++) { 873 ctx->gs_next_vertex[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); 874 } 875 if (shader->key.as_ngg) { 876 for (unsigned i = 0; i < 4; ++i) { 877 ctx->gs_curprim_verts[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); 878 ctx->gs_generated_prims[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); 879 } 880 881 assert(!ctx->gs_ngg_scratch); 882 LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader)); 883 ctx->gs_ngg_scratch = 884 LLVMAddGlobalInAddressSpace(ctx->ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS); 885 LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(ai32)); 886 LLVMSetAlignment(ctx->gs_ngg_scratch, 4); 887 888 ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace( 889 ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS); 890 LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage); 891 LLVMSetAlignment(ctx->gs_ngg_emit, 4); 892 } 893 } 894 895 if (ctx->stage != MESA_SHADER_GEOMETRY && (shader->key.as_ngg && !shader->key.as_es)) { 896 /* Unconditionally declare scratch space base for streamout and 897 * vertex compaction. Whether space is actually allocated is 898 * determined during linking / PM4 creation. 899 */ 900 si_llvm_declare_esgs_ring(ctx); 901 902 /* This is really only needed when streamout and / or vertex 903 * compaction is enabled. 904 */ 905 if (!ctx->gs_ngg_scratch && (sel->so.num_outputs || shader->key.opt.ngg_culling)) { 906 LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader)); 907 ctx->gs_ngg_scratch = 908 LLVMAddGlobalInAddressSpace(ctx->ac.module, asi32, "ngg_scratch", AC_ADDR_SPACE_LDS); 909 LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(asi32)); 910 LLVMSetAlignment(ctx->gs_ngg_scratch, 4); 911 } 912 } 913 914 /* For merged shaders (VS-TCS, VS-GS, TES-GS): */ 915 if (ctx->screen->info.chip_class >= GFX9 && si_is_merged_shader(shader)) { 916 LLVMValueRef thread_enabled = NULL; 917 918 /* TES is special because it has only 1 shader part if NGG shader culling is disabled, 919 * and therefore it doesn't use the wrapper function. 920 */ 921 bool no_wrapper_func = ctx->stage == MESA_SHADER_TESS_EVAL && !shader->key.as_es && 922 !shader->key.opt.ngg_culling; 923 924 /* Set EXEC = ~0 before the first shader. If the prolog is present, EXEC is set there 925 * instead. For monolithic shaders, the wrapper function does this. 926 */ 927 if ((!shader->is_monolithic || no_wrapper_func) && 928 (ctx->stage == MESA_SHADER_TESS_EVAL || 929 (ctx->stage == MESA_SHADER_VERTEX && 930 !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, ngg_cull_shader)))) 931 ac_init_exec_full_mask(&ctx->ac); 932 933 /* NGG VS and NGG TES: Send gs_alloc_req and the prim export at the beginning to decrease 934 * register usage. 935 */ 936 if ((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) && 937 shader->key.as_ngg && !shader->key.as_es && !shader->key.opt.ngg_culling) { 938 /* GFX10 requires a barrier before gs_alloc_req due to a hw bug. */ 939 if (ctx->screen->info.chip_class == GFX10) 940 ac_build_s_barrier(&ctx->ac); 941 942 gfx10_ngg_build_sendmsg_gs_alloc_req(ctx); 943 944 /* Build the primitive export at the beginning 945 * of the shader if possible. 946 */ 947 if (gfx10_ngg_export_prim_early(shader)) 948 gfx10_ngg_build_export_prim(ctx, NULL, NULL); 949 } 950 951 /* NGG GS: Initialize LDS and insert s_barrier, which must not be inside the if statement. */ 952 if (ctx->stage == MESA_SHADER_GEOMETRY && shader->key.as_ngg) 953 gfx10_ngg_gs_emit_prologue(ctx); 954 955 if (ctx->stage == MESA_SHADER_GEOMETRY || 956 (ctx->stage == MESA_SHADER_TESS_CTRL && !shader->is_monolithic)) { 957 /* Wrap both shaders in an if statement according to the number of enabled threads 958 * there. For monolithic TCS, the if statement is inserted by the wrapper function, 959 * not here. 960 */ 961 thread_enabled = si_is_gs_thread(ctx); /* 2nd shader: thread enabled bool */ 962 } else if (((shader->key.as_ls || shader->key.as_es) && !shader->is_monolithic) || 963 (shader->key.as_ngg && !shader->key.as_es)) { 964 /* This is NGG VS or NGG TES or VS before GS or TES before GS or VS before TCS. 965 * For monolithic LS (VS before TCS) and ES (VS before GS and TES before GS), 966 * the if statement is inserted by the wrapper function. 967 */ 968 thread_enabled = si_is_es_thread(ctx); /* 1st shader: thread enabled bool */ 969 } 970 971 if (thread_enabled) { 972 ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder); 973 ctx->merged_wrap_if_label = 11500; 974 ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label); 975 } 976 977 /* Execute a barrier before the second shader in 978 * a merged shader. 979 * 980 * Execute the barrier inside the conditional block, 981 * so that empty waves can jump directly to s_endpgm, 982 * which will also signal the barrier. 983 * 984 * This is possible in gfx9, because an empty wave 985 * for the second shader does not participate in 986 * the epilogue. With NGG, empty waves may still 987 * be required to export data (e.g. GS output vertices), 988 * so we cannot let them exit early. 989 * 990 * If the shader is TCS and the TCS epilog is present 991 * and contains a barrier, it will wait there and then 992 * reach s_endpgm. 993 */ 994 if (ctx->stage == MESA_SHADER_TESS_CTRL) { 995 /* We need the barrier only if TCS inputs are read from LDS. */ 996 if (!shader->key.opt.same_patch_vertices || 997 shader->selector->info.base.inputs_read & 998 ~shader->selector->tcs_vgpr_only_inputs) 999 ac_build_s_barrier(&ctx->ac); 1000 } else if (ctx->stage == MESA_SHADER_GEOMETRY && !shader->key.as_ngg) { 1001 /* gfx10_ngg_gs_emit_prologue inserts the barrier for NGG. */ 1002 ac_build_s_barrier(&ctx->ac); 1003 } 1004 } 1005 1006 bool success = si_nir_build_llvm(ctx, nir); 1007 if (free_nir) 1008 ralloc_free(nir); 1009 if (!success) { 1010 fprintf(stderr, "Failed to translate shader from NIR to LLVM\n"); 1011 return false; 1012 } 1013 1014 si_llvm_build_ret(ctx, ctx->return_value); 1015 return true; 1016} 1017 1018static bool si_should_optimize_less(struct ac_llvm_compiler *compiler, 1019 struct si_shader_selector *sel) 1020{ 1021 if (!compiler->low_opt_passes) 1022 return false; 1023 1024 /* Assume a slow CPU. */ 1025 assert(!sel->screen->info.has_dedicated_vram && sel->screen->info.chip_class <= GFX8); 1026 1027 /* For a crazy dEQP test containing 2597 memory opcodes, mostly 1028 * buffer stores. */ 1029 return sel->info.stage == MESA_SHADER_COMPUTE && sel->info.num_memory_stores > 1000; 1030} 1031 1032static void si_optimize_vs_outputs(struct si_shader_context *ctx) 1033{ 1034 struct si_shader *shader = ctx->shader; 1035 struct si_shader_info *info = &shader->selector->info; 1036 unsigned skip_vs_optim_mask = 0; 1037 1038 if ((ctx->stage != MESA_SHADER_VERTEX && ctx->stage != MESA_SHADER_TESS_EVAL) || 1039 shader->key.as_ls || shader->key.as_es) 1040 return; 1041 1042 /* Optimizing these outputs is not possible, since they might be overriden 1043 * at runtime with S_028644_PT_SPRITE_TEX. */ 1044 for (int i = 0; i < info->num_outputs; i++) { 1045 if (info->output_semantic[i] == VARYING_SLOT_PNTC || 1046 (info->output_semantic[i] >= VARYING_SLOT_TEX0 && 1047 info->output_semantic[i] <= VARYING_SLOT_TEX7)) { 1048 skip_vs_optim_mask |= 1u << shader->info.vs_output_param_offset[i]; 1049 } 1050 } 1051 1052 ac_optimize_vs_outputs(&ctx->ac, ctx->main_fn, shader->info.vs_output_param_offset, 1053 info->num_outputs, skip_vs_optim_mask, 1054 &shader->info.nr_param_exports); 1055} 1056 1057bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, 1058 struct si_shader *shader, struct pipe_debug_callback *debug, 1059 struct nir_shader *nir, bool free_nir) 1060{ 1061 struct si_shader_selector *sel = shader->selector; 1062 struct si_shader_context ctx; 1063 1064 si_llvm_context_init(&ctx, sscreen, compiler, si_get_shader_wave_size(shader)); 1065 1066 LLVMValueRef ngg_cull_main_fn = NULL; 1067 if (shader->key.opt.ngg_culling) { 1068 if (!si_llvm_translate_nir(&ctx, shader, nir, false, true)) { 1069 si_llvm_dispose(&ctx); 1070 return false; 1071 } 1072 ngg_cull_main_fn = ctx.main_fn; 1073 ctx.main_fn = NULL; 1074 } 1075 1076 if (!si_llvm_translate_nir(&ctx, shader, nir, free_nir, false)) { 1077 si_llvm_dispose(&ctx); 1078 return false; 1079 } 1080 1081 if (shader->is_monolithic && ctx.stage == MESA_SHADER_VERTEX) { 1082 LLVMValueRef parts[4]; 1083 unsigned num_parts = 0; 1084 bool first_is_prolog = false; 1085 LLVMValueRef main_fn = ctx.main_fn; 1086 1087 if (ngg_cull_main_fn) { 1088 if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, true)) { 1089 union si_shader_part_key prolog_key; 1090 si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, true, 1091 &shader->key.part.vs.prolog, shader, &prolog_key); 1092 prolog_key.vs_prolog.is_monolithic = true; 1093 si_llvm_build_vs_prolog(&ctx, &prolog_key); 1094 parts[num_parts++] = ctx.main_fn; 1095 first_is_prolog = true; 1096 } 1097 parts[num_parts++] = ngg_cull_main_fn; 1098 } 1099 1100 if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, false)) { 1101 union si_shader_part_key prolog_key; 1102 si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, false, 1103 &shader->key.part.vs.prolog, shader, &prolog_key); 1104 prolog_key.vs_prolog.is_monolithic = true; 1105 si_llvm_build_vs_prolog(&ctx, &prolog_key); 1106 parts[num_parts++] = ctx.main_fn; 1107 if (num_parts == 1) 1108 first_is_prolog = true; 1109 } 1110 parts[num_parts++] = main_fn; 1111 1112 si_build_wrapper_function(&ctx, parts, num_parts, first_is_prolog ? 1 : 0, 0, false); 1113 } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_EVAL && ngg_cull_main_fn) { 1114 LLVMValueRef parts[3], prolog, main_fn = ctx.main_fn; 1115 1116 /* We reuse the VS prolog code for TES just to load the input VGPRs from LDS. */ 1117 union si_shader_part_key prolog_key; 1118 memset(&prolog_key, 0, sizeof(prolog_key)); 1119 prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs; 1120 prolog_key.vs_prolog.num_merged_next_stage_vgprs = 5; 1121 prolog_key.vs_prolog.as_ngg = 1; 1122 prolog_key.vs_prolog.load_vgprs_after_culling = 1; 1123 prolog_key.vs_prolog.is_monolithic = true; 1124 si_llvm_build_vs_prolog(&ctx, &prolog_key); 1125 prolog = ctx.main_fn; 1126 1127 parts[0] = ngg_cull_main_fn; 1128 parts[1] = prolog; 1129 parts[2] = main_fn; 1130 1131 si_build_wrapper_function(&ctx, parts, 3, 0, 0, false); 1132 } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_CTRL) { 1133 if (sscreen->info.chip_class >= GFX9) { 1134 struct si_shader_selector *ls = shader->key.part.tcs.ls; 1135 LLVMValueRef parts[4]; 1136 bool vs_needs_prolog = 1137 si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog, &shader->key, false); 1138 1139 /* TCS main part */ 1140 parts[2] = ctx.main_fn; 1141 1142 /* TCS epilog */ 1143 union si_shader_part_key tcs_epilog_key; 1144 memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key)); 1145 tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; 1146 si_llvm_build_tcs_epilog(&ctx, &tcs_epilog_key); 1147 parts[3] = ctx.main_fn; 1148 1149 /* VS as LS main part */ 1150 ctx.next_shader_sel = ctx.shader->selector; 1151 nir = si_get_nir_shader(ls, NULL, &free_nir); 1152 struct si_shader shader_ls = {}; 1153 shader_ls.selector = ls; 1154 shader_ls.key.as_ls = 1; 1155 shader_ls.key.mono = shader->key.mono; 1156 shader_ls.key.opt = shader->key.opt; 1157 shader_ls.is_monolithic = true; 1158 1159 if (!si_llvm_translate_nir(&ctx, &shader_ls, nir, free_nir, false)) { 1160 si_llvm_dispose(&ctx); 1161 return false; 1162 } 1163 shader->info.uses_instanceid |= ls->info.uses_instanceid; 1164 parts[1] = ctx.main_fn; 1165 1166 /* LS prolog */ 1167 if (vs_needs_prolog) { 1168 union si_shader_part_key vs_prolog_key; 1169 si_get_vs_prolog_key(&ls->info, shader_ls.info.num_input_sgprs, false, 1170 &shader->key.part.tcs.ls_prolog, shader, &vs_prolog_key); 1171 vs_prolog_key.vs_prolog.is_monolithic = true; 1172 si_llvm_build_vs_prolog(&ctx, &vs_prolog_key); 1173 parts[0] = ctx.main_fn; 1174 } 1175 1176 /* Reset the shader context. */ 1177 ctx.shader = shader; 1178 ctx.stage = MESA_SHADER_TESS_CTRL; 1179 1180 si_build_wrapper_function(&ctx, parts + !vs_needs_prolog, 4 - !vs_needs_prolog, 1181 vs_needs_prolog, vs_needs_prolog ? 2 : 1, 1182 shader->key.opt.same_patch_vertices); 1183 } else { 1184 LLVMValueRef parts[2]; 1185 union si_shader_part_key epilog_key; 1186 1187 parts[0] = ctx.main_fn; 1188 1189 memset(&epilog_key, 0, sizeof(epilog_key)); 1190 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; 1191 si_llvm_build_tcs_epilog(&ctx, &epilog_key); 1192 parts[1] = ctx.main_fn; 1193 1194 si_build_wrapper_function(&ctx, parts, 2, 0, 0, false); 1195 } 1196 } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_GEOMETRY) { 1197 if (ctx.screen->info.chip_class >= GFX9) { 1198 struct si_shader_selector *es = shader->key.part.gs.es; 1199 LLVMValueRef es_prolog = NULL; 1200 LLVMValueRef es_main = NULL; 1201 LLVMValueRef gs_prolog = NULL; 1202 LLVMValueRef gs_main = ctx.main_fn; 1203 1204 /* GS prolog */ 1205 union si_shader_part_key gs_prolog_key; 1206 memset(&gs_prolog_key, 0, sizeof(gs_prolog_key)); 1207 gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog; 1208 gs_prolog_key.gs_prolog.as_ngg = shader->key.as_ngg; 1209 si_llvm_build_gs_prolog(&ctx, &gs_prolog_key); 1210 gs_prolog = ctx.main_fn; 1211 1212 /* ES main part */ 1213 nir = si_get_nir_shader(es, NULL, &free_nir); 1214 struct si_shader shader_es = {}; 1215 shader_es.selector = es; 1216 shader_es.key.as_es = 1; 1217 shader_es.key.as_ngg = shader->key.as_ngg; 1218 shader_es.key.mono = shader->key.mono; 1219 shader_es.key.opt = shader->key.opt; 1220 shader_es.is_monolithic = true; 1221 1222 if (!si_llvm_translate_nir(&ctx, &shader_es, nir, free_nir, false)) { 1223 si_llvm_dispose(&ctx); 1224 return false; 1225 } 1226 shader->info.uses_instanceid |= es->info.uses_instanceid; 1227 es_main = ctx.main_fn; 1228 1229 /* ES prolog */ 1230 if (es->info.stage == MESA_SHADER_VERTEX && 1231 si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog, &shader->key, false)) { 1232 union si_shader_part_key vs_prolog_key; 1233 si_get_vs_prolog_key(&es->info, shader_es.info.num_input_sgprs, false, 1234 &shader->key.part.gs.vs_prolog, shader, &vs_prolog_key); 1235 vs_prolog_key.vs_prolog.is_monolithic = true; 1236 si_llvm_build_vs_prolog(&ctx, &vs_prolog_key); 1237 es_prolog = ctx.main_fn; 1238 } 1239 1240 /* Reset the shader context. */ 1241 ctx.shader = shader; 1242 ctx.stage = MESA_SHADER_GEOMETRY; 1243 1244 /* Prepare the array of shader parts. */ 1245 LLVMValueRef parts[4]; 1246 unsigned num_parts = 0, main_part, next_first_part; 1247 1248 if (es_prolog) 1249 parts[num_parts++] = es_prolog; 1250 1251 parts[main_part = num_parts++] = es_main; 1252 parts[next_first_part = num_parts++] = gs_prolog; 1253 parts[num_parts++] = gs_main; 1254 1255 si_build_wrapper_function(&ctx, parts, num_parts, main_part, next_first_part, false); 1256 } else { 1257 LLVMValueRef parts[2]; 1258 union si_shader_part_key prolog_key; 1259 1260 parts[1] = ctx.main_fn; 1261 1262 memset(&prolog_key, 0, sizeof(prolog_key)); 1263 prolog_key.gs_prolog.states = shader->key.part.gs.prolog; 1264 si_llvm_build_gs_prolog(&ctx, &prolog_key); 1265 parts[0] = ctx.main_fn; 1266 1267 si_build_wrapper_function(&ctx, parts, 2, 1, 0, false); 1268 } 1269 } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_FRAGMENT) { 1270 si_llvm_build_monolithic_ps(&ctx, shader); 1271 } 1272 1273 si_llvm_optimize_module(&ctx); 1274 1275 /* Post-optimization transformations and analysis. */ 1276 si_optimize_vs_outputs(&ctx); 1277 1278 if ((debug && debug->debug_message) || si_can_dump_shader(sscreen, ctx.stage)) { 1279 ctx.shader->info.private_mem_vgprs = ac_count_scratch_private_memory(ctx.main_fn); 1280 } 1281 1282 /* Make sure the input is a pointer and not integer followed by inttoptr. */ 1283 assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == LLVMPointerTypeKind); 1284 1285 /* Compile to bytecode. */ 1286 if (!si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, &ctx.ac, debug, 1287 ctx.stage, si_get_shader_name(shader), 1288 si_should_optimize_less(compiler, shader->selector))) { 1289 si_llvm_dispose(&ctx); 1290 fprintf(stderr, "LLVM failed to compile shader\n"); 1291 return false; 1292 } 1293 1294 si_llvm_dispose(&ctx); 1295 return true; 1296} 1297