1/* 2 * Copyright © 2016 Red Hat. 3 * Copyright © 2016 Bas Nieuwenhuizen 4 * 5 * based in part on anv driver which is: 6 * Copyright © 2015 Intel Corporation 7 * 8 * Permission is hereby granted, free of charge, to any person obtaining a 9 * copy of this software and associated documentation files (the "Software"), 10 * to deal in the Software without restriction, including without limitation 11 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 12 * and/or sell copies of the Software, and to permit persons to whom the 13 * Software is furnished to do so, subject to the following conditions: 14 * 15 * The above copyright notice and this permission notice (including the next 16 * paragraph) shall be included in all copies or substantial portions of the 17 * Software. 18 * 19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 25 * IN THE SOFTWARE. 26 */ 27 28#include "nir/nir.h" 29#include "radv_debug.h" 30#include "radv_llvm_helper.h" 31#include "radv_private.h" 32#include "radv_shader.h" 33#include "radv_shader_args.h" 34 35#include "ac_binary.h" 36#include "ac_exp_param.h" 37#include "ac_llvm_build.h" 38#include "ac_nir_to_llvm.h" 39#include "ac_shader_abi.h" 40#include "ac_shader_util.h" 41#include "sid.h" 42 43struct radv_shader_context { 44 struct ac_llvm_context ac; 45 const struct nir_shader *shader; 46 struct ac_shader_abi abi; 47 const struct radv_shader_args *args; 48 49 gl_shader_stage stage; 50 51 unsigned max_workgroup_size; 52 LLVMContextRef context; 53 LLVMValueRef main_function; 54 55 LLVMValueRef descriptor_sets[MAX_SETS]; 56 57 LLVMValueRef ring_offsets; 58 59 LLVMValueRef vs_rel_patch_id; 60 61 LLVMValueRef gs_wave_id; 62 LLVMValueRef gs_vtx_offset[6]; 63 64 LLVMValueRef esgs_ring; 65 LLVMValueRef gsvs_ring[4]; 66 LLVMValueRef hs_ring_tess_offchip; 67 LLVMValueRef hs_ring_tess_factor; 68 69 uint64_t output_mask; 70 71 LLVMValueRef gs_next_vertex[4]; 72 LLVMValueRef gs_curprim_verts[4]; 73 LLVMValueRef gs_generated_prims[4]; 74 LLVMValueRef gs_ngg_emit; 75 LLVMValueRef gs_ngg_scratch; 76 77 LLVMValueRef vertexptr; /* GFX10 only */ 78}; 79 80struct radv_shader_output_values { 81 LLVMValueRef values[4]; 82 unsigned slot_name; 83 unsigned slot_index; 84 unsigned usage_mask; 85}; 86 87static inline struct radv_shader_context * 88radv_shader_context_from_abi(struct ac_shader_abi *abi) 89{ 90 return container_of(abi, struct radv_shader_context, abi); 91} 92 93static LLVMValueRef 94create_llvm_function(struct ac_llvm_context *ctx, LLVMModuleRef module, LLVMBuilderRef builder, 95 const struct ac_shader_args *args, enum ac_llvm_calling_convention convention, 96 unsigned max_workgroup_size, const struct radv_nir_compiler_options *options) 97{ 98 LLVMValueRef main_function = ac_build_main(args, ctx, convention, "main", ctx->voidt, module); 99 100 if (options->address32_hi) { 101 ac_llvm_add_target_dep_function_attr(main_function, "amdgpu-32bit-address-high-bits", 102 options->address32_hi); 103 } 104 105 ac_llvm_set_workgroup_size(main_function, max_workgroup_size); 106 ac_llvm_set_target_features(main_function, ctx); 107 108 return main_function; 109} 110 111static void 112load_descriptor_sets(struct radv_shader_context *ctx) 113{ 114 struct radv_userdata_locations *user_sgprs_locs = &ctx->args->shader_info->user_sgprs_locs; 115 uint32_t mask = ctx->args->shader_info->desc_set_used_mask; 116 117 if (user_sgprs_locs->shader_data[AC_UD_INDIRECT_DESCRIPTOR_SETS].sgpr_idx != -1) { 118 LLVMValueRef desc_sets = ac_get_arg(&ctx->ac, ctx->args->descriptor_sets[0]); 119 while (mask) { 120 int i = u_bit_scan(&mask); 121 122 ctx->descriptor_sets[i] = 123 ac_build_load_to_sgpr(&ctx->ac, desc_sets, LLVMConstInt(ctx->ac.i32, i, false)); 124 LLVMSetAlignment(ctx->descriptor_sets[i], 4); 125 } 126 } else { 127 while (mask) { 128 int i = u_bit_scan(&mask); 129 130 ctx->descriptor_sets[i] = ac_get_arg(&ctx->ac, ctx->args->descriptor_sets[i]); 131 } 132 } 133} 134 135static enum ac_llvm_calling_convention 136get_llvm_calling_convention(LLVMValueRef func, gl_shader_stage stage) 137{ 138 switch (stage) { 139 case MESA_SHADER_VERTEX: 140 case MESA_SHADER_TESS_EVAL: 141 return AC_LLVM_AMDGPU_VS; 142 break; 143 case MESA_SHADER_GEOMETRY: 144 return AC_LLVM_AMDGPU_GS; 145 break; 146 case MESA_SHADER_TESS_CTRL: 147 return AC_LLVM_AMDGPU_HS; 148 break; 149 case MESA_SHADER_FRAGMENT: 150 return AC_LLVM_AMDGPU_PS; 151 break; 152 case MESA_SHADER_COMPUTE: 153 return AC_LLVM_AMDGPU_CS; 154 break; 155 default: 156 unreachable("Unhandle shader type"); 157 } 158} 159 160/* Returns whether the stage is a stage that can be directly before the GS */ 161static bool 162is_pre_gs_stage(gl_shader_stage stage) 163{ 164 return stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL; 165} 166 167static void 168create_function(struct radv_shader_context *ctx, gl_shader_stage stage, bool has_previous_stage) 169{ 170 if (ctx->ac.chip_class >= GFX10) { 171 if (is_pre_gs_stage(stage) && ctx->args->shader_info->is_ngg) { 172 /* On GFX10, VS is merged into GS for NGG. */ 173 stage = MESA_SHADER_GEOMETRY; 174 has_previous_stage = true; 175 } 176 } 177 178 ctx->main_function = 179 create_llvm_function(&ctx->ac, ctx->ac.module, ctx->ac.builder, &ctx->args->ac, 180 get_llvm_calling_convention(ctx->main_function, stage), 181 ctx->max_workgroup_size, ctx->args->options); 182 183 ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr", 184 LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_CONST), NULL, 0, 185 AC_FUNC_ATTR_READNONE); 186 ctx->ring_offsets = LLVMBuildBitCast(ctx->ac.builder, ctx->ring_offsets, 187 ac_array_in_const_addr_space(ctx->ac.v4i32), ""); 188 189 load_descriptor_sets(ctx); 190 191 if (stage == MESA_SHADER_TESS_CTRL || 192 (stage == MESA_SHADER_VERTEX && ctx->args->shader_info->vs.as_ls) || 193 /* GFX9 has the ESGS ring buffer in LDS. */ 194 (stage == MESA_SHADER_GEOMETRY && has_previous_stage)) { 195 ac_declare_lds_as_pointer(&ctx->ac); 196 } 197} 198 199static LLVMValueRef 200radv_load_resource(struct ac_shader_abi *abi, LLVMValueRef index, unsigned desc_set, 201 unsigned binding) 202{ 203 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); 204 LLVMValueRef desc_ptr = ctx->descriptor_sets[desc_set]; 205 struct radv_pipeline_layout *pipeline_layout = ctx->args->options->layout; 206 struct radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout; 207 unsigned base_offset = layout->binding[binding].offset; 208 LLVMValueRef offset, stride; 209 210 if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC || 211 layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) { 212 unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start + 213 layout->binding[binding].dynamic_offset_offset; 214 desc_ptr = ac_get_arg(&ctx->ac, ctx->args->ac.push_constants); 215 base_offset = pipeline_layout->push_constant_size + 16 * idx; 216 stride = LLVMConstInt(ctx->ac.i32, 16, false); 217 } else 218 stride = LLVMConstInt(ctx->ac.i32, layout->binding[binding].size, false); 219 220 offset = LLVMConstInt(ctx->ac.i32, base_offset, false); 221 222 if (layout->binding[binding].type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) { 223 offset = ac_build_imad(&ctx->ac, index, stride, offset); 224 } 225 226 desc_ptr = LLVMBuildPtrToInt(ctx->ac.builder, desc_ptr, ctx->ac.i32, ""); 227 228 LLVMValueRef res[] = {desc_ptr, offset, ctx->ac.i32_0}; 229 return ac_build_gather_values(&ctx->ac, res, 3); 230} 231 232static uint32_t 233radv_get_sample_pos_offset(uint32_t num_samples) 234{ 235 uint32_t sample_pos_offset = 0; 236 237 switch (num_samples) { 238 case 2: 239 sample_pos_offset = 1; 240 break; 241 case 4: 242 sample_pos_offset = 3; 243 break; 244 case 8: 245 sample_pos_offset = 7; 246 break; 247 default: 248 break; 249 } 250 return sample_pos_offset; 251} 252 253static LLVMValueRef 254load_sample_position(struct ac_shader_abi *abi, LLVMValueRef sample_id) 255{ 256 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); 257 258 LLVMValueRef result; 259 LLVMValueRef index = LLVMConstInt(ctx->ac.i32, RING_PS_SAMPLE_POSITIONS, false); 260 LLVMValueRef ptr = LLVMBuildGEP(ctx->ac.builder, ctx->ring_offsets, &index, 1, ""); 261 262 ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ac_array_in_const_addr_space(ctx->ac.v2f32), ""); 263 264 uint32_t sample_pos_offset = radv_get_sample_pos_offset(ctx->args->options->key.ps.num_samples); 265 266 sample_id = LLVMBuildAdd(ctx->ac.builder, sample_id, 267 LLVMConstInt(ctx->ac.i32, sample_pos_offset, false), ""); 268 result = ac_build_load_invariant(&ctx->ac, ptr, sample_id); 269 270 return result; 271} 272 273static LLVMValueRef 274load_sample_mask_in(struct ac_shader_abi *abi) 275{ 276 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); 277 uint8_t log2_ps_iter_samples; 278 279 if (ctx->args->shader_info->ps.uses_sample_shading) { 280 log2_ps_iter_samples = util_logbase2(ctx->args->options->key.ps.num_samples); 281 } else { 282 log2_ps_iter_samples = ctx->args->options->key.ps.log2_ps_iter_samples; 283 } 284 285 LLVMValueRef result, sample_id; 286 if (log2_ps_iter_samples) { 287 /* gl_SampleMaskIn[0] = (SampleCoverage & (1 << gl_SampleID)). */ 288 sample_id = ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.ancillary), 8, 4); 289 sample_id = LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, 1, false), sample_id, ""); 290 result = LLVMBuildAnd(ctx->ac.builder, sample_id, 291 ac_get_arg(&ctx->ac, ctx->args->ac.sample_coverage), ""); 292 } else { 293 result = ac_get_arg(&ctx->ac, ctx->args->ac.sample_coverage); 294 } 295 296 return result; 297} 298 299static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, unsigned stream, 300 LLVMValueRef vertexidx, LLVMValueRef *addrs); 301 302static void 303visit_emit_vertex_with_counter(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef vertexidx, 304 LLVMValueRef *addrs) 305{ 306 unsigned offset = 0; 307 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); 308 309 if (ctx->args->shader_info->is_ngg) { 310 gfx10_ngg_gs_emit_vertex(ctx, stream, vertexidx, addrs); 311 return; 312 } 313 314 for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { 315 unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i]; 316 uint8_t output_stream = ctx->args->shader_info->gs.output_streams[i]; 317 LLVMValueRef *out_ptr = &addrs[i * 4]; 318 int length = util_last_bit(output_usage_mask); 319 320 if (!(ctx->output_mask & (1ull << i)) || output_stream != stream) 321 continue; 322 323 for (unsigned j = 0; j < length; j++) { 324 if (!(output_usage_mask & (1 << j))) 325 continue; 326 327 LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], ""); 328 LLVMValueRef voffset = 329 LLVMConstInt(ctx->ac.i32, offset * ctx->shader->info.gs.vertices_out, false); 330 331 offset++; 332 333 voffset = LLVMBuildAdd(ctx->ac.builder, voffset, vertexidx, ""); 334 voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, false), ""); 335 336 out_val = ac_to_integer(&ctx->ac, out_val); 337 out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, ""); 338 339 ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring[stream], out_val, 1, voffset, 340 ac_get_arg(&ctx->ac, ctx->args->ac.gs2vs_offset), 0, 341 ac_glc | ac_slc | ac_swizzled); 342 } 343 } 344 345 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8), 346 ctx->gs_wave_id); 347} 348 349static void 350visit_end_primitive(struct ac_shader_abi *abi, unsigned stream) 351{ 352 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); 353 354 if (ctx->args->shader_info->is_ngg) { 355 LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]); 356 return; 357 } 358 359 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8), 360 ctx->gs_wave_id); 361} 362 363static LLVMValueRef 364load_ring_tess_factors(struct ac_shader_abi *abi) 365{ 366 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); 367 assert(ctx->stage == MESA_SHADER_TESS_CTRL); 368 369 return ctx->hs_ring_tess_factor; 370} 371 372static LLVMValueRef 373load_ring_tess_offchip(struct ac_shader_abi *abi) 374{ 375 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); 376 assert(ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_TESS_EVAL); 377 378 return ctx->hs_ring_tess_offchip; 379} 380 381static LLVMValueRef 382load_ring_esgs(struct ac_shader_abi *abi) 383{ 384 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); 385 assert(ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL || 386 ctx->stage == MESA_SHADER_GEOMETRY); 387 388 return ctx->esgs_ring; 389} 390 391static LLVMValueRef 392radv_load_base_vertex(struct ac_shader_abi *abi, bool non_indexed_is_zero) 393{ 394 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); 395 return ac_get_arg(&ctx->ac, ctx->args->ac.base_vertex); 396} 397 398static LLVMValueRef 399get_desc_ptr(struct radv_shader_context *ctx, LLVMValueRef ptr, bool non_uniform) 400{ 401 LLVMValueRef set_ptr = ac_llvm_extract_elem(&ctx->ac, ptr, 0); 402 LLVMValueRef offset = ac_llvm_extract_elem(&ctx->ac, ptr, 1); 403 ptr = LLVMBuildNUWAdd(ctx->ac.builder, set_ptr, offset, ""); 404 405 unsigned addr_space = AC_ADDR_SPACE_CONST_32BIT; 406 if (non_uniform) { 407 /* 32-bit seems to always use SMEM. addrspacecast from 32-bit -> 64-bit is broken. */ 408 LLVMValueRef dwords[] = {ptr, 409 LLVMConstInt(ctx->ac.i32, ctx->args->options->address32_hi, false)}; 410 ptr = ac_build_gather_values(&ctx->ac, dwords, 2); 411 ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->ac.i64, ""); 412 addr_space = AC_ADDR_SPACE_CONST; 413 } 414 return LLVMBuildIntToPtr(ctx->ac.builder, ptr, LLVMPointerType(ctx->ac.v4i32, addr_space), ""); 415} 416 417static LLVMValueRef 418radv_load_ssbo(struct ac_shader_abi *abi, LLVMValueRef buffer_ptr, bool write, bool non_uniform) 419{ 420 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); 421 LLVMValueRef result; 422 423 buffer_ptr = get_desc_ptr(ctx, buffer_ptr, non_uniform); 424 if (!non_uniform) 425 LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md); 426 427 result = LLVMBuildLoad(ctx->ac.builder, buffer_ptr, ""); 428 LLVMSetMetadata(result, ctx->ac.invariant_load_md_kind, ctx->ac.empty_md); 429 LLVMSetAlignment(result, 4); 430 431 return result; 432} 433 434static LLVMValueRef 435radv_load_ubo(struct ac_shader_abi *abi, unsigned desc_set, unsigned binding, bool valid_binding, 436 LLVMValueRef buffer_ptr) 437{ 438 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); 439 LLVMValueRef result; 440 441 if (valid_binding) { 442 struct radv_pipeline_layout *pipeline_layout = ctx->args->options->layout; 443 struct radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout; 444 445 if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) { 446 LLVMValueRef set_ptr = ac_llvm_extract_elem(&ctx->ac, buffer_ptr, 0); 447 LLVMValueRef offset = ac_llvm_extract_elem(&ctx->ac, buffer_ptr, 1); 448 buffer_ptr = LLVMBuildNUWAdd(ctx->ac.builder, set_ptr, offset, ""); 449 450 uint32_t desc_type = 451 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 452 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); 453 454 if (ctx->ac.chip_class >= GFX10) { 455 desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) | 456 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); 457 } else { 458 desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | 459 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); 460 } 461 462 LLVMValueRef desc_components[4] = { 463 LLVMBuildPtrToInt(ctx->ac.builder, buffer_ptr, ctx->ac.intptr, ""), 464 LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->args->options->address32_hi), 465 false), 466 LLVMConstInt(ctx->ac.i32, 0xffffffff, false), 467 LLVMConstInt(ctx->ac.i32, desc_type, false), 468 }; 469 470 return ac_build_gather_values(&ctx->ac, desc_components, 4); 471 } 472 } 473 474 buffer_ptr = get_desc_ptr(ctx, buffer_ptr, false); 475 LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md); 476 477 result = LLVMBuildLoad(ctx->ac.builder, buffer_ptr, ""); 478 LLVMSetMetadata(result, ctx->ac.invariant_load_md_kind, ctx->ac.empty_md); 479 LLVMSetAlignment(result, 4); 480 481 return result; 482} 483 484static LLVMValueRef 485radv_get_sampler_desc(struct ac_shader_abi *abi, unsigned descriptor_set, unsigned base_index, 486 unsigned constant_index, LLVMValueRef index, 487 enum ac_descriptor_type desc_type, bool image, bool write, bool bindless) 488{ 489 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); 490 LLVMValueRef list = ctx->descriptor_sets[descriptor_set]; 491 struct radv_descriptor_set_layout *layout = 492 ctx->args->options->layout->set[descriptor_set].layout; 493 struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index; 494 unsigned offset = binding->offset; 495 unsigned stride = binding->size; 496 unsigned type_size; 497 LLVMBuilderRef builder = ctx->ac.builder; 498 LLVMTypeRef type; 499 500 assert(base_index < layout->binding_count); 501 502 if (binding->type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE && desc_type == AC_DESC_FMASK) 503 return NULL; 504 505 switch (desc_type) { 506 case AC_DESC_IMAGE: 507 type = ctx->ac.v8i32; 508 type_size = 32; 509 break; 510 case AC_DESC_FMASK: 511 type = ctx->ac.v8i32; 512 offset += 32; 513 type_size = 32; 514 break; 515 case AC_DESC_SAMPLER: 516 type = ctx->ac.v4i32; 517 if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) { 518 offset += radv_combined_image_descriptor_sampler_offset(binding); 519 } 520 521 type_size = 16; 522 break; 523 case AC_DESC_BUFFER: 524 type = ctx->ac.v4i32; 525 type_size = 16; 526 break; 527 case AC_DESC_PLANE_0: 528 case AC_DESC_PLANE_1: 529 case AC_DESC_PLANE_2: 530 type = ctx->ac.v8i32; 531 type_size = 32; 532 offset += 32 * (desc_type - AC_DESC_PLANE_0); 533 break; 534 default: 535 unreachable("invalid desc_type\n"); 536 } 537 538 offset += constant_index * stride; 539 540 if (desc_type == AC_DESC_SAMPLER && binding->immutable_samplers_offset && 541 (!index || binding->immutable_samplers_equal)) { 542 if (binding->immutable_samplers_equal) 543 constant_index = 0; 544 545 const uint32_t *samplers = radv_immutable_samplers(layout, binding); 546 547 LLVMValueRef constants[] = { 548 LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 0], 0), 549 LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 1], 0), 550 LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 2], 0), 551 LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 3], 0), 552 }; 553 return ac_build_gather_values(&ctx->ac, constants, 4); 554 } 555 556 assert(stride % type_size == 0); 557 558 LLVMValueRef adjusted_index = index; 559 if (!adjusted_index) 560 adjusted_index = ctx->ac.i32_0; 561 562 adjusted_index = 563 LLVMBuildMul(builder, adjusted_index, LLVMConstInt(ctx->ac.i32, stride / type_size, 0), ""); 564 565 LLVMValueRef val_offset = LLVMConstInt(ctx->ac.i32, offset, 0); 566 list = LLVMBuildGEP(builder, list, &val_offset, 1, ""); 567 list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(type), ""); 568 569 LLVMValueRef descriptor = ac_build_load_to_sgpr(&ctx->ac, list, adjusted_index); 570 571 /* 3 plane formats always have same size and format for plane 1 & 2, so 572 * use the tail from plane 1 so that we can store only the first 16 bytes 573 * of the last plane. */ 574 if (desc_type == AC_DESC_PLANE_2) { 575 LLVMValueRef descriptor2 = 576 radv_get_sampler_desc(abi, descriptor_set, base_index, constant_index, index, 577 AC_DESC_PLANE_1, image, write, bindless); 578 579 LLVMValueRef components[8]; 580 for (unsigned i = 0; i < 4; ++i) 581 components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor, i); 582 583 for (unsigned i = 4; i < 8; ++i) 584 components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor2, i); 585 descriptor = ac_build_gather_values(&ctx->ac, components, 8); 586 } else if (desc_type == AC_DESC_IMAGE && 587 ctx->args->options->has_image_load_dcc_bug && 588 image && !write) { 589 LLVMValueRef components[8]; 590 591 for (unsigned i = 0; i < 8; i++) 592 components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor, i); 593 594 /* WRITE_COMPRESS_ENABLE must be 0 for all image loads to workaround a hardware bug. */ 595 components[6] = LLVMBuildAnd(ctx->ac.builder, components[6], 596 LLVMConstInt(ctx->ac.i32, C_00A018_WRITE_COMPRESS_ENABLE, false), ""); 597 598 descriptor = ac_build_gather_values(&ctx->ac, components, 8); 599 } 600 601 return descriptor; 602} 603 604/* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW. 605 * so we may need to fix it up. */ 606static LLVMValueRef 607adjust_vertex_fetch_alpha(struct radv_shader_context *ctx, unsigned adjustment, LLVMValueRef alpha) 608{ 609 if (adjustment == ALPHA_ADJUST_NONE) 610 return alpha; 611 612 LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0); 613 614 alpha = LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.f32, ""); 615 616 if (adjustment == ALPHA_ADJUST_SSCALED) 617 alpha = LLVMBuildFPToUI(ctx->ac.builder, alpha, ctx->ac.i32, ""); 618 else 619 alpha = ac_to_integer(&ctx->ac, alpha); 620 621 /* For the integer-like cases, do a natural sign extension. 622 * 623 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0 624 * and happen to contain 0, 1, 2, 3 as the two LSBs of the 625 * exponent. 626 */ 627 alpha = 628 LLVMBuildShl(ctx->ac.builder, alpha, 629 adjustment == ALPHA_ADJUST_SNORM ? LLVMConstInt(ctx->ac.i32, 7, 0) : c30, ""); 630 alpha = LLVMBuildAShr(ctx->ac.builder, alpha, c30, ""); 631 632 /* Convert back to the right type. */ 633 if (adjustment == ALPHA_ADJUST_SNORM) { 634 LLVMValueRef clamp; 635 LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0); 636 alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, ""); 637 clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, alpha, neg_one, ""); 638 alpha = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, alpha, ""); 639 } else if (adjustment == ALPHA_ADJUST_SSCALED) { 640 alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, ""); 641 } 642 643 return LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.i32, ""); 644} 645 646static LLVMValueRef 647radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx, LLVMValueRef value, 648 unsigned num_channels, bool is_float) 649{ 650 LLVMValueRef zero = is_float ? ctx->ac.f32_0 : ctx->ac.i32_0; 651 LLVMValueRef one = is_float ? ctx->ac.f32_1 : ctx->ac.i32_1; 652 LLVMValueRef chan[4]; 653 654 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) { 655 unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value)); 656 657 if (num_channels == 4 && num_channels == vec_size) 658 return value; 659 660 num_channels = MIN2(num_channels, vec_size); 661 662 for (unsigned i = 0; i < num_channels; i++) 663 chan[i] = ac_llvm_extract_elem(&ctx->ac, value, i); 664 } else { 665 assert(num_channels == 1); 666 chan[0] = value; 667 } 668 669 for (unsigned i = num_channels; i < 4; i++) { 670 chan[i] = i == 3 ? one : zero; 671 chan[i] = ac_to_integer(&ctx->ac, chan[i]); 672 } 673 674 return ac_build_gather_values(&ctx->ac, chan, 4); 675} 676 677static void 678load_vs_input(struct radv_shader_context *ctx, unsigned driver_location, LLVMTypeRef dest_type, 679 LLVMValueRef out[4]) 680{ 681 LLVMValueRef t_list_ptr = ac_get_arg(&ctx->ac, ctx->args->ac.vertex_buffers); 682 LLVMValueRef t_offset; 683 LLVMValueRef t_list; 684 LLVMValueRef input; 685 LLVMValueRef buffer_index; 686 unsigned attrib_index = driver_location - VERT_ATTRIB_GENERIC0; 687 unsigned attrib_format = ctx->args->options->key.vs.vertex_attribute_formats[attrib_index]; 688 unsigned data_format = attrib_format & 0x0f; 689 unsigned num_format = (attrib_format >> 4) & 0x07; 690 bool is_float = 691 num_format != V_008F0C_BUF_NUM_FORMAT_UINT && num_format != V_008F0C_BUF_NUM_FORMAT_SINT; 692 uint8_t input_usage_mask = 693 ctx->args->shader_info->vs.input_usage_mask[driver_location]; 694 unsigned num_input_channels = util_last_bit(input_usage_mask); 695 696 if (ctx->args->options->key.vs.instance_rate_inputs & (1u << attrib_index)) { 697 uint32_t divisor = ctx->args->options->key.vs.instance_rate_divisors[attrib_index]; 698 699 if (divisor) { 700 buffer_index = ctx->abi.instance_id; 701 702 if (divisor != 1) { 703 buffer_index = LLVMBuildUDiv(ctx->ac.builder, buffer_index, 704 LLVMConstInt(ctx->ac.i32, divisor, 0), ""); 705 } 706 } else { 707 buffer_index = ctx->ac.i32_0; 708 } 709 710 buffer_index = LLVMBuildAdd( 711 ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->ac.start_instance), buffer_index, ""); 712 } else { 713 buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id, 714 ac_get_arg(&ctx->ac, ctx->args->ac.base_vertex), ""); 715 } 716 717 const struct ac_data_format_info *vtx_info = ac_get_data_format_info(data_format); 718 719 /* Adjust the number of channels to load based on the vertex attribute format. */ 720 unsigned num_channels = MIN2(num_input_channels, vtx_info->num_channels); 721 unsigned attrib_binding = ctx->args->options->key.vs.vertex_attribute_bindings[attrib_index]; 722 unsigned attrib_offset = ctx->args->options->key.vs.vertex_attribute_offsets[attrib_index]; 723 unsigned attrib_stride = ctx->args->options->key.vs.vertex_attribute_strides[attrib_index]; 724 unsigned alpha_adjust = ctx->args->options->key.vs.vertex_alpha_adjust[attrib_index]; 725 726 if (ctx->args->options->key.vs.vertex_post_shuffle & (1 << attrib_index)) { 727 /* Always load, at least, 3 channels for formats that need to be shuffled because X<->Z. */ 728 num_channels = MAX2(num_channels, 3); 729 } 730 731 unsigned desc_index = 732 ctx->args->shader_info->vs.use_per_attribute_vb_descs ? attrib_index : attrib_binding; 733 desc_index = util_bitcount(ctx->args->shader_info->vs.vb_desc_usage_mask & 734 u_bit_consecutive(0, desc_index)); 735 t_offset = LLVMConstInt(ctx->ac.i32, desc_index, false); 736 t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset); 737 738 /* Always split typed vertex buffer loads on GFX6 and GFX10+ to avoid any alignment issues that 739 * triggers memory violations and eventually a GPU hang. This can happen if the stride (static or 740 * dynamic) is unaligned and also if the VBO offset is aligned to a scalar (eg. stride is 8 and 741 * VBO offset is 2 for R16G16B16A16_SNORM). 742 */ 743 if (ctx->ac.chip_class == GFX6 || ctx->ac.chip_class >= GFX10) { 744 unsigned chan_format = vtx_info->chan_format; 745 LLVMValueRef values[4]; 746 747 assert(ctx->ac.chip_class == GFX6 || ctx->ac.chip_class >= GFX10); 748 749 for (unsigned chan = 0; chan < num_channels; chan++) { 750 unsigned chan_offset = attrib_offset + chan * vtx_info->chan_byte_size; 751 LLVMValueRef chan_index = buffer_index; 752 753 if (attrib_stride != 0 && chan_offset > attrib_stride) { 754 LLVMValueRef buffer_offset = 755 LLVMConstInt(ctx->ac.i32, chan_offset / attrib_stride, false); 756 757 chan_index = LLVMBuildAdd(ctx->ac.builder, buffer_index, buffer_offset, ""); 758 759 chan_offset = chan_offset % attrib_stride; 760 } 761 762 values[chan] = ac_build_struct_tbuffer_load( 763 &ctx->ac, t_list, chan_index, LLVMConstInt(ctx->ac.i32, chan_offset, false), 764 ctx->ac.i32_0, ctx->ac.i32_0, 1, chan_format, num_format, 0, true); 765 } 766 767 input = ac_build_gather_values(&ctx->ac, values, num_channels); 768 } else { 769 if (attrib_stride != 0 && attrib_offset > attrib_stride) { 770 LLVMValueRef buffer_offset = 771 LLVMConstInt(ctx->ac.i32, attrib_offset / attrib_stride, false); 772 773 buffer_index = LLVMBuildAdd(ctx->ac.builder, buffer_index, buffer_offset, ""); 774 775 attrib_offset = attrib_offset % attrib_stride; 776 } 777 778 input = ac_build_struct_tbuffer_load( 779 &ctx->ac, t_list, buffer_index, LLVMConstInt(ctx->ac.i32, attrib_offset, false), 780 ctx->ac.i32_0, ctx->ac.i32_0, num_channels, data_format, num_format, 0, true); 781 } 782 783 if (ctx->args->options->key.vs.vertex_post_shuffle & (1 << attrib_index)) { 784 LLVMValueRef c[4]; 785 c[0] = ac_llvm_extract_elem(&ctx->ac, input, 2); 786 c[1] = ac_llvm_extract_elem(&ctx->ac, input, 1); 787 c[2] = ac_llvm_extract_elem(&ctx->ac, input, 0); 788 c[3] = ac_llvm_extract_elem(&ctx->ac, input, 3); 789 790 input = ac_build_gather_values(&ctx->ac, c, 4); 791 } 792 793 input = radv_fixup_vertex_input_fetches(ctx, input, num_channels, is_float); 794 795 for (unsigned chan = 0; chan < 4; chan++) { 796 LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false); 797 out[chan] = LLVMBuildExtractElement(ctx->ac.builder, input, llvm_chan, ""); 798 if (dest_type == ctx->ac.i16 && is_float) { 799 out[chan] = LLVMBuildBitCast(ctx->ac.builder, out[chan], ctx->ac.f32, ""); 800 out[chan] = LLVMBuildFPTrunc(ctx->ac.builder, out[chan], ctx->ac.f16, ""); 801 } 802 } 803 804 out[3] = adjust_vertex_fetch_alpha(ctx, alpha_adjust, out[3]); 805 806 for (unsigned chan = 0; chan < 4; chan++) { 807 out[chan] = ac_to_integer(&ctx->ac, out[chan]); 808 if (dest_type == ctx->ac.i16 && !is_float) 809 out[chan] = LLVMBuildTrunc(ctx->ac.builder, out[chan], ctx->ac.i16, ""); 810 } 811} 812 813static LLVMValueRef 814radv_load_vs_inputs(struct ac_shader_abi *abi, unsigned driver_location, unsigned component, 815 unsigned num_components, unsigned vertex_index, LLVMTypeRef type) 816{ 817 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); 818 LLVMValueRef values[4]; 819 820 load_vs_input(ctx, driver_location, type, values); 821 822 for (unsigned i = 0; i < 4; i++) 823 values[i] = LLVMBuildBitCast(ctx->ac.builder, values[i], type, ""); 824 825 return ac_build_varying_gather_values(&ctx->ac, values, num_components, component); 826} 827 828static void 829prepare_interp_optimize(struct radv_shader_context *ctx, struct nir_shader *nir) 830{ 831 bool uses_center = false; 832 bool uses_centroid = false; 833 nir_foreach_shader_in_variable (variable, nir) { 834 if (glsl_get_base_type(glsl_without_array(variable->type)) != GLSL_TYPE_FLOAT || 835 variable->data.sample) 836 continue; 837 838 if (variable->data.centroid) 839 uses_centroid = true; 840 else 841 uses_center = true; 842 } 843 844 ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args->ac.persp_centroid); 845 ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args->ac.linear_centroid); 846 847 if (uses_center && uses_centroid) { 848 LLVMValueRef sel = 849 LLVMBuildICmp(ctx->ac.builder, LLVMIntSLT, ac_get_arg(&ctx->ac, ctx->args->ac.prim_mask), 850 ctx->ac.i32_0, ""); 851 ctx->abi.persp_centroid = 852 LLVMBuildSelect(ctx->ac.builder, sel, ac_get_arg(&ctx->ac, ctx->args->ac.persp_center), 853 ctx->abi.persp_centroid, ""); 854 ctx->abi.linear_centroid = 855 LLVMBuildSelect(ctx->ac.builder, sel, ac_get_arg(&ctx->ac, ctx->args->ac.linear_center), 856 ctx->abi.linear_centroid, ""); 857 } 858} 859 860static void 861scan_shader_output_decl(struct radv_shader_context *ctx, struct nir_variable *variable, 862 struct nir_shader *shader, gl_shader_stage stage) 863{ 864 int idx = variable->data.driver_location; 865 unsigned attrib_count = glsl_count_attribute_slots(variable->type, false); 866 uint64_t mask_attribs; 867 868 if (variable->data.compact) { 869 unsigned component_count = variable->data.location_frac + glsl_get_length(variable->type); 870 attrib_count = (component_count + 3) / 4; 871 } 872 873 mask_attribs = ((1ull << attrib_count) - 1) << idx; 874 875 ctx->output_mask |= mask_attribs; 876} 877 878/* Initialize arguments for the shader export intrinsic */ 879static void 880si_llvm_init_export_args(struct radv_shader_context *ctx, LLVMValueRef *values, 881 unsigned enabled_channels, unsigned target, struct ac_export_args *args) 882{ 883 /* Specify the channels that are enabled. */ 884 args->enabled_channels = enabled_channels; 885 886 /* Specify whether the EXEC mask represents the valid mask */ 887 args->valid_mask = 0; 888 889 /* Specify whether this is the last export */ 890 args->done = 0; 891 892 /* Specify the target we are exporting */ 893 args->target = target; 894 895 args->compr = false; 896 args->out[0] = LLVMGetUndef(ctx->ac.f32); 897 args->out[1] = LLVMGetUndef(ctx->ac.f32); 898 args->out[2] = LLVMGetUndef(ctx->ac.f32); 899 args->out[3] = LLVMGetUndef(ctx->ac.f32); 900 901 if (!values) 902 return; 903 904 bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2; 905 if (ctx->stage == MESA_SHADER_FRAGMENT) { 906 unsigned index = target - V_008DFC_SQ_EXP_MRT; 907 unsigned col_format = (ctx->args->options->key.ps.col_format >> (4 * index)) & 0xf; 908 bool is_int8 = (ctx->args->options->key.ps.is_int8 >> index) & 1; 909 bool is_int10 = (ctx->args->options->key.ps.is_int10 >> index) & 1; 910 911 LLVMValueRef (*packf)(struct ac_llvm_context * ctx, LLVMValueRef args[2]) = NULL; 912 LLVMValueRef (*packi)(struct ac_llvm_context * ctx, LLVMValueRef args[2], unsigned bits, 913 bool hi) = NULL; 914 915 switch (col_format) { 916 case V_028714_SPI_SHADER_ZERO: 917 args->enabled_channels = 0; /* writemask */ 918 args->target = V_008DFC_SQ_EXP_NULL; 919 break; 920 921 case V_028714_SPI_SHADER_32_R: 922 args->enabled_channels = 1; 923 args->out[0] = values[0]; 924 break; 925 926 case V_028714_SPI_SHADER_32_GR: 927 args->enabled_channels = 0x3; 928 args->out[0] = values[0]; 929 args->out[1] = values[1]; 930 break; 931 932 case V_028714_SPI_SHADER_32_AR: 933 if (ctx->ac.chip_class >= GFX10) { 934 args->enabled_channels = 0x3; 935 args->out[0] = values[0]; 936 args->out[1] = values[3]; 937 } else { 938 args->enabled_channels = 0x9; 939 args->out[0] = values[0]; 940 args->out[3] = values[3]; 941 } 942 break; 943 944 case V_028714_SPI_SHADER_FP16_ABGR: 945 args->enabled_channels = 0xf; 946 packf = ac_build_cvt_pkrtz_f16; 947 if (is_16bit) { 948 for (unsigned chan = 0; chan < 4; chan++) 949 values[chan] = LLVMBuildFPExt(ctx->ac.builder, values[chan], ctx->ac.f32, ""); 950 } 951 break; 952 953 case V_028714_SPI_SHADER_UNORM16_ABGR: 954 args->enabled_channels = 0xf; 955 packf = ac_build_cvt_pknorm_u16; 956 break; 957 958 case V_028714_SPI_SHADER_SNORM16_ABGR: 959 args->enabled_channels = 0xf; 960 packf = ac_build_cvt_pknorm_i16; 961 break; 962 963 case V_028714_SPI_SHADER_UINT16_ABGR: 964 args->enabled_channels = 0xf; 965 packi = ac_build_cvt_pk_u16; 966 if (is_16bit) { 967 for (unsigned chan = 0; chan < 4; chan++) 968 values[chan] = LLVMBuildZExt(ctx->ac.builder, ac_to_integer(&ctx->ac, values[chan]), 969 ctx->ac.i32, ""); 970 } 971 break; 972 973 case V_028714_SPI_SHADER_SINT16_ABGR: 974 args->enabled_channels = 0xf; 975 packi = ac_build_cvt_pk_i16; 976 if (is_16bit) { 977 for (unsigned chan = 0; chan < 4; chan++) 978 values[chan] = LLVMBuildSExt(ctx->ac.builder, ac_to_integer(&ctx->ac, values[chan]), 979 ctx->ac.i32, ""); 980 } 981 break; 982 983 default: 984 case V_028714_SPI_SHADER_32_ABGR: 985 memcpy(&args->out[0], values, sizeof(values[0]) * 4); 986 break; 987 } 988 989 /* Replace NaN by zero (only 32-bit) to fix game bugs if 990 * requested. 991 */ 992 if (ctx->args->options->enable_mrt_output_nan_fixup && !is_16bit && 993 (col_format == V_028714_SPI_SHADER_32_R || col_format == V_028714_SPI_SHADER_32_GR || 994 col_format == V_028714_SPI_SHADER_32_AR || col_format == V_028714_SPI_SHADER_32_ABGR || 995 col_format == V_028714_SPI_SHADER_FP16_ABGR)) { 996 for (unsigned i = 0; i < 4; i++) { 997 LLVMValueRef class_args[2] = {values[i], 998 LLVMConstInt(ctx->ac.i32, S_NAN | Q_NAN, false)}; 999 LLVMValueRef isnan = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.class.f32", ctx->ac.i1, 1000 class_args, 2, AC_FUNC_ATTR_READNONE); 1001 values[i] = LLVMBuildSelect(ctx->ac.builder, isnan, ctx->ac.f32_0, values[i], ""); 1002 } 1003 } 1004 1005 /* Pack f16 or norm_i16/u16. */ 1006 if (packf) { 1007 for (unsigned chan = 0; chan < 2; chan++) { 1008 LLVMValueRef pack_args[2] = {values[2 * chan], values[2 * chan + 1]}; 1009 LLVMValueRef packed; 1010 1011 packed = packf(&ctx->ac, pack_args); 1012 args->out[chan] = ac_to_float(&ctx->ac, packed); 1013 } 1014 args->compr = 1; /* COMPR flag */ 1015 } 1016 1017 /* Pack i16/u16. */ 1018 if (packi) { 1019 for (unsigned chan = 0; chan < 2; chan++) { 1020 LLVMValueRef pack_args[2] = {ac_to_integer(&ctx->ac, values[2 * chan]), 1021 ac_to_integer(&ctx->ac, values[2 * chan + 1])}; 1022 LLVMValueRef packed; 1023 1024 packed = packi(&ctx->ac, pack_args, is_int8 ? 8 : is_int10 ? 10 : 16, chan == 1); 1025 args->out[chan] = ac_to_float(&ctx->ac, packed); 1026 } 1027 args->compr = 1; /* COMPR flag */ 1028 } 1029 return; 1030 } 1031 1032 if (is_16bit) { 1033 for (unsigned chan = 0; chan < 4; chan++) { 1034 values[chan] = LLVMBuildBitCast(ctx->ac.builder, values[chan], ctx->ac.i16, ""); 1035 args->out[chan] = LLVMBuildZExt(ctx->ac.builder, values[chan], ctx->ac.i32, ""); 1036 } 1037 } else 1038 memcpy(&args->out[0], values, sizeof(values[0]) * 4); 1039 1040 for (unsigned i = 0; i < 4; ++i) 1041 args->out[i] = ac_to_float(&ctx->ac, args->out[i]); 1042} 1043 1044static void 1045radv_export_param(struct radv_shader_context *ctx, unsigned index, LLVMValueRef *values, 1046 unsigned enabled_channels) 1047{ 1048 struct ac_export_args args; 1049 1050 si_llvm_init_export_args(ctx, values, enabled_channels, V_008DFC_SQ_EXP_PARAM + index, &args); 1051 ac_build_export(&ctx->ac, &args); 1052} 1053 1054static LLVMValueRef 1055radv_load_output(struct radv_shader_context *ctx, unsigned index, unsigned chan) 1056{ 1057 LLVMValueRef output = ctx->abi.outputs[ac_llvm_reg_index_soa(index, chan)]; 1058 return LLVMBuildLoad(ctx->ac.builder, output, ""); 1059} 1060 1061static void 1062radv_emit_stream_output(struct radv_shader_context *ctx, LLVMValueRef const *so_buffers, 1063 LLVMValueRef const *so_write_offsets, 1064 const struct radv_stream_output *output, 1065 struct radv_shader_output_values *shader_out) 1066{ 1067 unsigned num_comps = util_bitcount(output->component_mask); 1068 unsigned buf = output->buffer; 1069 unsigned offset = output->offset; 1070 unsigned start; 1071 LLVMValueRef out[4]; 1072 1073 assert(num_comps && num_comps <= 4); 1074 if (!num_comps || num_comps > 4) 1075 return; 1076 1077 /* Get the first component. */ 1078 start = ffs(output->component_mask) - 1; 1079 1080 /* Load the output as int. */ 1081 for (int i = 0; i < num_comps; i++) { 1082 out[i] = ac_to_integer(&ctx->ac, shader_out->values[start + i]); 1083 } 1084 1085 /* Pack the output. */ 1086 LLVMValueRef vdata = NULL; 1087 1088 switch (num_comps) { 1089 case 1: /* as i32 */ 1090 vdata = out[0]; 1091 break; 1092 case 2: /* as v2i32 */ 1093 case 3: /* as v4i32 (aligned to 4) */ 1094 out[3] = LLVMGetUndef(ctx->ac.i32); 1095 FALLTHROUGH; 1096 case 4: /* as v4i32 */ 1097 vdata = ac_build_gather_values(&ctx->ac, out, 1098 !ac_has_vec3_support(ctx->ac.chip_class, false) 1099 ? util_next_power_of_two(num_comps) 1100 : num_comps); 1101 break; 1102 } 1103 1104 ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf], vdata, num_comps, so_write_offsets[buf], 1105 ctx->ac.i32_0, offset, ac_glc | ac_slc); 1106} 1107 1108static void 1109radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream) 1110{ 1111 int i; 1112 1113 /* Get bits [22:16], i.e. (so_param >> 16) & 127; */ 1114 assert(ctx->args->ac.streamout_config.used); 1115 LLVMValueRef so_vtx_count = ac_build_bfe( 1116 &ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.streamout_config), 1117 LLVMConstInt(ctx->ac.i32, 16, false), LLVMConstInt(ctx->ac.i32, 7, false), false); 1118 1119 LLVMValueRef tid = ac_get_thread_id(&ctx->ac); 1120 1121 /* can_emit = tid < so_vtx_count; */ 1122 LLVMValueRef can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, tid, so_vtx_count, ""); 1123 1124 /* Emit the streamout code conditionally. This actually avoids 1125 * out-of-bounds buffer access. The hw tells us via the SGPR 1126 * (so_vtx_count) which threads are allowed to emit streamout data. 1127 */ 1128 ac_build_ifcc(&ctx->ac, can_emit, 6501); 1129 { 1130 /* The buffer offset is computed as follows: 1131 * ByteOffset = streamout_offset[buffer_id]*4 + 1132 * (streamout_write_index + thread_id)*stride[buffer_id] + 1133 * attrib_offset 1134 */ 1135 LLVMValueRef so_write_index = ac_get_arg(&ctx->ac, ctx->args->ac.streamout_write_index); 1136 1137 /* Compute (streamout_write_index + thread_id). */ 1138 so_write_index = LLVMBuildAdd(ctx->ac.builder, so_write_index, tid, ""); 1139 1140 /* Load the descriptor and compute the write offset for each 1141 * enabled buffer. 1142 */ 1143 LLVMValueRef so_write_offset[4] = {0}; 1144 LLVMValueRef so_buffers[4] = {0}; 1145 LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->args->streamout_buffers); 1146 1147 for (i = 0; i < 4; i++) { 1148 uint16_t stride = ctx->args->shader_info->so.strides[i]; 1149 1150 if (!stride) 1151 continue; 1152 1153 LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, i, false); 1154 1155 so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); 1156 1157 LLVMValueRef so_offset = ac_get_arg(&ctx->ac, ctx->args->ac.streamout_offset[i]); 1158 1159 so_offset = 1160 LLVMBuildMul(ctx->ac.builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, false), ""); 1161 1162 so_write_offset[i] = ac_build_imad( 1163 &ctx->ac, so_write_index, LLVMConstInt(ctx->ac.i32, stride * 4, false), so_offset); 1164 } 1165 1166 /* Write streamout data. */ 1167 for (i = 0; i < ctx->args->shader_info->so.num_outputs; i++) { 1168 struct radv_shader_output_values shader_out = {0}; 1169 struct radv_stream_output *output = &ctx->args->shader_info->so.outputs[i]; 1170 1171 if (stream != output->stream) 1172 continue; 1173 1174 for (int j = 0; j < 4; j++) { 1175 shader_out.values[j] = radv_load_output(ctx, output->location, j); 1176 } 1177 1178 radv_emit_stream_output(ctx, so_buffers, so_write_offset, output, &shader_out); 1179 } 1180 } 1181 ac_build_endif(&ctx->ac, 6501); 1182} 1183 1184static void 1185radv_build_param_exports(struct radv_shader_context *ctx, struct radv_shader_output_values *outputs, 1186 unsigned noutput, struct radv_vs_output_info *outinfo, 1187 bool export_clip_dists) 1188{ 1189 for (unsigned i = 0; i < noutput; i++) { 1190 unsigned slot_name = outputs[i].slot_name; 1191 unsigned usage_mask = outputs[i].usage_mask; 1192 1193 if (slot_name != VARYING_SLOT_LAYER && slot_name != VARYING_SLOT_PRIMITIVE_ID && 1194 slot_name != VARYING_SLOT_VIEWPORT && slot_name != VARYING_SLOT_CLIP_DIST0 && 1195 slot_name != VARYING_SLOT_CLIP_DIST1 && slot_name < VARYING_SLOT_VAR0) 1196 continue; 1197 1198 if ((slot_name == VARYING_SLOT_CLIP_DIST0 || slot_name == VARYING_SLOT_CLIP_DIST1) && 1199 !export_clip_dists) 1200 continue; 1201 1202 radv_export_param(ctx, outinfo->vs_output_param_offset[slot_name], outputs[i].values, 1203 usage_mask); 1204 } 1205} 1206 1207/* Generate export instructions for hardware VS shader stage or NGG GS stage 1208 * (position and parameter data only). 1209 */ 1210static void 1211radv_llvm_export_vs(struct radv_shader_context *ctx, struct radv_shader_output_values *outputs, 1212 unsigned noutput, struct radv_vs_output_info *outinfo, bool export_clip_dists) 1213{ 1214 LLVMValueRef psize_value = NULL, layer_value = NULL, viewport_value = NULL; 1215 LLVMValueRef primitive_shading_rate = NULL; 1216 struct ac_export_args pos_args[4] = {0}; 1217 unsigned pos_idx, index; 1218 int i; 1219 1220 /* Build position exports */ 1221 for (i = 0; i < noutput; i++) { 1222 switch (outputs[i].slot_name) { 1223 case VARYING_SLOT_POS: 1224 si_llvm_init_export_args(ctx, outputs[i].values, 0xf, V_008DFC_SQ_EXP_POS, &pos_args[0]); 1225 break; 1226 case VARYING_SLOT_PSIZ: 1227 psize_value = outputs[i].values[0]; 1228 break; 1229 case VARYING_SLOT_LAYER: 1230 layer_value = outputs[i].values[0]; 1231 break; 1232 case VARYING_SLOT_VIEWPORT: 1233 viewport_value = outputs[i].values[0]; 1234 break; 1235 case VARYING_SLOT_PRIMITIVE_SHADING_RATE: 1236 primitive_shading_rate = outputs[i].values[0]; 1237 break; 1238 case VARYING_SLOT_CLIP_DIST0: 1239 case VARYING_SLOT_CLIP_DIST1: 1240 index = 2 + outputs[i].slot_index; 1241 si_llvm_init_export_args(ctx, outputs[i].values, 0xf, V_008DFC_SQ_EXP_POS + index, 1242 &pos_args[index]); 1243 break; 1244 default: 1245 break; 1246 } 1247 } 1248 1249 /* We need to add the position output manually if it's missing. */ 1250 if (!pos_args[0].out[0]) { 1251 pos_args[0].enabled_channels = 0xf; /* writemask */ 1252 pos_args[0].valid_mask = 0; /* EXEC mask */ 1253 pos_args[0].done = 0; /* last export? */ 1254 pos_args[0].target = V_008DFC_SQ_EXP_POS; 1255 pos_args[0].compr = 0; /* COMPR flag */ 1256 pos_args[0].out[0] = ctx->ac.f32_0; /* X */ 1257 pos_args[0].out[1] = ctx->ac.f32_0; /* Y */ 1258 pos_args[0].out[2] = ctx->ac.f32_0; /* Z */ 1259 pos_args[0].out[3] = ctx->ac.f32_1; /* W */ 1260 } 1261 1262 bool writes_primitive_shading_rate = outinfo->writes_primitive_shading_rate || 1263 ctx->args->options->force_vrs_rates; 1264 1265 if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_layer || 1266 outinfo->writes_viewport_index || writes_primitive_shading_rate) { 1267 pos_args[1].enabled_channels = ((outinfo->writes_pointsize == true ? 1 : 0) | 1268 (writes_primitive_shading_rate == true ? 2 : 0) | 1269 (outinfo->writes_layer == true ? 4 : 0)); 1270 pos_args[1].valid_mask = 0; 1271 pos_args[1].done = 0; 1272 pos_args[1].target = V_008DFC_SQ_EXP_POS + 1; 1273 pos_args[1].compr = 0; 1274 pos_args[1].out[0] = ctx->ac.f32_0; /* X */ 1275 pos_args[1].out[1] = ctx->ac.f32_0; /* Y */ 1276 pos_args[1].out[2] = ctx->ac.f32_0; /* Z */ 1277 pos_args[1].out[3] = ctx->ac.f32_0; /* W */ 1278 1279 if (outinfo->writes_pointsize == true) 1280 pos_args[1].out[0] = psize_value; 1281 if (outinfo->writes_layer == true) 1282 pos_args[1].out[2] = layer_value; 1283 if (outinfo->writes_viewport_index == true) { 1284 if (ctx->args->options->chip_class >= GFX9) { 1285 /* GFX9 has the layer in out.z[10:0] and the viewport 1286 * index in out.z[19:16]. 1287 */ 1288 LLVMValueRef v = viewport_value; 1289 v = ac_to_integer(&ctx->ac, v); 1290 v = LLVMBuildShl(ctx->ac.builder, v, LLVMConstInt(ctx->ac.i32, 16, false), ""); 1291 v = LLVMBuildOr(ctx->ac.builder, v, ac_to_integer(&ctx->ac, pos_args[1].out[2]), ""); 1292 1293 pos_args[1].out[2] = ac_to_float(&ctx->ac, v); 1294 pos_args[1].enabled_channels |= 1 << 2; 1295 } else { 1296 pos_args[1].out[3] = viewport_value; 1297 pos_args[1].enabled_channels |= 1 << 3; 1298 } 1299 } 1300 1301 if (outinfo->writes_primitive_shading_rate) { 1302 pos_args[1].out[1] = primitive_shading_rate; 1303 } else if (ctx->args->options->force_vrs_rates) { 1304 /* Bits [2:3] = VRS rate X 1305 * Bits [4:5] = VRS rate Y 1306 * 1307 * The range is [-2, 1]. Values: 1308 * 1: 2x coarser shading rate in that direction. 1309 * 0: normal shading rate 1310 * -1: 2x finer shading rate (sample shading, not directional) 1311 * -2: 4x finer shading rate (sample shading, not directional) 1312 * 1313 * Sample shading can't go above 8 samples, so both numbers can't be -2 at the same time. 1314 */ 1315 LLVMValueRef rates = LLVMConstInt(ctx->ac.i32, ctx->args->options->force_vrs_rates, false); 1316 LLVMValueRef cond; 1317 LLVMValueRef v; 1318 1319 /* If Pos.W != 1 (typical for non-GUI elements), use 2x2 coarse shading. */ 1320 cond = LLVMBuildFCmp(ctx->ac.builder, LLVMRealUNE, pos_args[0].out[3], ctx->ac.f32_1, ""); 1321 v = LLVMBuildSelect(ctx->ac.builder, cond, rates, ctx->ac.i32_0, ""); 1322 1323 pos_args[1].out[1] = ac_to_float(&ctx->ac, v); 1324 } 1325 } 1326 1327 /* GFX10 skip POS0 exports if EXEC=0 and DONE=0, causing a hang. 1328 * Setting valid_mask=1 prevents it and has no other effect. 1329 */ 1330 if (ctx->ac.chip_class == GFX10) 1331 pos_args[0].valid_mask = 1; 1332 1333 pos_idx = 0; 1334 for (i = 0; i < 4; i++) { 1335 if (!pos_args[i].out[0]) 1336 continue; 1337 1338 /* Specify the target we are exporting */ 1339 pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++; 1340 1341 if (pos_idx == outinfo->pos_exports) 1342 /* Specify that this is the last export */ 1343 pos_args[i].done = 1; 1344 1345 ac_build_export(&ctx->ac, &pos_args[i]); 1346 } 1347 1348 /* Build parameter exports */ 1349 radv_build_param_exports(ctx, outputs, noutput, outinfo, export_clip_dists); 1350} 1351 1352static void 1353handle_vs_outputs_post(struct radv_shader_context *ctx, bool export_prim_id, bool export_clip_dists, 1354 struct radv_vs_output_info *outinfo) 1355{ 1356 struct radv_shader_output_values *outputs; 1357 unsigned noutput = 0; 1358 1359 if (ctx->args->options->key.has_multiview_view_index) { 1360 LLVMValueRef *tmp_out = &ctx->abi.outputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)]; 1361 if (!*tmp_out) { 1362 for (unsigned i = 0; i < 4; ++i) 1363 ctx->abi.outputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, i)] = 1364 ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, ""); 1365 } 1366 1367 LLVMValueRef view_index = ac_get_arg(&ctx->ac, ctx->args->ac.view_index); 1368 LLVMBuildStore(ctx->ac.builder, ac_to_float(&ctx->ac, view_index), *tmp_out); 1369 ctx->output_mask |= 1ull << VARYING_SLOT_LAYER; 1370 } 1371 1372 if (ctx->args->shader_info->so.num_outputs && !ctx->args->is_gs_copy_shader) { 1373 /* The GS copy shader emission already emits streamout. */ 1374 radv_emit_streamout(ctx, 0); 1375 } 1376 1377 /* Allocate a temporary array for the output values. */ 1378 unsigned num_outputs = util_bitcount64(ctx->output_mask) + export_prim_id; 1379 outputs = malloc(num_outputs * sizeof(outputs[0])); 1380 1381 for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { 1382 if (!(ctx->output_mask & (1ull << i))) 1383 continue; 1384 1385 outputs[noutput].slot_name = i; 1386 outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1; 1387 1388 if (ctx->stage == MESA_SHADER_VERTEX && !ctx->args->is_gs_copy_shader) { 1389 outputs[noutput].usage_mask = ctx->args->shader_info->vs.output_usage_mask[i]; 1390 } else if (ctx->stage == MESA_SHADER_TESS_EVAL) { 1391 outputs[noutput].usage_mask = ctx->args->shader_info->tes.output_usage_mask[i]; 1392 } else { 1393 assert(ctx->args->is_gs_copy_shader); 1394 outputs[noutput].usage_mask = ctx->args->shader_info->gs.output_usage_mask[i]; 1395 } 1396 1397 for (unsigned j = 0; j < 4; j++) { 1398 outputs[noutput].values[j] = ac_to_float(&ctx->ac, radv_load_output(ctx, i, j)); 1399 } 1400 1401 noutput++; 1402 } 1403 1404 /* Export PrimitiveID. */ 1405 if (export_prim_id) { 1406 outputs[noutput].slot_name = VARYING_SLOT_PRIMITIVE_ID; 1407 outputs[noutput].slot_index = 0; 1408 outputs[noutput].usage_mask = 0x1; 1409 if (ctx->stage == MESA_SHADER_TESS_EVAL) 1410 outputs[noutput].values[0] = ac_get_arg(&ctx->ac, ctx->args->ac.tes_patch_id); 1411 else 1412 outputs[noutput].values[0] = ac_get_arg(&ctx->ac, ctx->args->ac.vs_prim_id); 1413 for (unsigned j = 1; j < 4; j++) 1414 outputs[noutput].values[j] = ctx->ac.f32_0; 1415 noutput++; 1416 } 1417 1418 radv_llvm_export_vs(ctx, outputs, noutput, outinfo, export_clip_dists); 1419 1420 free(outputs); 1421} 1422 1423static LLVMValueRef 1424get_wave_id_in_tg(struct radv_shader_context *ctx) 1425{ 1426 return ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 24, 4); 1427} 1428 1429static LLVMValueRef 1430get_tgsize(struct radv_shader_context *ctx) 1431{ 1432 return ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 28, 4); 1433} 1434 1435static LLVMValueRef 1436get_thread_id_in_tg(struct radv_shader_context *ctx) 1437{ 1438 LLVMBuilderRef builder = ctx->ac.builder; 1439 LLVMValueRef tmp; 1440 tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx), 1441 LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), ""); 1442 return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), ""); 1443} 1444 1445static LLVMValueRef 1446ngg_get_vtx_cnt(struct radv_shader_context *ctx) 1447{ 1448 return ac_build_bfe(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_tg_info), 1449 LLVMConstInt(ctx->ac.i32, 12, false), LLVMConstInt(ctx->ac.i32, 9, false), 1450 false); 1451} 1452 1453static LLVMValueRef 1454ngg_get_prim_cnt(struct radv_shader_context *ctx) 1455{ 1456 return ac_build_bfe(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_tg_info), 1457 LLVMConstInt(ctx->ac.i32, 22, false), LLVMConstInt(ctx->ac.i32, 9, false), 1458 false); 1459} 1460 1461static LLVMValueRef 1462ngg_gs_get_vertex_storage(struct radv_shader_context *ctx) 1463{ 1464 unsigned num_outputs = util_bitcount64(ctx->output_mask); 1465 1466 if (ctx->args->options->key.has_multiview_view_index) 1467 num_outputs++; 1468 1469 LLVMTypeRef elements[2] = { 1470 LLVMArrayType(ctx->ac.i32, 4 * num_outputs), 1471 LLVMArrayType(ctx->ac.i8, 4), 1472 }; 1473 LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false); 1474 type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS); 1475 return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, ""); 1476} 1477 1478/** 1479 * Return a pointer to the LDS storage reserved for the N'th vertex, where N 1480 * is in emit order; that is: 1481 * - during the epilogue, N is the threadidx (relative to the entire threadgroup) 1482 * - during vertex emit, i.e. while the API GS shader invocation is running, 1483 * N = threadidx * gs_max_out_vertices + emitidx 1484 * 1485 * Goals of the LDS memory layout: 1486 * 1. Eliminate bank conflicts on write for geometry shaders that have all emits 1487 * in uniform control flow 1488 * 2. Eliminate bank conflicts on read for export if, additionally, there is no 1489 * culling 1490 * 3. Agnostic to the number of waves (since we don't know it before compiling) 1491 * 4. Allow coalescing of LDS instructions (ds_write_b128 etc.) 1492 * 5. Avoid wasting memory. 1493 * 1494 * We use an AoS layout due to point 4 (this also helps point 3). In an AoS 1495 * layout, elimination of bank conflicts requires that each vertex occupy an 1496 * odd number of dwords. We use the additional dword to store the output stream 1497 * index as well as a flag to indicate whether this vertex ends a primitive 1498 * for rasterization. 1499 * 1500 * Swizzling is required to satisfy points 1 and 2 simultaneously. 1501 * 1502 * Vertices are stored in export order (gsthread * gs_max_out_vertices + emitidx). 1503 * Indices are swizzled in groups of 32, which ensures point 1 without 1504 * disturbing point 2. 1505 * 1506 * \return an LDS pointer to type {[N x i32], [4 x i8]} 1507 */ 1508static LLVMValueRef 1509ngg_gs_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexidx) 1510{ 1511 LLVMBuilderRef builder = ctx->ac.builder; 1512 LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx); 1513 1514 /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */ 1515 unsigned write_stride_2exp = ffs(MAX2(ctx->shader->info.gs.vertices_out, 1)) - 1; 1516 if (write_stride_2exp) { 1517 LLVMValueRef row = LLVMBuildLShr(builder, vertexidx, LLVMConstInt(ctx->ac.i32, 5, false), ""); 1518 LLVMValueRef swizzle = LLVMBuildAnd( 1519 builder, row, LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1, false), ""); 1520 vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, ""); 1521 } 1522 1523 return ac_build_gep0(&ctx->ac, storage, vertexidx); 1524} 1525 1526static LLVMValueRef 1527ngg_gs_emit_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef gsthread, LLVMValueRef emitidx) 1528{ 1529 LLVMBuilderRef builder = ctx->ac.builder; 1530 LLVMValueRef tmp; 1531 1532 tmp = LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false); 1533 tmp = LLVMBuildMul(builder, tmp, gsthread, ""); 1534 const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, ""); 1535 return ngg_gs_vertex_ptr(ctx, vertexidx); 1536} 1537 1538static LLVMValueRef 1539ngg_gs_get_emit_output_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexptr, 1540 unsigned out_idx) 1541{ 1542 LLVMValueRef gep_idx[3] = { 1543 ctx->ac.i32_0, /* implied C-style array */ 1544 ctx->ac.i32_0, /* first struct entry */ 1545 LLVMConstInt(ctx->ac.i32, out_idx, false), 1546 }; 1547 return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, ""); 1548} 1549 1550static LLVMValueRef 1551ngg_gs_get_emit_primflag_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexptr, 1552 unsigned stream) 1553{ 1554 LLVMValueRef gep_idx[3] = { 1555 ctx->ac.i32_0, /* implied C-style array */ 1556 ctx->ac.i32_1, /* second struct entry */ 1557 LLVMConstInt(ctx->ac.i32, stream, false), 1558 }; 1559 return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, ""); 1560} 1561 1562static void 1563handle_ngg_outputs_post_2(struct radv_shader_context *ctx) 1564{ 1565 LLVMBuilderRef builder = ctx->ac.builder; 1566 LLVMValueRef tmp; 1567 1568 assert((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) && 1569 !ctx->args->is_gs_copy_shader); 1570 1571 LLVMValueRef prims_in_wave = 1572 ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 8, 8); 1573 LLVMValueRef vtx_in_wave = 1574 ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 0, 8); 1575 LLVMValueRef is_gs_thread = 1576 LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), prims_in_wave, ""); 1577 LLVMValueRef is_es_thread = 1578 LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), vtx_in_wave, ""); 1579 LLVMValueRef vtxindex[] = { 1580 ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[0]), 0, 16), 1581 ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[0]), 16, 16), 1582 ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[1]), 0, 16), 1583 }; 1584 1585 /* Determine the number of vertices per primitive. */ 1586 unsigned num_vertices; 1587 1588 if (ctx->stage == MESA_SHADER_VERTEX) { 1589 num_vertices = 3; /* TODO: optimize for points & lines */ 1590 } else { 1591 assert(ctx->stage == MESA_SHADER_TESS_EVAL); 1592 1593 if (ctx->shader->info.tess.point_mode) 1594 num_vertices = 1; 1595 else if (ctx->shader->info.tess.primitive_mode == GL_ISOLINES) 1596 num_vertices = 2; 1597 else 1598 num_vertices = 3; 1599 } 1600 1601 /* Copy Primitive IDs from GS threads to the LDS address corresponding 1602 * to the ES thread of the provoking vertex. 1603 */ 1604 if (ctx->stage == MESA_SHADER_VERTEX && ctx->args->shader_info->vs.outinfo.export_prim_id) { 1605 ac_build_ifcc(&ctx->ac, is_gs_thread, 5400); 1606 1607 LLVMValueRef provoking_vtx_in_prim = LLVMConstInt(ctx->ac.i32, 0, false); 1608 1609 /* For provoking vertex last mode, use num_vtx_in_prim - 1. */ 1610 if (ctx->args->options->key.vs.provoking_vtx_last) { 1611 uint8_t outprim = si_conv_prim_to_gs_out(ctx->args->options->key.vs.topology); 1612 provoking_vtx_in_prim = LLVMConstInt(ctx->ac.i32, outprim, false); 1613 } 1614 1615 /* provoking_vtx_index = vtxindex[provoking_vtx_in_prim]; */ 1616 LLVMValueRef indices = ac_build_gather_values(&ctx->ac, vtxindex, 3); 1617 LLVMValueRef provoking_vtx_index = 1618 LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, ""); 1619 1620 LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args->ac.gs_prim_id), 1621 ac_build_gep0(&ctx->ac, ctx->esgs_ring, provoking_vtx_index)); 1622 ac_build_endif(&ctx->ac, 5400); 1623 } 1624 1625 /* TODO: primitive culling */ 1626 1627 ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), ngg_get_vtx_cnt(ctx), 1628 ngg_get_prim_cnt(ctx)); 1629 1630 /* TODO: streamout queries */ 1631 /* Export primitive data to the index buffer. 1632 * 1633 * For the first version, we will always build up all three indices 1634 * independent of the primitive type. The additional garbage data 1635 * shouldn't hurt. 1636 * 1637 * TODO: culling depends on the primitive type, so can have some 1638 * interaction here. 1639 */ 1640 ac_build_ifcc(&ctx->ac, is_gs_thread, 6001); 1641 { 1642 struct ac_ngg_prim prim = {0}; 1643 1644 if (ctx->args->shader_info->is_ngg_passthrough) { 1645 prim.passthrough = ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[0]); 1646 } else { 1647 prim.num_vertices = num_vertices; 1648 prim.isnull = ctx->ac.i1false; 1649 prim.edgeflags = ctx->ac.i32_0; 1650 memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3); 1651 } 1652 1653 ac_build_export_prim(&ctx->ac, &prim); 1654 } 1655 ac_build_endif(&ctx->ac, 6001); 1656 1657 /* Export per-vertex data (positions and parameters). */ 1658 ac_build_ifcc(&ctx->ac, is_es_thread, 6002); 1659 { 1660 struct radv_vs_output_info *outinfo = ctx->stage == MESA_SHADER_TESS_EVAL 1661 ? &ctx->args->shader_info->tes.outinfo 1662 : &ctx->args->shader_info->vs.outinfo; 1663 1664 /* Exporting the primitive ID is handled below. */ 1665 /* TODO: use the new VS export path */ 1666 handle_vs_outputs_post(ctx, false, outinfo->export_clip_dists, outinfo); 1667 1668 if (outinfo->export_prim_id) { 1669 LLVMValueRef values[4]; 1670 1671 if (ctx->stage == MESA_SHADER_VERTEX) { 1672 /* Wait for GS stores to finish. */ 1673 ac_build_s_barrier(&ctx->ac); 1674 1675 tmp = ac_build_gep0(&ctx->ac, ctx->esgs_ring, get_thread_id_in_tg(ctx)); 1676 values[0] = LLVMBuildLoad(builder, tmp, ""); 1677 } else { 1678 assert(ctx->stage == MESA_SHADER_TESS_EVAL); 1679 values[0] = ac_get_arg(&ctx->ac, ctx->args->ac.tes_patch_id); 1680 } 1681 1682 values[0] = ac_to_float(&ctx->ac, values[0]); 1683 for (unsigned j = 1; j < 4; j++) 1684 values[j] = ctx->ac.f32_0; 1685 1686 radv_export_param(ctx, outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID], values, 1687 0x1); 1688 } 1689 } 1690 ac_build_endif(&ctx->ac, 6002); 1691} 1692 1693static void 1694gfx10_ngg_gs_emit_prologue(struct radv_shader_context *ctx) 1695{ 1696 /* Zero out the part of LDS scratch that is used to accumulate the 1697 * per-stream generated primitive count. 1698 */ 1699 LLVMBuilderRef builder = ctx->ac.builder; 1700 LLVMValueRef scratchptr = ctx->gs_ngg_scratch; 1701 LLVMValueRef tid = get_thread_id_in_tg(ctx); 1702 LLVMBasicBlockRef merge_block; 1703 LLVMValueRef cond; 1704 1705 LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->ac.builder)); 1706 LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx->ac.context, fn, ""); 1707 merge_block = LLVMAppendBasicBlockInContext(ctx->ac.context, fn, ""); 1708 1709 cond = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), ""); 1710 LLVMBuildCondBr(ctx->ac.builder, cond, then_block, merge_block); 1711 LLVMPositionBuilderAtEnd(ctx->ac.builder, then_block); 1712 1713 LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid); 1714 LLVMBuildStore(builder, ctx->ac.i32_0, ptr); 1715 1716 LLVMBuildBr(ctx->ac.builder, merge_block); 1717 LLVMPositionBuilderAtEnd(ctx->ac.builder, merge_block); 1718 1719 ac_build_s_barrier(&ctx->ac); 1720} 1721 1722static void 1723gfx10_ngg_gs_emit_epilogue_1(struct radv_shader_context *ctx) 1724{ 1725 LLVMBuilderRef builder = ctx->ac.builder; 1726 LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false); 1727 LLVMValueRef tmp; 1728 1729 /* Zero out remaining (non-emitted) primitive flags. 1730 * 1731 * Note: Alternatively, we could pass the relevant gs_next_vertex to 1732 * the emit threads via LDS. This is likely worse in the expected 1733 * typical case where each GS thread emits the full set of 1734 * vertices. 1735 */ 1736 for (unsigned stream = 0; stream < 4; ++stream) { 1737 unsigned num_components; 1738 1739 num_components = ctx->args->shader_info->gs.num_stream_output_components[stream]; 1740 if (!num_components) 1741 continue; 1742 1743 const LLVMValueRef gsthread = get_thread_id_in_tg(ctx); 1744 1745 ac_build_bgnloop(&ctx->ac, 5100); 1746 1747 const LLVMValueRef vertexidx = LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], ""); 1748 tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx, 1749 LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), ""); 1750 ac_build_ifcc(&ctx->ac, tmp, 5101); 1751 ac_build_break(&ctx->ac); 1752 ac_build_endif(&ctx->ac, 5101); 1753 1754 tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, ""); 1755 LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]); 1756 1757 tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx); 1758 LLVMBuildStore(builder, i8_0, ngg_gs_get_emit_primflag_ptr(ctx, tmp, stream)); 1759 1760 ac_build_endloop(&ctx->ac, 5100); 1761 } 1762 1763 /* Accumulate generated primitives counts across the entire threadgroup. */ 1764 for (unsigned stream = 0; stream < 4; ++stream) { 1765 unsigned num_components; 1766 1767 num_components = ctx->args->shader_info->gs.num_stream_output_components[stream]; 1768 if (!num_components) 1769 continue; 1770 1771 LLVMValueRef numprims = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], ""); 1772 numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, ctx->ac.wave_size); 1773 1774 tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, ""); 1775 ac_build_ifcc(&ctx->ac, tmp, 5105); 1776 { 1777 LLVMBuildAtomicRMW( 1778 builder, LLVMAtomicRMWBinOpAdd, 1779 ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, LLVMConstInt(ctx->ac.i32, stream, false)), 1780 numprims, LLVMAtomicOrderingMonotonic, false); 1781 } 1782 ac_build_endif(&ctx->ac, 5105); 1783 } 1784} 1785 1786static void 1787gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) 1788{ 1789 const unsigned verts_per_prim = 1790 si_conv_gl_prim_to_vertices(ctx->shader->info.gs.output_primitive); 1791 LLVMBuilderRef builder = ctx->ac.builder; 1792 LLVMValueRef tmp, tmp2; 1793 1794 ac_build_s_barrier(&ctx->ac); 1795 1796 const LLVMValueRef tid = get_thread_id_in_tg(ctx); 1797 LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx); 1798 1799 /* Write shader query data. */ 1800 tmp = ac_get_arg(&ctx->ac, ctx->args->ngg_gs_state); 1801 tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); 1802 ac_build_ifcc(&ctx->ac, tmp, 5109); 1803 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), ""); 1804 ac_build_ifcc(&ctx->ac, tmp, 5110); 1805 { 1806 tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), ""); 1807 1808 ac_llvm_add_target_dep_function_attr(ctx->main_function, "amdgpu-gds-size", 256); 1809 1810 LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS); 1811 LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, ""); 1812 1813 const char *sync_scope = "workgroup-one-as"; 1814 1815 /* Use a plain GDS atomic to accumulate the number of generated 1816 * primitives. 1817 */ 1818 ac_build_atomic_rmw(&ctx->ac, LLVMAtomicRMWBinOpAdd, gdsbase, tmp, sync_scope); 1819 } 1820 ac_build_endif(&ctx->ac, 5110); 1821 ac_build_endif(&ctx->ac, 5109); 1822 1823 /* TODO: culling */ 1824 1825 /* Determine vertex liveness. */ 1826 LLVMValueRef vertliveptr = ac_build_alloca(&ctx->ac, ctx->ac.i1, "vertexlive"); 1827 1828 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); 1829 ac_build_ifcc(&ctx->ac, tmp, 5120); 1830 { 1831 for (unsigned i = 0; i < verts_per_prim; ++i) { 1832 const LLVMValueRef primidx = 1833 LLVMBuildAdd(builder, tid, LLVMConstInt(ctx->ac.i32, i, false), ""); 1834 1835 if (i > 0) { 1836 tmp = LLVMBuildICmp(builder, LLVMIntULT, primidx, num_emit_threads, ""); 1837 ac_build_ifcc(&ctx->ac, tmp, 5121 + i); 1838 } 1839 1840 /* Load primitive liveness */ 1841 tmp = ngg_gs_vertex_ptr(ctx, primidx); 1842 tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), ""); 1843 const LLVMValueRef primlive = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); 1844 1845 tmp = LLVMBuildLoad(builder, vertliveptr, ""); 1846 tmp = LLVMBuildOr(builder, tmp, primlive, ""), LLVMBuildStore(builder, tmp, vertliveptr); 1847 1848 if (i > 0) 1849 ac_build_endif(&ctx->ac, 5121 + i); 1850 } 1851 } 1852 ac_build_endif(&ctx->ac, 5120); 1853 1854 /* Inclusive scan addition across the current wave. */ 1855 LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, ""); 1856 struct ac_wg_scan vertlive_scan = {0}; 1857 vertlive_scan.op = nir_op_iadd; 1858 vertlive_scan.enable_reduce = true; 1859 vertlive_scan.enable_exclusive = true; 1860 vertlive_scan.src = vertlive; 1861 vertlive_scan.scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ctx->ac.i32_0); 1862 vertlive_scan.waveidx = get_wave_id_in_tg(ctx); 1863 vertlive_scan.numwaves = get_tgsize(ctx); 1864 vertlive_scan.maxwaves = 8; 1865 1866 ac_build_wg_scan(&ctx->ac, &vertlive_scan); 1867 1868 /* Skip all exports (including index exports) when possible. At least on 1869 * early gfx10 revisions this is also to avoid hangs. 1870 */ 1871 LLVMValueRef have_exports = 1872 LLVMBuildICmp(builder, LLVMIntNE, vertlive_scan.result_reduce, ctx->ac.i32_0, ""); 1873 num_emit_threads = LLVMBuildSelect(builder, have_exports, num_emit_threads, ctx->ac.i32_0, ""); 1874 1875 /* Allocate export space. Send this message as early as possible, to 1876 * hide the latency of the SQ <-> SPI roundtrip. 1877 * 1878 * Note: We could consider compacting primitives for export as well. 1879 * PA processes 1 non-null prim / clock, but it fetches 4 DW of 1880 * prim data per clock and skips null primitives at no additional 1881 * cost. So compacting primitives can only be beneficial when 1882 * there are 4 or more contiguous null primitives in the export 1883 * (in the common case of single-dword prim exports). 1884 */ 1885 ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), vertlive_scan.result_reduce, 1886 num_emit_threads); 1887 1888 /* Setup the reverse vertex compaction permutation. We re-use stream 1 1889 * of the primitive liveness flags, relying on the fact that each 1890 * threadgroup can have at most 256 threads. */ 1891 ac_build_ifcc(&ctx->ac, vertlive, 5130); 1892 { 1893 tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive); 1894 tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, ""); 1895 LLVMBuildStore(builder, tmp2, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1)); 1896 } 1897 ac_build_endif(&ctx->ac, 5130); 1898 1899 ac_build_s_barrier(&ctx->ac); 1900 1901 /* Export primitive data */ 1902 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); 1903 ac_build_ifcc(&ctx->ac, tmp, 5140); 1904 { 1905 LLVMValueRef flags; 1906 struct ac_ngg_prim prim = {0}; 1907 prim.num_vertices = verts_per_prim; 1908 1909 tmp = ngg_gs_vertex_ptr(ctx, tid); 1910 flags = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), ""); 1911 prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), ""); 1912 prim.edgeflags = ctx->ac.i32_0; 1913 1914 for (unsigned i = 0; i < verts_per_prim; ++i) { 1915 prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive, 1916 LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), ""); 1917 } 1918 1919 /* Geometry shaders output triangle strips, but NGG expects triangles. */ 1920 if (verts_per_prim == 3) { 1921 LLVMValueRef is_odd = LLVMBuildLShr(builder, flags, ctx->ac.i8_1, ""); 1922 is_odd = LLVMBuildTrunc(builder, is_odd, ctx->ac.i1, ""); 1923 1924 LLVMValueRef flatshade_first = 1925 LLVMConstInt(ctx->ac.i1, !ctx->args->options->key.vs.provoking_vtx_last, false); 1926 1927 ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, flatshade_first, prim.index); 1928 } 1929 1930 ac_build_export_prim(&ctx->ac, &prim); 1931 } 1932 ac_build_endif(&ctx->ac, 5140); 1933 1934 /* Export position and parameter data */ 1935 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, ""); 1936 ac_build_ifcc(&ctx->ac, tmp, 5145); 1937 { 1938 struct radv_vs_output_info *outinfo = &ctx->args->shader_info->vs.outinfo; 1939 bool export_view_index = ctx->args->options->key.has_multiview_view_index; 1940 struct radv_shader_output_values *outputs; 1941 unsigned noutput = 0; 1942 1943 /* Allocate a temporary array for the output values. */ 1944 unsigned num_outputs = util_bitcount64(ctx->output_mask) + export_view_index; 1945 outputs = calloc(num_outputs, sizeof(outputs[0])); 1946 1947 tmp = ngg_gs_vertex_ptr(ctx, tid); 1948 tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1), ""); 1949 tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, ""); 1950 const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp); 1951 1952 unsigned out_idx = 0; 1953 for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { 1954 unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i]; 1955 int length = util_last_bit(output_usage_mask); 1956 1957 if (!(ctx->output_mask & (1ull << i))) 1958 continue; 1959 1960 outputs[noutput].slot_name = i; 1961 outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1; 1962 outputs[noutput].usage_mask = output_usage_mask; 1963 1964 for (unsigned j = 0; j < length; j++, out_idx++) { 1965 if (!(output_usage_mask & (1 << j))) 1966 continue; 1967 1968 tmp = ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx); 1969 tmp = LLVMBuildLoad(builder, tmp, ""); 1970 1971 LLVMTypeRef type = LLVMGetAllocatedType(ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]); 1972 if (ac_get_type_size(type) == 2) { 1973 tmp = ac_to_integer(&ctx->ac, tmp); 1974 tmp = LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i16, ""); 1975 } 1976 1977 outputs[noutput].values[j] = ac_to_float(&ctx->ac, tmp); 1978 } 1979 1980 for (unsigned j = length; j < 4; j++) 1981 outputs[noutput].values[j] = LLVMGetUndef(ctx->ac.f32); 1982 1983 noutput++; 1984 } 1985 1986 /* Export ViewIndex. */ 1987 if (export_view_index) { 1988 outputs[noutput].slot_name = VARYING_SLOT_LAYER; 1989 outputs[noutput].slot_index = 0; 1990 outputs[noutput].usage_mask = 0x1; 1991 outputs[noutput].values[0] = 1992 ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.view_index)); 1993 for (unsigned j = 1; j < 4; j++) 1994 outputs[noutput].values[j] = ctx->ac.f32_0; 1995 noutput++; 1996 } 1997 1998 radv_llvm_export_vs(ctx, outputs, noutput, outinfo, outinfo->export_clip_dists); 1999 FREE(outputs); 2000 } 2001 ac_build_endif(&ctx->ac, 5145); 2002} 2003 2004static void 2005gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, unsigned stream, LLVMValueRef vertexidx, 2006 LLVMValueRef *addrs) 2007{ 2008 LLVMBuilderRef builder = ctx->ac.builder; 2009 LLVMValueRef tmp; 2010 2011 const LLVMValueRef vertexptr = ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx); 2012 unsigned out_idx = 0; 2013 for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { 2014 unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i]; 2015 uint8_t output_stream = ctx->args->shader_info->gs.output_streams[i]; 2016 LLVMValueRef *out_ptr = &addrs[i * 4]; 2017 int length = util_last_bit(output_usage_mask); 2018 2019 if (!(ctx->output_mask & (1ull << i)) || output_stream != stream) 2020 continue; 2021 2022 for (unsigned j = 0; j < length; j++, out_idx++) { 2023 if (!(output_usage_mask & (1 << j))) 2024 continue; 2025 2026 LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], ""); 2027 out_val = ac_to_integer(&ctx->ac, out_val); 2028 out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, ""); 2029 2030 LLVMBuildStore(builder, out_val, ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx)); 2031 } 2032 } 2033 assert(out_idx * 4 <= ctx->args->shader_info->gs.gsvs_vertex_size); 2034 2035 /* Store the current number of emitted vertices to zero out remaining 2036 * primitive flags in case the geometry shader doesn't emit the maximum 2037 * number of vertices. 2038 */ 2039 tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, ""); 2040 LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]); 2041 2042 /* Determine and store whether this vertex completed a primitive. */ 2043 const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], ""); 2044 2045 tmp = LLVMConstInt( 2046 ctx->ac.i32, si_conv_gl_prim_to_vertices(ctx->shader->info.gs.output_primitive) - 1, false); 2047 const LLVMValueRef iscompleteprim = LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, ""); 2048 2049 /* Since the geometry shader emits triangle strips, we need to 2050 * track which primitive is odd and swap vertex indices to get 2051 * the correct vertex order. 2052 */ 2053 LLVMValueRef is_odd = ctx->ac.i1false; 2054 if (stream == 0 && si_conv_gl_prim_to_vertices(ctx->shader->info.gs.output_primitive) == 3) { 2055 tmp = LLVMBuildAnd(builder, curverts, ctx->ac.i32_1, ""); 2056 is_odd = LLVMBuildICmp(builder, LLVMIntEQ, tmp, ctx->ac.i32_1, ""); 2057 } 2058 2059 tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, ""); 2060 LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]); 2061 2062 /* The per-vertex primitive flag encoding: 2063 * bit 0: whether this vertex finishes a primitive 2064 * bit 1: whether the primitive is odd (if we are emitting triangle strips) 2065 */ 2066 tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, ""); 2067 tmp = LLVMBuildOr( 2068 builder, tmp, 2069 LLVMBuildShl(builder, LLVMBuildZExt(builder, is_odd, ctx->ac.i8, ""), ctx->ac.i8_1, ""), ""); 2070 LLVMBuildStore(builder, tmp, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream)); 2071 2072 tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], ""); 2073 tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), ""); 2074 LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]); 2075} 2076 2077static bool 2078si_export_mrt_color(struct radv_shader_context *ctx, LLVMValueRef *color, unsigned index, 2079 struct ac_export_args *args) 2080{ 2081 /* Export */ 2082 si_llvm_init_export_args(ctx, color, 0xf, V_008DFC_SQ_EXP_MRT + index, args); 2083 if (!args->enabled_channels) 2084 return false; /* unnecessary NULL export */ 2085 2086 return true; 2087} 2088 2089static void 2090radv_export_mrt_z(struct radv_shader_context *ctx, LLVMValueRef depth, LLVMValueRef stencil, 2091 LLVMValueRef samplemask) 2092{ 2093 struct ac_export_args args; 2094 2095 ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args); 2096 2097 ac_build_export(&ctx->ac, &args); 2098} 2099 2100static void 2101handle_fs_outputs_post(struct radv_shader_context *ctx) 2102{ 2103 unsigned index = 0; 2104 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; 2105 struct ac_export_args color_args[8]; 2106 2107 for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { 2108 LLVMValueRef values[4]; 2109 2110 if (!(ctx->output_mask & (1ull << i))) 2111 continue; 2112 2113 if (i < FRAG_RESULT_DATA0) 2114 continue; 2115 2116 for (unsigned j = 0; j < 4; j++) 2117 values[j] = ac_to_float(&ctx->ac, radv_load_output(ctx, i, j)); 2118 2119 bool ret = si_export_mrt_color(ctx, values, i - FRAG_RESULT_DATA0, &color_args[index]); 2120 if (ret) 2121 index++; 2122 } 2123 2124 /* Process depth, stencil, samplemask. */ 2125 if (ctx->args->shader_info->ps.writes_z) { 2126 depth = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_DEPTH, 0)); 2127 } 2128 if (ctx->args->shader_info->ps.writes_stencil) { 2129 stencil = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_STENCIL, 0)); 2130 } 2131 if (ctx->args->shader_info->ps.writes_sample_mask) { 2132 samplemask = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_SAMPLE_MASK, 0)); 2133 } 2134 2135 /* Set the DONE bit on last non-null color export only if Z isn't 2136 * exported. 2137 */ 2138 if (index > 0 && !ctx->args->shader_info->ps.writes_z && 2139 !ctx->args->shader_info->ps.writes_stencil && 2140 !ctx->args->shader_info->ps.writes_sample_mask) { 2141 unsigned last = index - 1; 2142 2143 color_args[last].valid_mask = 1; /* whether the EXEC mask is valid */ 2144 color_args[last].done = 1; /* DONE bit */ 2145 } 2146 2147 /* Export PS outputs. */ 2148 for (unsigned i = 0; i < index; i++) 2149 ac_build_export(&ctx->ac, &color_args[i]); 2150 2151 if (depth || stencil || samplemask) 2152 radv_export_mrt_z(ctx, depth, stencil, samplemask); 2153 else if (!index) 2154 ac_build_export_null(&ctx->ac); 2155} 2156 2157static void 2158emit_gs_epilogue(struct radv_shader_context *ctx) 2159{ 2160 if (ctx->args->shader_info->is_ngg) { 2161 gfx10_ngg_gs_emit_epilogue_1(ctx); 2162 return; 2163 } 2164 2165 if (ctx->ac.chip_class >= GFX10) 2166 LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, ""); 2167 2168 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, ctx->gs_wave_id); 2169} 2170 2171static void 2172handle_shader_outputs_post(struct ac_shader_abi *abi) 2173{ 2174 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); 2175 2176 switch (ctx->stage) { 2177 case MESA_SHADER_VERTEX: 2178 if (ctx->args->shader_info->vs.as_ls) 2179 break; /* Lowered in NIR */ 2180 else if (ctx->args->shader_info->vs.as_es) 2181 break; /* Lowered in NIR */ 2182 else if (ctx->args->shader_info->is_ngg) 2183 break; 2184 else 2185 handle_vs_outputs_post(ctx, ctx->args->shader_info->vs.outinfo.export_prim_id, 2186 ctx->args->shader_info->vs.outinfo.export_clip_dists, 2187 &ctx->args->shader_info->vs.outinfo); 2188 break; 2189 case MESA_SHADER_FRAGMENT: 2190 handle_fs_outputs_post(ctx); 2191 break; 2192 case MESA_SHADER_GEOMETRY: 2193 emit_gs_epilogue(ctx); 2194 break; 2195 case MESA_SHADER_TESS_CTRL: 2196 break; /* Lowered in NIR */ 2197 case MESA_SHADER_TESS_EVAL: 2198 if (ctx->args->shader_info->tes.as_es) 2199 break; /* Lowered in NIR */ 2200 else if (ctx->args->shader_info->is_ngg) 2201 break; 2202 else 2203 handle_vs_outputs_post(ctx, ctx->args->shader_info->tes.outinfo.export_prim_id, 2204 ctx->args->shader_info->tes.outinfo.export_clip_dists, 2205 &ctx->args->shader_info->tes.outinfo); 2206 break; 2207 default: 2208 break; 2209 } 2210} 2211 2212static void 2213ac_llvm_finalize_module(struct radv_shader_context *ctx, LLVMPassManagerRef passmgr, 2214 const struct radv_nir_compiler_options *options) 2215{ 2216 LLVMRunPassManager(passmgr, ctx->ac.module); 2217 LLVMDisposeBuilder(ctx->ac.builder); 2218 2219 ac_llvm_context_dispose(&ctx->ac); 2220} 2221 2222static void 2223ac_nir_eliminate_const_vs_outputs(struct radv_shader_context *ctx) 2224{ 2225 struct radv_vs_output_info *outinfo; 2226 2227 switch (ctx->stage) { 2228 case MESA_SHADER_FRAGMENT: 2229 case MESA_SHADER_COMPUTE: 2230 case MESA_SHADER_TESS_CTRL: 2231 case MESA_SHADER_GEOMETRY: 2232 return; 2233 case MESA_SHADER_VERTEX: 2234 if (ctx->args->shader_info->vs.as_ls || 2235 ctx->args->shader_info->vs.as_es) 2236 return; 2237 outinfo = &ctx->args->shader_info->vs.outinfo; 2238 break; 2239 case MESA_SHADER_TESS_EVAL: 2240 if (ctx->args->shader_info->tes.as_es) 2241 return; 2242 outinfo = &ctx->args->shader_info->tes.outinfo; 2243 break; 2244 default: 2245 unreachable("Unhandled shader type"); 2246 } 2247 2248 ac_optimize_vs_outputs(&ctx->ac, ctx->main_function, outinfo->vs_output_param_offset, 2249 VARYING_SLOT_MAX, 0, &outinfo->param_exports); 2250} 2251 2252static void 2253ac_setup_rings(struct radv_shader_context *ctx) 2254{ 2255 if (ctx->args->options->chip_class <= GFX8 && 2256 (ctx->stage == MESA_SHADER_GEOMETRY || 2257 (ctx->stage == MESA_SHADER_VERTEX && ctx->args->shader_info->vs.as_es) || 2258 (ctx->stage == MESA_SHADER_TESS_EVAL && ctx->args->shader_info->tes.as_es))) { 2259 unsigned ring = ctx->stage == MESA_SHADER_GEOMETRY ? RING_ESGS_GS : RING_ESGS_VS; 2260 LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, false); 2261 2262 ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, offset); 2263 } 2264 2265 if (ctx->args->is_gs_copy_shader) { 2266 ctx->gsvs_ring[0] = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, 2267 LLVMConstInt(ctx->ac.i32, RING_GSVS_VS, false)); 2268 } 2269 2270 if (ctx->stage == MESA_SHADER_GEOMETRY) { 2271 /* The conceptual layout of the GSVS ring is 2272 * v0c0 .. vLv0 v0c1 .. vLc1 .. 2273 * but the real memory layout is swizzled across 2274 * threads: 2275 * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL 2276 * t16v0c0 .. 2277 * Override the buffer descriptor accordingly. 2278 */ 2279 LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2); 2280 uint64_t stream_offset = 0; 2281 unsigned num_records = ctx->ac.wave_size; 2282 LLVMValueRef base_ring; 2283 2284 base_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, 2285 LLVMConstInt(ctx->ac.i32, RING_GSVS_GS, false)); 2286 2287 for (unsigned stream = 0; stream < 4; stream++) { 2288 unsigned num_components, stride; 2289 LLVMValueRef ring, tmp; 2290 2291 num_components = ctx->args->shader_info->gs.num_stream_output_components[stream]; 2292 2293 if (!num_components) 2294 continue; 2295 2296 stride = 4 * num_components * ctx->shader->info.gs.vertices_out; 2297 2298 /* Limit on the stride field for <= GFX7. */ 2299 assert(stride < (1 << 14)); 2300 2301 ring = LLVMBuildBitCast(ctx->ac.builder, base_ring, v2i64, ""); 2302 tmp = LLVMBuildExtractElement(ctx->ac.builder, ring, ctx->ac.i32_0, ""); 2303 tmp = LLVMBuildAdd(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i64, stream_offset, 0), ""); 2304 ring = LLVMBuildInsertElement(ctx->ac.builder, ring, tmp, ctx->ac.i32_0, ""); 2305 2306 stream_offset += stride * ctx->ac.wave_size; 2307 2308 ring = LLVMBuildBitCast(ctx->ac.builder, ring, ctx->ac.v4i32, ""); 2309 2310 tmp = LLVMBuildExtractElement(ctx->ac.builder, ring, ctx->ac.i32_1, ""); 2311 tmp = LLVMBuildOr(ctx->ac.builder, tmp, 2312 LLVMConstInt(ctx->ac.i32, S_008F04_STRIDE(stride), false), ""); 2313 ring = LLVMBuildInsertElement(ctx->ac.builder, ring, tmp, ctx->ac.i32_1, ""); 2314 2315 ring = LLVMBuildInsertElement(ctx->ac.builder, ring, 2316 LLVMConstInt(ctx->ac.i32, num_records, false), 2317 LLVMConstInt(ctx->ac.i32, 2, false), ""); 2318 2319 ctx->gsvs_ring[stream] = ring; 2320 } 2321 } 2322 2323 if (ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_TESS_EVAL) { 2324 ctx->hs_ring_tess_offchip = ac_build_load_to_sgpr( 2325 &ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_HS_TESS_OFFCHIP, false)); 2326 ctx->hs_ring_tess_factor = ac_build_load_to_sgpr( 2327 &ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_HS_TESS_FACTOR, false)); 2328 } 2329} 2330 2331/* Fixup the HW not emitting the TCS regs if there are no HS threads. */ 2332static void 2333ac_nir_fixup_ls_hs_input_vgprs(struct radv_shader_context *ctx) 2334{ 2335 LLVMValueRef count = 2336 ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 8, 8); 2337 LLVMValueRef hs_empty = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, count, ctx->ac.i32_0, ""); 2338 ctx->abi.instance_id = 2339 LLVMBuildSelect(ctx->ac.builder, hs_empty, ac_get_arg(&ctx->ac, ctx->args->ac.vertex_id), 2340 ctx->abi.instance_id, ""); 2341 ctx->vs_rel_patch_id = 2342 LLVMBuildSelect(ctx->ac.builder, hs_empty, ac_get_arg(&ctx->ac, ctx->args->ac.tcs_rel_ids), 2343 ctx->vs_rel_patch_id, ""); 2344 ctx->abi.vertex_id = 2345 LLVMBuildSelect(ctx->ac.builder, hs_empty, ac_get_arg(&ctx->ac, ctx->args->ac.tcs_patch_id), 2346 ctx->abi.vertex_id, ""); 2347} 2348 2349static void 2350prepare_gs_input_vgprs(struct radv_shader_context *ctx, bool merged) 2351{ 2352 if (merged) { 2353 for (int i = 5; i >= 0; --i) { 2354 ctx->gs_vtx_offset[i] = ac_unpack_param( 2355 &ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[i / 2]), (i & 1) * 16, 16); 2356 } 2357 2358 ctx->gs_wave_id = 2359 ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 16, 8); 2360 } else { 2361 for (int i = 0; i < 6; i++) 2362 ctx->gs_vtx_offset[i] = ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[i]); 2363 ctx->gs_wave_id = ac_get_arg(&ctx->ac, ctx->args->ac.gs_wave_id); 2364 } 2365} 2366 2367/* Ensure that the esgs ring is declared. 2368 * 2369 * We declare it with 64KB alignment as a hint that the 2370 * pointer value will always be 0. 2371 */ 2372static void 2373declare_esgs_ring(struct radv_shader_context *ctx) 2374{ 2375 if (ctx->esgs_ring) 2376 return; 2377 2378 assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring")); 2379 2380 ctx->esgs_ring = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), 2381 "esgs_ring", AC_ADDR_SPACE_LDS); 2382 LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage); 2383 LLVMSetAlignment(ctx->esgs_ring, 64 * 1024); 2384} 2385 2386static LLVMModuleRef 2387ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, struct nir_shader *const *shaders, 2388 int shader_count, const struct radv_shader_args *args) 2389{ 2390 struct radv_shader_context ctx = {0}; 2391 ctx.args = args; 2392 2393 enum ac_float_mode float_mode = AC_FLOAT_MODE_DEFAULT; 2394 2395 if (shaders[0]->info.float_controls_execution_mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32) { 2396 float_mode = AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO; 2397 } 2398 2399 ac_llvm_context_init(&ctx.ac, ac_llvm, args->options->chip_class, args->options->family, 2400 args->options->info, float_mode, args->shader_info->wave_size, 2401 args->shader_info->ballot_bit_size); 2402 ctx.context = ctx.ac.context; 2403 2404 ctx.max_workgroup_size = args->shader_info->workgroup_size; 2405 2406 if (ctx.ac.chip_class >= GFX10) { 2407 if (is_pre_gs_stage(shaders[0]->info.stage) && args->shader_info->is_ngg) { 2408 ctx.max_workgroup_size = 128; 2409 } 2410 } 2411 2412 create_function(&ctx, shaders[shader_count - 1]->info.stage, shader_count >= 2); 2413 2414 ctx.abi.emit_outputs = handle_shader_outputs_post; 2415 ctx.abi.emit_vertex_with_counter = visit_emit_vertex_with_counter; 2416 ctx.abi.load_ubo = radv_load_ubo; 2417 ctx.abi.load_ssbo = radv_load_ssbo; 2418 ctx.abi.load_sampler_desc = radv_get_sampler_desc; 2419 ctx.abi.load_resource = radv_load_resource; 2420 ctx.abi.load_ring_tess_factors = load_ring_tess_factors; 2421 ctx.abi.load_ring_tess_offchip = load_ring_tess_offchip; 2422 ctx.abi.load_ring_esgs = load_ring_esgs; 2423 ctx.abi.clamp_shadow_reference = false; 2424 ctx.abi.adjust_frag_coord_z = args->options->adjust_frag_coord_z; 2425 ctx.abi.robust_buffer_access = args->options->robust_buffer_access; 2426 2427 bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) && args->shader_info->is_ngg; 2428 if (shader_count >= 2 || is_ngg) 2429 ac_init_exec_full_mask(&ctx.ac); 2430 2431 if (args->ac.vertex_id.used) 2432 ctx.abi.vertex_id = ac_get_arg(&ctx.ac, args->ac.vertex_id); 2433 if (args->ac.vs_rel_patch_id.used) 2434 ctx.vs_rel_patch_id = ac_get_arg(&ctx.ac, args->ac.vs_rel_patch_id); 2435 if (args->ac.instance_id.used) 2436 ctx.abi.instance_id = ac_get_arg(&ctx.ac, args->ac.instance_id); 2437 2438 if (args->options->has_ls_vgpr_init_bug && 2439 shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL) 2440 ac_nir_fixup_ls_hs_input_vgprs(&ctx); 2441 2442 if (is_ngg) { 2443 /* Declare scratch space base for streamout and vertex 2444 * compaction. Whether space is actually allocated is 2445 * determined during linking / PM4 creation. 2446 * 2447 * Add an extra dword per vertex to ensure an odd stride, which 2448 * avoids bank conflicts for SoA accesses. 2449 */ 2450 if (!args->shader_info->is_ngg_passthrough) 2451 declare_esgs_ring(&ctx); 2452 2453 /* GFX10 hang workaround - there needs to be an s_barrier before gs_alloc_req always */ 2454 if (ctx.ac.chip_class == GFX10 && shader_count == 1) 2455 ac_build_s_barrier(&ctx.ac); 2456 } 2457 2458 for (int shader_idx = 0; shader_idx < shader_count; ++shader_idx) { 2459 ctx.stage = shaders[shader_idx]->info.stage; 2460 ctx.shader = shaders[shader_idx]; 2461 ctx.output_mask = 0; 2462 2463 if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY) { 2464 for (int i = 0; i < 4; i++) { 2465 ctx.gs_next_vertex[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, ""); 2466 } 2467 if (args->shader_info->is_ngg) { 2468 for (unsigned i = 0; i < 4; ++i) { 2469 ctx.gs_curprim_verts[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, ""); 2470 ctx.gs_generated_prims[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, ""); 2471 } 2472 2473 LLVMTypeRef ai32 = LLVMArrayType(ctx.ac.i32, 8); 2474 ctx.gs_ngg_scratch = 2475 LLVMAddGlobalInAddressSpace(ctx.ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS); 2476 LLVMSetInitializer(ctx.gs_ngg_scratch, LLVMGetUndef(ai32)); 2477 LLVMSetAlignment(ctx.gs_ngg_scratch, 4); 2478 2479 ctx.gs_ngg_emit = LLVMAddGlobalInAddressSpace( 2480 ctx.ac.module, LLVMArrayType(ctx.ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS); 2481 LLVMSetLinkage(ctx.gs_ngg_emit, LLVMExternalLinkage); 2482 LLVMSetAlignment(ctx.gs_ngg_emit, 4); 2483 } 2484 2485 ctx.abi.emit_primitive = visit_end_primitive; 2486 } else if (shaders[shader_idx]->info.stage == MESA_SHADER_TESS_EVAL) { 2487 } else if (shaders[shader_idx]->info.stage == MESA_SHADER_VERTEX) { 2488 ctx.abi.load_base_vertex = radv_load_base_vertex; 2489 ctx.abi.load_inputs = radv_load_vs_inputs; 2490 } else if (shaders[shader_idx]->info.stage == MESA_SHADER_FRAGMENT) { 2491 ctx.abi.load_sample_position = load_sample_position; 2492 ctx.abi.load_sample_mask_in = load_sample_mask_in; 2493 } 2494 2495 if (shaders[shader_idx]->info.stage == MESA_SHADER_VERTEX && 2496 args->shader_info->is_ngg && 2497 args->shader_info->vs.outinfo.export_prim_id) { 2498 declare_esgs_ring(&ctx); 2499 } 2500 2501 bool nested_barrier = false; 2502 2503 if (shader_idx) { 2504 if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY && 2505 args->shader_info->is_ngg) { 2506 gfx10_ngg_gs_emit_prologue(&ctx); 2507 nested_barrier = false; 2508 } else { 2509 nested_barrier = true; 2510 } 2511 } 2512 2513 if (nested_barrier) { 2514 /* Execute a barrier before the second shader in 2515 * a merged shader. 2516 * 2517 * Execute the barrier inside the conditional block, 2518 * so that empty waves can jump directly to s_endpgm, 2519 * which will also signal the barrier. 2520 * 2521 * This is possible in gfx9, because an empty wave 2522 * for the second shader does not participate in 2523 * the epilogue. With NGG, empty waves may still 2524 * be required to export data (e.g. GS output vertices), 2525 * so we cannot let them exit early. 2526 * 2527 * If the shader is TCS and the TCS epilog is present 2528 * and contains a barrier, it will wait there and then 2529 * reach s_endpgm. 2530 */ 2531 ac_emit_barrier(&ctx.ac, ctx.stage); 2532 } 2533 2534 nir_foreach_shader_out_variable(variable, shaders[shader_idx]) scan_shader_output_decl( 2535 &ctx, variable, shaders[shader_idx], shaders[shader_idx]->info.stage); 2536 2537 ac_setup_rings(&ctx); 2538 2539 LLVMBasicBlockRef merge_block = NULL; 2540 if (shader_count >= 2 || is_ngg) { 2541 LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder)); 2542 LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, ""); 2543 merge_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, ""); 2544 2545 LLVMValueRef count = ac_unpack_param( 2546 &ctx.ac, ac_get_arg(&ctx.ac, args->ac.merged_wave_info), 8 * shader_idx, 8); 2547 LLVMValueRef thread_id = ac_get_thread_id(&ctx.ac); 2548 LLVMValueRef cond = LLVMBuildICmp(ctx.ac.builder, LLVMIntULT, thread_id, count, ""); 2549 LLVMBuildCondBr(ctx.ac.builder, cond, then_block, merge_block); 2550 2551 LLVMPositionBuilderAtEnd(ctx.ac.builder, then_block); 2552 } 2553 2554 if (shaders[shader_idx]->info.stage == MESA_SHADER_FRAGMENT) 2555 prepare_interp_optimize(&ctx, shaders[shader_idx]); 2556 else if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY) 2557 prepare_gs_input_vgprs(&ctx, shader_count >= 2); 2558 2559 ac_nir_translate(&ctx.ac, &ctx.abi, &args->ac, shaders[shader_idx]); 2560 2561 if (shader_count >= 2 || is_ngg) { 2562 LLVMBuildBr(ctx.ac.builder, merge_block); 2563 LLVMPositionBuilderAtEnd(ctx.ac.builder, merge_block); 2564 } 2565 2566 /* This needs to be outside the if wrapping the shader body, as sometimes 2567 * the HW generates waves with 0 es/vs threads. */ 2568 if (is_pre_gs_stage(shaders[shader_idx]->info.stage) && 2569 args->shader_info->is_ngg && shader_idx == shader_count - 1) { 2570 handle_ngg_outputs_post_2(&ctx); 2571 } else if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY && 2572 args->shader_info->is_ngg) { 2573 gfx10_ngg_gs_emit_epilogue_2(&ctx); 2574 } 2575 } 2576 2577 LLVMBuildRetVoid(ctx.ac.builder); 2578 2579 if (args->options->dump_preoptir) { 2580 fprintf(stderr, "%s LLVM IR:\n\n", 2581 radv_get_shader_name(args->shader_info, shaders[shader_count - 1]->info.stage)); 2582 ac_dump_module(ctx.ac.module); 2583 fprintf(stderr, "\n"); 2584 } 2585 2586 ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, args->options); 2587 2588 if (shader_count == 1) 2589 ac_nir_eliminate_const_vs_outputs(&ctx); 2590 2591 return ctx.ac.module; 2592} 2593 2594static void 2595ac_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context) 2596{ 2597 unsigned *retval = (unsigned *)context; 2598 LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di); 2599 char *description = LLVMGetDiagInfoDescription(di); 2600 2601 if (severity == LLVMDSError) { 2602 *retval = 1; 2603 fprintf(stderr, "LLVM triggered Diagnostic Handler: %s\n", description); 2604 } 2605 2606 LLVMDisposeMessage(description); 2607} 2608 2609static unsigned 2610radv_llvm_compile(LLVMModuleRef M, char **pelf_buffer, size_t *pelf_size, 2611 struct ac_llvm_compiler *ac_llvm) 2612{ 2613 unsigned retval = 0; 2614 LLVMContextRef llvm_ctx; 2615 2616 /* Setup Diagnostic Handler*/ 2617 llvm_ctx = LLVMGetModuleContext(M); 2618 2619 LLVMContextSetDiagnosticHandler(llvm_ctx, ac_diagnostic_handler, &retval); 2620 2621 /* Compile IR*/ 2622 if (!radv_compile_to_elf(ac_llvm, M, pelf_buffer, pelf_size)) 2623 retval = 1; 2624 return retval; 2625} 2626 2627static void 2628ac_compile_llvm_module(struct ac_llvm_compiler *ac_llvm, LLVMModuleRef llvm_module, 2629 struct radv_shader_binary **rbinary, gl_shader_stage stage, const char *name, 2630 const struct radv_nir_compiler_options *options) 2631{ 2632 char *elf_buffer = NULL; 2633 size_t elf_size = 0; 2634 char *llvm_ir_string = NULL; 2635 2636 if (options->dump_shader) { 2637 fprintf(stderr, "%s LLVM IR:\n\n", name); 2638 ac_dump_module(llvm_module); 2639 fprintf(stderr, "\n"); 2640 } 2641 2642 if (options->record_ir) { 2643 char *llvm_ir = LLVMPrintModuleToString(llvm_module); 2644 llvm_ir_string = strdup(llvm_ir); 2645 LLVMDisposeMessage(llvm_ir); 2646 } 2647 2648 int v = radv_llvm_compile(llvm_module, &elf_buffer, &elf_size, ac_llvm); 2649 if (v) { 2650 fprintf(stderr, "compile failed\n"); 2651 } 2652 2653 LLVMContextRef ctx = LLVMGetModuleContext(llvm_module); 2654 LLVMDisposeModule(llvm_module); 2655 LLVMContextDispose(ctx); 2656 2657 size_t llvm_ir_size = llvm_ir_string ? strlen(llvm_ir_string) : 0; 2658 size_t alloc_size = sizeof(struct radv_shader_binary_rtld) + elf_size + llvm_ir_size + 1; 2659 struct radv_shader_binary_rtld *rbin = calloc(1, alloc_size); 2660 memcpy(rbin->data, elf_buffer, elf_size); 2661 if (llvm_ir_string) 2662 memcpy(rbin->data + elf_size, llvm_ir_string, llvm_ir_size + 1); 2663 2664 rbin->base.type = RADV_BINARY_TYPE_RTLD; 2665 rbin->base.stage = stage; 2666 rbin->base.total_size = alloc_size; 2667 rbin->elf_size = elf_size; 2668 rbin->llvm_ir_size = llvm_ir_size; 2669 *rbinary = &rbin->base; 2670 2671 free(llvm_ir_string); 2672 free(elf_buffer); 2673} 2674 2675static void 2676radv_compile_nir_shader(struct ac_llvm_compiler *ac_llvm, struct radv_shader_binary **rbinary, 2677 const struct radv_shader_args *args, struct nir_shader *const *nir, 2678 int nir_count) 2679{ 2680 2681 LLVMModuleRef llvm_module; 2682 2683 llvm_module = ac_translate_nir_to_llvm(ac_llvm, nir, nir_count, args); 2684 2685 ac_compile_llvm_module(ac_llvm, llvm_module, rbinary, nir[nir_count - 1]->info.stage, 2686 radv_get_shader_name(args->shader_info, nir[nir_count - 1]->info.stage), 2687 args->options); 2688} 2689 2690static void 2691ac_gs_copy_shader_emit(struct radv_shader_context *ctx) 2692{ 2693 LLVMValueRef vtx_offset = 2694 LLVMBuildMul(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->ac.vertex_id), 2695 LLVMConstInt(ctx->ac.i32, 4, false), ""); 2696 LLVMValueRef stream_id; 2697 2698 /* Fetch the vertex stream ID. */ 2699 if (ctx->args->shader_info->so.num_outputs) { 2700 stream_id = 2701 ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.streamout_config), 24, 2); 2702 } else { 2703 stream_id = ctx->ac.i32_0; 2704 } 2705 2706 LLVMBasicBlockRef end_bb; 2707 LLVMValueRef switch_inst; 2708 2709 end_bb = LLVMAppendBasicBlockInContext(ctx->ac.context, ctx->main_function, "end"); 2710 switch_inst = LLVMBuildSwitch(ctx->ac.builder, stream_id, end_bb, 4); 2711 2712 for (unsigned stream = 0; stream < 4; stream++) { 2713 unsigned num_components = ctx->args->shader_info->gs.num_stream_output_components[stream]; 2714 LLVMBasicBlockRef bb; 2715 unsigned offset; 2716 2717 if (stream > 0 && !num_components) 2718 continue; 2719 2720 if (stream > 0 && !ctx->args->shader_info->so.num_outputs) 2721 continue; 2722 2723 bb = LLVMInsertBasicBlockInContext(ctx->ac.context, end_bb, "out"); 2724 LLVMAddCase(switch_inst, LLVMConstInt(ctx->ac.i32, stream, 0), bb); 2725 LLVMPositionBuilderAtEnd(ctx->ac.builder, bb); 2726 2727 offset = 0; 2728 for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { 2729 unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i]; 2730 unsigned output_stream = ctx->args->shader_info->gs.output_streams[i]; 2731 int length = util_last_bit(output_usage_mask); 2732 2733 if (!(ctx->output_mask & (1ull << i)) || output_stream != stream) 2734 continue; 2735 2736 for (unsigned j = 0; j < length; j++) { 2737 LLVMValueRef value, soffset; 2738 2739 if (!(output_usage_mask & (1 << j))) 2740 continue; 2741 2742 soffset = LLVMConstInt(ctx->ac.i32, offset * ctx->shader->info.gs.vertices_out * 16 * 4, 2743 false); 2744 2745 offset++; 2746 2747 value = ac_build_buffer_load(&ctx->ac, ctx->gsvs_ring[0], 1, ctx->ac.i32_0, vtx_offset, 2748 soffset, 0, ctx->ac.f32, ac_glc | ac_slc, true, false); 2749 2750 LLVMTypeRef type = LLVMGetAllocatedType(ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]); 2751 if (ac_get_type_size(type) == 2) { 2752 value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->ac.i32, ""); 2753 value = LLVMBuildTrunc(ctx->ac.builder, value, ctx->ac.i16, ""); 2754 } 2755 2756 LLVMBuildStore(ctx->ac.builder, ac_to_float(&ctx->ac, value), 2757 ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]); 2758 } 2759 } 2760 2761 if (ctx->args->shader_info->so.num_outputs) 2762 radv_emit_streamout(ctx, stream); 2763 2764 if (stream == 0) { 2765 handle_vs_outputs_post(ctx, false, ctx->args->shader_info->vs.outinfo.export_clip_dists, 2766 &ctx->args->shader_info->vs.outinfo); 2767 } 2768 2769 LLVMBuildBr(ctx->ac.builder, end_bb); 2770 } 2771 2772 LLVMPositionBuilderAtEnd(ctx->ac.builder, end_bb); 2773} 2774 2775static void 2776radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm, struct nir_shader *geom_shader, 2777 struct radv_shader_binary **rbinary, 2778 const struct radv_shader_args *args) 2779{ 2780 struct radv_shader_context ctx = {0}; 2781 ctx.args = args; 2782 2783 assert(args->is_gs_copy_shader); 2784 2785 ac_llvm_context_init(&ctx.ac, ac_llvm, args->options->chip_class, args->options->family, 2786 args->options->info, AC_FLOAT_MODE_DEFAULT, 64, 64); 2787 ctx.context = ctx.ac.context; 2788 2789 ctx.stage = MESA_SHADER_VERTEX; 2790 ctx.shader = geom_shader; 2791 2792 create_function(&ctx, MESA_SHADER_VERTEX, false); 2793 2794 ac_setup_rings(&ctx); 2795 2796 nir_foreach_shader_out_variable(variable, geom_shader) 2797 { 2798 scan_shader_output_decl(&ctx, variable, geom_shader, MESA_SHADER_VERTEX); 2799 ac_handle_shader_output_decl(&ctx.ac, &ctx.abi, geom_shader, variable, MESA_SHADER_VERTEX); 2800 } 2801 2802 ac_gs_copy_shader_emit(&ctx); 2803 2804 LLVMBuildRetVoid(ctx.ac.builder); 2805 2806 ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, args->options); 2807 2808 ac_compile_llvm_module(ac_llvm, ctx.ac.module, rbinary, MESA_SHADER_VERTEX, "GS Copy Shader", 2809 args->options); 2810 (*rbinary)->is_gs_copy_shader = true; 2811} 2812 2813void 2814llvm_compile_shader(struct radv_device *device, unsigned shader_count, 2815 struct nir_shader *const *shaders, struct radv_shader_binary **binary, 2816 struct radv_shader_args *args) 2817{ 2818 enum ac_target_machine_options tm_options = 0; 2819 struct ac_llvm_compiler ac_llvm; 2820 2821 tm_options |= AC_TM_SUPPORTS_SPILL; 2822 if (args->options->check_ir) 2823 tm_options |= AC_TM_CHECK_IR; 2824 2825 radv_init_llvm_compiler(&ac_llvm, args->options->family, tm_options, 2826 args->shader_info->wave_size); 2827 2828 if (args->is_gs_copy_shader) { 2829 radv_compile_gs_copy_shader(&ac_llvm, *shaders, binary, args); 2830 } else { 2831 radv_compile_nir_shader(&ac_llvm, binary, args, shaders, shader_count); 2832 } 2833} 2834