/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_pipe.h"
#include "si_shader_internal.h"
#include "sid.h"
#include "util/u_memory.h"

/* Build an i1 that is true when the current lane index is below the ES
 * thread count, which is packed into bits [0:8) of merged_wave_info.
 */
LLVMValueRef si_is_es_thread(struct si_shader_context *ctx)
{
   /* Return true if the current thread should execute an ES thread. */
   return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac),
                        si_unpack_param(ctx, ctx->args.merged_wave_info, 0, 8), "");
}

/* Build an i1 that is true when the current lane index is below the GS
 * thread count, which is packed into bits [8:16) of merged_wave_info.
 */
LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx)
{
   /* Return true if the current thread should execute a GS thread.
    */
   return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac),
                        si_unpack_param(ctx, ctx->args.merged_wave_info, 8, 8), "");
}

/* Load one dword channel of a GS input (i.e. an ES output) from the ESGS ring.
 *
 * input_index      - index into info->input[] identifying the attribute
 * vtx_offset_param - which incoming primitive vertex to read
 * type             - LLVM type the loaded dword is bitcast to before returning
 * swizzle          - channel (0..3) within the attribute
 */
static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, unsigned input_index,
                                          unsigned vtx_offset_param, LLVMTypeRef type,
                                          unsigned swizzle)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader *shader = ctx->shader;
   LLVMValueRef vtx_offset, soffset;
   struct si_shader_info *info = &shader->selector->info;
   unsigned param;
   LLVMValueRef value;

   /* Map the semantic to its slot in the ESGS layout shared by ES and GS. */
   param = si_shader_io_get_unique_index(info->input[input_index].semantic, false);

   /* GFX9 has the ESGS ring in LDS. */
   if (ctx->screen->info.chip_class >= GFX9) {
      /* Each gs_vtx_offset argument packs two 16-bit vertex offsets;
       * select the right half by index parity.
       */
      unsigned index = vtx_offset_param;
      vtx_offset =
         si_unpack_param(ctx, ctx->args.gs_vtx_offset[index / 2], (index & 1) * 16, 16);

      /* Dword offset of the requested channel within the vertex. */
      unsigned offset = param * 4 + swizzle;
      vtx_offset =
         LLVMBuildAdd(ctx->ac.builder, vtx_offset, LLVMConstInt(ctx->ac.i32, offset, false), "");

      LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset);
      LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, ptr, "");
      return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
   }

   /* GFX6: input load from the ESGS ring in memory. */
   /* Get the vertex offset parameter on GFX6.
    */
   LLVMValueRef gs_vtx_offset = ac_get_arg(&ctx->ac, ctx->args.gs_vtx_offset[vtx_offset_param]);

   /* Scale the per-vertex offset from dwords to bytes. */
   vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset, LLVMConstInt(ctx->ac.i32, 4, 0), "");

   /* Constant part of the address selecting the attribute channel slot
    * in the swizzled ring layout.
    */
   soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle) * 256, 0);

   value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->ac.i32_0, vtx_offset, soffset, 0,
                                ctx->ac.f32, ac_glc, true, false);
   return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
}

/* ac_shader_abi::load_inputs callback: load num_components consecutive
 * channels (starting at 'component') of one GS input vertex and gather
 * them into a vector value.
 */
static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi,
                                         unsigned driver_location, unsigned component,
                                         unsigned num_components, unsigned vertex_index,
                                         LLVMTypeRef type)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);

   LLVMValueRef value[4];
   for (unsigned i = component; i < component + num_components; i++) {
      value[i] = si_llvm_load_input_gs(&ctx->abi, driver_location,
                                       vertex_index, type, i);
   }

   return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
}

/* Pass GS inputs from ES to GS on GFX9.
 */
static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
{
   /* In merged ES+GS shaders the ES part hands its SGPR/VGPR inputs to the
    * GS part through the function return value.  The slot indices used
    * below correspond to the GS part's input arguments — NOTE(review):
    * verify against the GS argument layout if that layout changes.
    */
   if (!ctx->shader->is_monolithic)
      ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);

   LLVMValueRef ret = ctx->return_value;

   ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
   ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
   /* Slot 2 carries either the NGG threadgroup info or the GS->VS offset. */
   if (ctx->shader->key.as_ngg)
      ret = si_insert_input_ptr(ctx, ret, ctx->args.gs_tg_info, 2);
   else
      ret = si_insert_input_ret(ctx, ret, ctx->args.gs2vs_offset, 2);
   ret = si_insert_input_ret(ctx, ret, ctx->args.merged_wave_info, 3);
   ret = si_insert_input_ret(ctx, ret, ctx->args.scratch_offset, 5);

   ret = si_insert_input_ptr(ctx, ret, ctx->internal_bindings, 8 + SI_SGPR_INTERNAL_BINDINGS);
   ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images,
                             8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
   if (ctx->screen->use_ngg) {
      ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);
   }

   unsigned vgpr = 8 + SI_NUM_VS_STATE_RESOURCE_SGPRS;

   /* VGPR inputs (vertex offsets, primitive ID, invocation ID) are
    * returned through float slots.
    */
   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_vtx_offset[0], vgpr++);
   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_vtx_offset[1], vgpr++);
   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_vtx_offset[2], vgpr++);
   ctx->return_value = ret;
}

/* ES epilogue: store all ES outputs into the ESGS ring — LDS on GFX9+,
 * a memory buffer addressed via es2gs_offset on GFX6-8.  VIEWPORT and
 * LAYER outputs are skipped (not consumed through the ESGS ring).
 * On GFX9+ this also forwards the ES inputs to the GS part via the
 * return value.
 */
void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader *es = ctx->shader;
   struct si_shader_info *info = &es->selector->info;
   LLVMValueRef *addrs = abi->outputs;
   LLVMValueRef lds_base = NULL;
   unsigned chan;
   int i;

   if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) {
      unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
      LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
      /* Wave index within the threadgroup: bits [24:28) of merged_wave_info. */
      LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->args.merged_wave_info, 24, 4);
      /* Global vertex index = wave_idx * wave_size + lane id (OR is safe
       * because the lane id occupies the low bits).
       */
      vertex_idx =
         LLVMBuildOr(ctx->ac.builder, vertex_idx,
                     LLVMBuildMul(ctx->ac.builder, wave_idx,
                                  LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), ""),
                     "");
      lds_base =
         LLVMBuildMul(ctx->ac.builder, vertex_idx, LLVMConstInt(ctx->ac.i32, itemsize_dw, 0), "");
   }

   for (i = 0; i < info->num_outputs; i++) {
      int param;

      if (info->output_semantic[i] == VARYING_SLOT_VIEWPORT ||
          info->output_semantic[i] == VARYING_SLOT_LAYER)
         continue;

      param = si_shader_io_get_unique_index(info->output_semantic[i], false);

      for (chan = 0; chan < 4; chan++) {
         if (!(info->output_usagemask[i] & (1 << chan)))
            continue;

         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
         out_val = ac_to_integer(&ctx->ac, out_val);

         /* GFX9 has the ESGS ring in LDS.
          */
         if (ctx->screen->info.chip_class >= GFX9) {
            LLVMValueRef idx = LLVMConstInt(ctx->ac.i32, param * 4 + chan, false);
            idx = LLVMBuildAdd(ctx->ac.builder, lds_base, idx, "");
            ac_build_indexed_store(&ctx->ac, ctx->esgs_ring, idx, out_val);
            continue;
         }

         ac_build_buffer_store_dword(&ctx->ac, ctx->esgs_ring, out_val, 1, NULL,
                                     ac_get_arg(&ctx->ac, ctx->args.es2gs_offset),
                                     (4 * param + chan) * 4, ac_glc | ac_slc | ac_swizzled);
      }
   }

   if (ctx->screen->info.chip_class >= GFX9)
      si_set_es_return_value_for_gs(ctx);
}

/* GS wave id for sendmsg: bits [16:24) of merged_wave_info on GFX9+ (merged
 * waves), otherwise the dedicated gs_wave_id argument.
 */
static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
{
   if (ctx->screen->info.chip_class >= GFX9)
      return si_unpack_param(ctx, ctx->args.merged_wave_info, 16, 8);
   else
      return ac_get_arg(&ctx->ac, ctx->args.gs_wave_id);
}

/* Common GS epilogue: either the NGG path, or a legacy GS_DONE message
 * (preceded by a release fence on GFX10) and, on GFX9+, closing the
 * merged-wave wrapper "if".
 */
static void emit_gs_epilogue(struct si_shader_context *ctx)
{
   if (ctx->shader->key.as_ngg) {
      gfx10_ngg_gs_emit_epilogue(ctx);
      return;
   }

   if (ctx->screen->info.chip_class >= GFX10)
      LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");

   ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, si_get_gs_wave_id(ctx));

   if (ctx->screen->info.chip_class >= GFX9)
      ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
}

/* ac_shader_abi::emit_outputs callback for GS. */
static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_info UNUSED *info = &ctx->shader->selector->info;

   assert(info->num_outputs <= AC_LLVM_MAX_OUTPUTS);

   emit_gs_epilogue(ctx);
}

/* Emit one vertex from the geometry shader */
static void si_llvm_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);

   if (ctx->shader->key.as_ngg) {
      gfx10_ngg_gs_emit_vertex(ctx, stream, addrs);
      return;
   }

   struct
si_shader_info *info = &ctx->shader->selector->info;
   struct si_shader *shader = ctx->shader;
   LLVMValueRef soffset = ac_get_arg(&ctx->ac, ctx->args.gs2vs_offset);
   LLVMValueRef gs_next_vertex;
   LLVMValueRef can_emit;
   unsigned chan, offset;
   int i;

   /* Write vertex attribute values to GSVS ring */
   gs_next_vertex = LLVMBuildLoad(ctx->ac.builder, ctx->gs_next_vertex[stream], "");

   /* If this thread has already emitted the declared maximum number of
    * vertices, skip the write: excessive vertex emissions are not
    * supposed to have any effect.
    *
    * If the shader has no writes to memory, kill it instead. This skips
    * further memory loads and may allow LLVM to skip to the end
    * altogether.
    */
   can_emit =
      LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
                    LLVMConstInt(ctx->ac.i32, shader->selector->info.base.gs.vertices_out, 0), "");

   bool use_kill = !info->base.writes_memory;
   if (use_kill) {
      ac_build_kill_if_false(&ctx->ac, can_emit);
   } else {
      ac_build_ifcc(&ctx->ac, can_emit, 6505);
   }

   offset = 0;
   for (i = 0; i < info->num_outputs; i++) {
      for (chan = 0; chan < 4; chan++) {
         /* Skip channels that are unused or belong to another stream. */
         if (!(info->output_usagemask[i] & (1 << chan)) ||
             ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
            continue;

         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
         /* Ring slot = (channel slot * vertices_out + vertex index) dwords,
          * converted to bytes below.
          */
         LLVMValueRef voffset =
            LLVMConstInt(ctx->ac.i32, offset * shader->selector->info.base.gs.vertices_out, 0);
         offset++;

         voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
         voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, 0), "");

         out_val = ac_to_integer(&ctx->ac, out_val);

         ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring[stream], out_val, 1, voffset, soffset,
                                     0, ac_glc | ac_slc | ac_swizzled);
      }
   }

   /* Bump the per-stream emitted-vertex counter. */
   gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->ac.i32_1, "");
   LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);

   /* Signal vertex emission if vertex data was written. */
   if (offset) {
      ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
                       si_get_gs_wave_id(ctx));
   }

   if (!use_kill)
      ac_build_endif(&ctx->ac, 6505);
}

/* Cut one primitive from the geometry shader */
static void si_llvm_emit_primitive(struct ac_shader_abi *abi, unsigned stream)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);

   if (ctx->shader->key.as_ngg) {
      /* NGG keeps track of the current primitive in LDS-backed counters;
       * a cut just resets the per-stream vertex count.
       */
      LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]);
      return;
   }

   /* Signal primitive cut */
   ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
                    si_get_gs_wave_id(ctx));
}

/* Set up ctx->esgs_ring: a buffer descriptor loaded from the internal
 * bindings on GFX6-8, or an LDS-based ring on GFX9+.
 */
void si_preload_esgs_ring(struct si_shader_context *ctx)
{
   if (ctx->screen->info.chip_class <= GFX8) {
      unsigned ring = ctx->stage == MESA_SHADER_GEOMETRY ? SI_GS_RING_ESGS : SI_ES_RING_ESGS;
      LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, 0);
      LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);

      ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
   } else {
      if (USE_LDS_SYMBOLS) {
         /* Declare the ESGS ring as an explicit LDS symbol.
          */
         si_llvm_declare_esgs_ring(ctx);
      } else {
         ac_declare_lds_as_pointer(&ctx->ac);
         ctx->esgs_ring = ctx->ac.lds;
      }
   }
}

/* Build a GSVS ring buffer descriptor per vertex stream and store them in
 * ctx->gsvs_ring[].  Streams with no output components are skipped.
 */
void si_preload_gs_rings(struct si_shader_context *ctx)
{
   const struct si_shader_selector *sel = ctx->shader->selector;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_RING_GSVS, 0);
   LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);
   LLVMValueRef base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);

   /* The conceptual layout of the GSVS ring is
    * v0c0 .. vLv0 v0c1 .. vLc1 ..
    * but the real memory layout is swizzled across
    * threads:
    * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
    * t16v0c0 ..
    * Override the buffer descriptor accordingly.
    */
   LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2);
   uint64_t stream_offset = 0;

   for (unsigned stream = 0; stream < 4; ++stream) {
      unsigned num_components;
      unsigned stride;
      unsigned num_records;
      LLVMValueRef ring, tmp;

      num_components = sel->info.num_stream_output_components[stream];
      if (!num_components)
         continue;

      /* Per-thread bytes for this stream: 4 bytes per component per vertex. */
      stride = 4 * num_components * sel->info.base.gs.vertices_out;

      /* Limit on the stride field for <= GFX7.
       */
      assert(stride < (1 << 14));

      num_records = ctx->ac.wave_size;

      /* Patch the 64-bit base address (descriptor words 0-1) to point at
       * this stream's region of the ring.
       */
      ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
      tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_0, "");
      tmp = LLVMBuildAdd(builder, tmp, LLVMConstInt(ctx->ac.i64, stream_offset, 0), "");
      stream_offset += stride * ctx->ac.wave_size;

      ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_0, "");
      ring = LLVMBuildBitCast(builder, ring, ctx->ac.v4i32, "");
      /* Word 1: OR in the stride and enable swizzled addressing. */
      tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_1, "");
      tmp = LLVMBuildOr(
         builder, tmp,
         LLVMConstInt(ctx->ac.i32, S_008F04_STRIDE(stride) | S_008F04_SWIZZLE_ENABLE(1), 0), "");
      ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_1, "");
      /* Word 2: num_records. */
      ring = LLVMBuildInsertElement(builder, ring, LLVMConstInt(ctx->ac.i32, num_records, 0),
                                    LLVMConstInt(ctx->ac.i32, 2, 0), "");

      /* Word 3: destination channel selects, index stride and per-thread
       * (TID-based) addressing; format fields differ per chip generation.
       */
      uint32_t rsrc3 =
         S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
         S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
         S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
         S_008F0C_ADD_TID_ENABLE(1);

      if (ctx->ac.chip_class >= GFX10) {
         rsrc3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
                  S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1);
      } else {
         rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
                  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
                  S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */
      }

      ring = LLVMBuildInsertElement(builder, ring, LLVMConstInt(ctx->ac.i32, rsrc3, false),
                                    LLVMConstInt(ctx->ac.i32, 3, 0), "");

      ctx->gsvs_ring[stream] = ring;
   }
}

/* Generate code for the hardware VS shader stage to go with a geometry shader */
struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
                                             struct ac_llvm_compiler *compiler,
                                             struct si_shader_selector
                                             *gs_selector,
                                             struct pipe_debug_callback *debug)
{
   struct si_shader_context ctx;
   struct si_shader *shader;
   LLVMBuilderRef builder;
   struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS];
   struct si_shader_info *gsinfo = &gs_selector->info;
   int i;

   shader = CALLOC_STRUCT(si_shader);
   if (!shader)
      return NULL;

   /* We can leave the fence as permanently signaled because the GS copy
    * shader only becomes visible globally after it has been compiled. */
   util_queue_fence_init(&shader->ready);

   shader->selector = gs_selector;
   shader->is_gs_copy_shader = true;

   /* The copy shader runs on the hardware VS stage. */
   si_llvm_context_init(&ctx, sscreen, compiler,
                        si_get_wave_size(sscreen, MESA_SHADER_VERTEX,
                                         false, false));
   ctx.shader = shader;
   ctx.stage = MESA_SHADER_VERTEX;

   builder = ctx.ac.builder;

   si_llvm_create_main_func(&ctx, false);

   LLVMValueRef buf_ptr = ac_get_arg(&ctx.ac, ctx.internal_bindings);
   ctx.gsvs_ring[0] =
      ac_build_load_to_sgpr(&ctx.ac, buf_ptr, LLVMConstInt(ctx.ac.i32, SI_RING_GSVS, 0));

   /* Byte offset of this vertex's data in the GSVS ring. */
   LLVMValueRef voffset =
      LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id, LLVMConstInt(ctx.ac.i32, 4, 0), "");

   /* Fetch the vertex stream ID.*/
   LLVMValueRef stream_id;

   if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs)
      stream_id = si_unpack_param(&ctx, ctx.args.streamout_config, 24, 2);
   else
      stream_id = ctx.ac.i32_0;

   /* Fill in output information.
    */
   for (i = 0; i < gsinfo->num_outputs; ++i) {
      outputs[i].semantic = gsinfo->output_semantic[i];

      for (int chan = 0; chan < 4; chan++) {
         outputs[i].vertex_stream[chan] = (gsinfo->output_streams[i] >> (2 * chan)) & 3;
      }
   }

   /* Branch on the stream ID: one basic block per active stream, all
    * converging in "end".
    */
   LLVMBasicBlockRef end_bb;
   LLVMValueRef switch_inst;

   end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end");
   switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);

   for (int stream = 0; stream < 4; stream++) {
      LLVMBasicBlockRef bb;
      unsigned offset;

      if (!gsinfo->num_stream_output_components[stream])
         continue;

      /* Streams > 0 are only relevant for streamout. */
      if (stream > 0 && !gs_selector->so.num_outputs)
         continue;

      bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out");
      LLVMAddCase(switch_inst, LLVMConstInt(ctx.ac.i32, stream, 0), bb);
      LLVMPositionBuilderAtEnd(builder, bb);

      /* Fetch vertex data from GSVS ring */
      offset = 0;
      for (i = 0; i < gsinfo->num_outputs; ++i) {
         for (unsigned chan = 0; chan < 4; chan++) {
            /* Channels not written for this stream become undef. */
            if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
                outputs[i].vertex_stream[chan] != stream) {
               outputs[i].values[chan] = LLVMGetUndef(ctx.ac.f32);
               continue;
            }

            LLVMValueRef soffset =
               LLVMConstInt(ctx.ac.i32, offset * gs_selector->info.base.gs.vertices_out * 16 * 4, 0);
            offset++;

            outputs[i].values[chan] =
               ac_build_buffer_load(&ctx.ac, ctx.gsvs_ring[0], 1, ctx.ac.i32_0, voffset, soffset, 0,
                                    ctx.ac.f32, ac_glc | ac_slc, true, false);
         }
      }

      /* Streamout and exports.
       */
      if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) {
         si_llvm_emit_streamout(&ctx, outputs, gsinfo->num_outputs, stream);
      }

      /* Only stream 0 feeds the rasterizer. */
      if (stream == 0)
         si_llvm_build_vs_exports(&ctx, outputs, gsinfo->num_outputs);

      LLVMBuildBr(builder, end_bb);
   }

   LLVMPositionBuilderAtEnd(builder, end_bb);

   LLVMBuildRetVoid(ctx.ac.builder);

   ctx.stage = MESA_SHADER_GEOMETRY; /* override for shader dumping */
   si_llvm_optimize_module(&ctx);

   bool ok = false;
   if (si_compile_llvm(sscreen, &ctx.shader->binary, &ctx.shader->config, ctx.compiler, &ctx.ac,
                       debug, MESA_SHADER_GEOMETRY, "GS Copy Shader", false)) {
      if (si_can_dump_shader(sscreen, MESA_SHADER_GEOMETRY))
         fprintf(stderr, "GS Copy Shader:\n");
      si_shader_dump(sscreen, ctx.shader, debug, stderr, true);

      /* Upload now unless scratch is needed (then the caller uploads later —
       * NOTE(review): presumably after scratch setup; verify at call sites).
       */
      if (!ctx.shader->config.scratch_bytes_per_wave)
         ok = si_shader_binary_upload(sscreen, ctx.shader, 0);
      else
         ok = true;
   }

   si_llvm_dispose(&ctx);

   if (!ok) {
      FREE(shader);
      shader = NULL;
   } else {
      si_fix_resource_usage(sscreen, shader);
   }
   return shader;
}

/**
 * Build the GS prolog function. Rotate the input vertices for triangle strips
 * with adjacency.
 */
void si_llvm_build_gs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key)
{
   unsigned num_sgprs, num_vgprs;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMTypeRef returns[AC_MAX_ARGS];
   LLVMValueRef func, ret;

   memset(&ctx->args, 0, sizeof(ctx->args));

   if (ctx->screen->info.chip_class >= GFX9) {
      /* Other user SGPRs are not needed by GS.
       */
      num_sgprs = 8 + SI_NUM_VS_STATE_RESOURCE_SGPRS;
      num_vgprs = 5; /* ES inputs are not needed by GS */
   } else {
      num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
      num_vgprs = 8;
   }

   /* Declare pass-through arguments: SGPRs as i32, VGPRs returned as f32. */
   for (unsigned i = 0; i < num_sgprs; ++i) {
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      returns[i] = ctx->ac.i32;
   }

   for (unsigned i = 0; i < num_vgprs; ++i) {
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
      returns[num_sgprs + i] = ctx->ac.f32;
   }

   /* Create the function. */
   si_llvm_create_func(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 0);
   func = ctx->main_fn;

   /* Copy inputs to outputs. This should be no-op, as the registers match,
    * but it will prevent the compiler from overwriting them unintentionally.
    */
   ret = ctx->return_value;
   for (unsigned i = 0; i < num_sgprs; i++) {
      LLVMValueRef p = LLVMGetParam(func, i);
      ret = LLVMBuildInsertValue(builder, ret, p, i, "");
   }
   for (unsigned i = 0; i < num_vgprs; i++) {
      LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
      p = ac_to_float(&ctx->ac, p);
      ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
   }

   if (key->gs_prolog.states.tri_strip_adj_fix) {
      /* Remap the input vertices for every other primitive.
       */
      /* VGPR indices of the six vertex offsets relative to num_sgprs;
       * the skipped indices (num_sgprs + 2 on GFX6, +2/+3 on GFX9) hold
       * other inputs such as the primitive ID.
       */
      const struct ac_arg gfx6_vtx_params[6] = {
         {.used = true, .arg_index = num_sgprs}, {.used = true, .arg_index = num_sgprs + 1},
         {.used = true, .arg_index = num_sgprs + 3}, {.used = true, .arg_index = num_sgprs + 4},
         {.used = true, .arg_index = num_sgprs + 5}, {.used = true, .arg_index = num_sgprs + 6},
      };
      const struct ac_arg gfx9_vtx_params[3] = {
         {.used = true, .arg_index = num_sgprs},
         {.used = true, .arg_index = num_sgprs + 1},
         {.used = true, .arg_index = num_sgprs + 4},
      };
      LLVMValueRef vtx_in[6], vtx_out[6];
      LLVMValueRef prim_id, rotate;

      if (ctx->screen->info.chip_class >= GFX9) {
         /* GFX9 packs two 16-bit vertex indices per VGPR. */
         for (unsigned i = 0; i < 3; i++) {
            vtx_in[i * 2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
            vtx_in[i * 2 + 1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
         }
      } else {
         for (unsigned i = 0; i < 6; i++)
            vtx_in[i] = ac_get_arg(&ctx->ac, gfx6_vtx_params[i]);
      }

      prim_id = LLVMGetParam(func, num_sgprs + 2);
      /* Low bit of the primitive ID decides whether to rotate. */
      rotate = LLVMBuildTrunc(builder, prim_id, ctx->ac.i1, "");

      /* Rotate by 4: vertex i takes the value of vertex (i + 4) mod 6. */
      for (unsigned i = 0; i < 6; ++i) {
         LLVMValueRef base, rotated;
         base = vtx_in[i];
         rotated = vtx_in[(i + 4) % 6];
         vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
      }

      if (ctx->screen->info.chip_class >= GFX9) {
         /* Re-pack pairs of 16-bit indices into the return VGPRs. */
         for (unsigned i = 0; i < 3; i++) {
            LLVMValueRef hi, out;

            hi = LLVMBuildShl(builder, vtx_out[i * 2 + 1], LLVMConstInt(ctx->ac.i32, 16, 0), "");
            out = LLVMBuildOr(builder, vtx_out[i * 2], hi, "");
            out = ac_to_float(&ctx->ac, out);
            ret = LLVMBuildInsertValue(builder, ret, out, gfx9_vtx_params[i].arg_index, "");
         }
      } else {
         for (unsigned i = 0; i < 6; i++) {
            LLVMValueRef out;

            out = ac_to_float(&ctx->ac, vtx_out[i]);
            ret = LLVMBuildInsertValue(builder, ret, out, gfx6_vtx_params[i].arg_index, "");
         }
      }
   }

   LLVMBuildRet(builder, ret);
}

/* Register the GS-specific ac_shader_abi callbacks. */
void si_llvm_init_gs_callbacks(struct si_shader_context *ctx)
{
   ctx->abi.load_inputs = si_nir_load_input_gs;
   ctx->abi.emit_vertex = si_llvm_emit_vertex;
   ctx->abi.emit_primitive = si_llvm_emit_primitive;
   ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue;
}