1/* 2 * Copyright 2012 Advanced Micro Devices, Inc. 3 * All Rights Reserved. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * on the rights to use, copy, modify, merge, publish, distribute, sub 9 * license, and/or sell copies of the Software, and to permit persons to whom 10 * the Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 22 * USE OR OTHER DEALINGS IN THE SOFTWARE. 
23 */ 24 25#include "ac_exp_param.h" 26#include "ac_rtld.h" 27#include "compiler/nir/nir.h" 28#include "compiler/nir/nir_serialize.h" 29#include "si_pipe.h" 30#include "si_shader_internal.h" 31#include "sid.h" 32#include "tgsi/tgsi_from_mesa.h" 33#include "tgsi/tgsi_strings.h" 34#include "util/u_memory.h" 35 36static const char scratch_rsrc_dword0_symbol[] = "SCRATCH_RSRC_DWORD0"; 37 38static const char scratch_rsrc_dword1_symbol[] = "SCRATCH_RSRC_DWORD1"; 39 40static void si_dump_shader_key(const struct si_shader *shader, FILE *f); 41 42/** Whether the shader runs as a combination of multiple API shaders */ 43bool si_is_multi_part_shader(struct si_shader *shader) 44{ 45 if (shader->selector->screen->info.chip_class <= GFX8) 46 return false; 47 48 return shader->key.as_ls || shader->key.as_es || 49 shader->selector->info.stage == MESA_SHADER_TESS_CTRL || 50 shader->selector->info.stage == MESA_SHADER_GEOMETRY; 51} 52 53/** Whether the shader runs on a merged HW stage (LSHS or ESGS) */ 54bool si_is_merged_shader(struct si_shader *shader) 55{ 56 return shader->key.as_ngg || si_is_multi_part_shader(shader); 57} 58 59/** 60 * Returns a unique index for a per-patch semantic name and index. The index 61 * must be less than 32, so that a 32-bit bitmask of used inputs or outputs 62 * can be calculated. 63 */ 64unsigned si_shader_io_get_unique_index_patch(unsigned semantic) 65{ 66 switch (semantic) { 67 case VARYING_SLOT_TESS_LEVEL_OUTER: 68 return 0; 69 case VARYING_SLOT_TESS_LEVEL_INNER: 70 return 1; 71 default: 72 if (semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_PATCH0 + 30) 73 return 2 + (semantic - VARYING_SLOT_PATCH0); 74 75 assert(!"invalid semantic"); 76 return 0; 77 } 78} 79 80/** 81 * Returns a unique index for a semantic name and index. The index must be 82 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be 83 * calculated. 
84 */ 85unsigned si_shader_io_get_unique_index(unsigned semantic, bool is_varying) 86{ 87 switch (semantic) { 88 case VARYING_SLOT_POS: 89 return 0; 90 default: 91 /* Since some shader stages use the highest used IO index 92 * to determine the size to allocate for inputs/outputs 93 * (in LDS, tess and GS rings). GENERIC should be placed right 94 * after POSITION to make that size as small as possible. 95 */ 96 if (semantic >= VARYING_SLOT_VAR0 && semantic <= VARYING_SLOT_VAR31) 97 return 1 + (semantic - VARYING_SLOT_VAR0); /* 1..32 */ 98 99 /* Put 16-bit GLES varyings after 32-bit varyings. They can use the same indices as 100 * legacy desktop GL varyings because they are mutually exclusive. 101 */ 102 if (semantic >= VARYING_SLOT_VAR0_16BIT && semantic <= VARYING_SLOT_VAR15_16BIT) 103 return 33 + (semantic - VARYING_SLOT_VAR0_16BIT); /* 33..48 */ 104 105 assert(!"invalid generic index"); 106 return 0; 107 108 /* Legacy desktop GL varyings. */ 109 case VARYING_SLOT_FOGC: 110 return 33; 111 case VARYING_SLOT_COL0: 112 return 34; 113 case VARYING_SLOT_COL1: 114 return 35; 115 case VARYING_SLOT_BFC0: 116 /* If it's a varying, COLOR and BCOLOR alias. */ 117 if (is_varying) 118 return 34; 119 else 120 return 36; 121 case VARYING_SLOT_BFC1: 122 if (is_varying) 123 return 35; 124 else 125 return 37; 126 case VARYING_SLOT_TEX0: 127 case VARYING_SLOT_TEX1: 128 case VARYING_SLOT_TEX2: 129 case VARYING_SLOT_TEX3: 130 case VARYING_SLOT_TEX4: 131 case VARYING_SLOT_TEX5: 132 case VARYING_SLOT_TEX6: 133 case VARYING_SLOT_TEX7: 134 return 38 + (semantic - VARYING_SLOT_TEX0); 135 case VARYING_SLOT_CLIP_VERTEX: 136 return 46; 137 138 /* Varyings present in both GLES and desktop GL must start at 49 after 16-bit varyings. */ 139 case VARYING_SLOT_CLIP_DIST0: 140 return 49; 141 case VARYING_SLOT_CLIP_DIST1: 142 return 50; 143 case VARYING_SLOT_PSIZ: 144 return 51; 145 146 /* These can't be written by LS, HS, and ES. 
*/ 147 case VARYING_SLOT_LAYER: 148 return 52; 149 case VARYING_SLOT_VIEWPORT: 150 return 53; 151 case VARYING_SLOT_PRIMITIVE_ID: 152 return 54; 153 } 154} 155 156static void si_dump_streamout(struct pipe_stream_output_info *so) 157{ 158 unsigned i; 159 160 if (so->num_outputs) 161 fprintf(stderr, "STREAMOUT\n"); 162 163 for (i = 0; i < so->num_outputs; i++) { 164 unsigned mask = ((1 << so->output[i].num_components) - 1) << so->output[i].start_component; 165 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n", i, so->output[i].output_buffer, 166 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1, 167 so->output[i].register_index, mask & 1 ? "x" : "", mask & 2 ? "y" : "", 168 mask & 4 ? "z" : "", mask & 8 ? "w" : ""); 169 } 170} 171 172static void declare_streamout_params(struct si_shader_context *ctx, 173 struct pipe_stream_output_info *so) 174{ 175 if (ctx->screen->use_ngg_streamout) { 176 if (ctx->stage == MESA_SHADER_TESS_EVAL) 177 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); 178 return; 179 } 180 181 /* Streamout SGPRs. */ 182 if (so->num_outputs) { 183 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.streamout_config); 184 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.streamout_write_index); 185 } else if (ctx->stage == MESA_SHADER_TESS_EVAL) { 186 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); 187 } 188 189 /* A streamout buffer offset is loaded if the stride is non-zero. */ 190 for (int i = 0; i < 4; i++) { 191 if (!so->stride[i]) 192 continue; 193 194 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.streamout_offset[i]); 195 } 196} 197 198unsigned si_get_max_workgroup_size(const struct si_shader *shader) 199{ 200 switch (shader->selector->info.stage) { 201 case MESA_SHADER_VERTEX: 202 case MESA_SHADER_TESS_EVAL: 203 return shader->key.as_ngg ? 
128 : 0; 204 205 case MESA_SHADER_TESS_CTRL: 206 /* Return this so that LLVM doesn't remove s_barrier 207 * instructions on chips where we use s_barrier. */ 208 return shader->selector->screen->info.chip_class >= GFX7 ? 128 : 0; 209 210 case MESA_SHADER_GEOMETRY: 211 return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 0; 212 213 case MESA_SHADER_COMPUTE: 214 break; /* see below */ 215 216 default: 217 return 0; 218 } 219 220 /* Compile a variable block size using the maximum variable size. */ 221 if (shader->selector->info.base.workgroup_size_variable) 222 return SI_MAX_VARIABLE_THREADS_PER_BLOCK; 223 224 uint16_t *local_size = shader->selector->info.base.workgroup_size; 225 unsigned max_work_group_size = (uint32_t)local_size[0] * 226 (uint32_t)local_size[1] * 227 (uint32_t)local_size[2]; 228 assert(max_work_group_size); 229 return max_work_group_size; 230} 231 232static void declare_const_and_shader_buffers(struct si_shader_context *ctx, bool assign_params) 233{ 234 enum ac_arg_type const_shader_buf_type; 235 236 if (ctx->shader->selector->info.base.num_ubos == 1 && 237 ctx->shader->selector->info.base.num_ssbos == 0) 238 const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR; 239 else 240 const_shader_buf_type = AC_ARG_CONST_DESC_PTR; 241 242 ac_add_arg( 243 &ctx->args, AC_ARG_SGPR, 1, const_shader_buf_type, 244 assign_params ? &ctx->const_and_shader_buffers : &ctx->other_const_and_shader_buffers); 245} 246 247static void declare_samplers_and_images(struct si_shader_context *ctx, bool assign_params) 248{ 249 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, 250 assign_params ? 
&ctx->samplers_and_images : &ctx->other_samplers_and_images); 251} 252 253static void declare_per_stage_desc_pointers(struct si_shader_context *ctx, bool assign_params) 254{ 255 declare_const_and_shader_buffers(ctx, assign_params); 256 declare_samplers_and_images(ctx, assign_params); 257} 258 259static void declare_global_desc_pointers(struct si_shader_context *ctx) 260{ 261 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->internal_bindings); 262 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, 263 &ctx->bindless_samplers_and_images); 264} 265 266static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx) 267{ 268 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); 269 if (!ctx->shader->is_gs_copy_shader) { 270 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex); 271 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id); 272 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance); 273 } 274} 275 276static void declare_vb_descriptor_input_sgprs(struct si_shader_context *ctx) 277{ 278 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->args.vertex_buffers); 279 280 unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs; 281 if (num_vbos_in_user_sgprs) { 282 unsigned user_sgprs = ctx->args.num_sgprs_used; 283 284 if (si_is_merged_shader(ctx->shader)) 285 user_sgprs -= 8; 286 assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST); 287 288 /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). 
*/ 289 for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++) 290 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ 291 292 assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(ctx->vb_descriptors)); 293 for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++) 294 ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[i]); 295 } 296} 297 298static void declare_vs_input_vgprs(struct si_shader_context *ctx, unsigned *num_prolog_vgprs) 299{ 300 struct si_shader *shader = ctx->shader; 301 302 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vertex_id); 303 if (shader->key.as_ls) { 304 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vs_rel_patch_id); 305 if (ctx->screen->info.chip_class >= GFX10) { 306 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */ 307 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); 308 } else { 309 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); 310 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */ 311 } 312 } else if (ctx->screen->info.chip_class >= GFX10) { 313 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */ 314 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, 315 &ctx->args.vs_prim_id); /* user vgpr or PrimID (legacy) */ 316 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); 317 } else { 318 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); 319 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vs_prim_id); 320 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */ 321 } 322 323 if (!shader->is_gs_copy_shader) { 324 /* Vertex load indices. 
*/ 325 if (shader->selector->info.num_inputs) { 326 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vertex_index0); 327 for (unsigned i = 1; i < shader->selector->info.num_inputs; i++) 328 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); 329 } 330 *num_prolog_vgprs += shader->selector->info.num_inputs; 331 } 332} 333 334static void declare_vs_blit_inputs(struct si_shader_context *ctx, unsigned vs_blit_property) 335{ 336 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_blit_inputs); /* i16 x1, y1 */ 337 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* i16 x1, y1 */ 338 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* depth */ 339 340 if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) { 341 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */ 342 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */ 343 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */ 344 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */ 345 } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) { 346 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */ 347 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */ 348 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */ 349 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */ 350 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */ 351 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */ 352 } 353} 354 355static void declare_tes_input_vgprs(struct si_shader_context *ctx) 356{ 357 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.tes_u); 358 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.tes_v); 359 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_rel_patch_id); 360 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, 
&ctx->args.tes_patch_id); 361} 362 363enum 364{ 365 /* Convenient merged shader definitions. */ 366 SI_SHADER_MERGED_VERTEX_TESSCTRL = MESA_ALL_SHADER_STAGES, 367 SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY, 368}; 369 370void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers, 371 enum ac_arg_type type, struct ac_arg *arg, unsigned idx) 372{ 373 assert(args->arg_count == idx); 374 ac_add_arg(args, file, registers, type, arg); 375} 376 377void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader) 378{ 379 struct si_shader *shader = ctx->shader; 380 unsigned i, num_returns, num_return_sgprs; 381 unsigned num_prolog_vgprs = 0; 382 unsigned stage = ctx->stage; 383 384 memset(&ctx->args, 0, sizeof(ctx->args)); 385 386 /* Set MERGED shaders. */ 387 if (ctx->screen->info.chip_class >= GFX9) { 388 if (shader->key.as_ls || stage == MESA_SHADER_TESS_CTRL) 389 stage = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */ 390 else if (shader->key.as_es || shader->key.as_ngg || stage == MESA_SHADER_GEOMETRY) 391 stage = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY; 392 } 393 394 switch (stage) { 395 case MESA_SHADER_VERTEX: 396 declare_global_desc_pointers(ctx); 397 398 if (shader->selector->info.base.vs.blit_sgprs_amd) { 399 declare_vs_blit_inputs(ctx, shader->selector->info.base.vs.blit_sgprs_amd); 400 401 /* VGPRs */ 402 declare_vs_input_vgprs(ctx, &num_prolog_vgprs); 403 break; 404 } 405 406 declare_per_stage_desc_pointers(ctx, true); 407 declare_vs_specific_input_sgprs(ctx); 408 if (!shader->is_gs_copy_shader) 409 declare_vb_descriptor_input_sgprs(ctx); 410 411 if (shader->key.as_es) { 412 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.es2gs_offset); 413 } else if (shader->key.as_ls) { 414 /* no extra parameters */ 415 } else { 416 /* The locations of the other parameters are assigned dynamically. 
*/ 417 declare_streamout_params(ctx, &shader->selector->so); 418 } 419 420 /* VGPRs */ 421 declare_vs_input_vgprs(ctx, &num_prolog_vgprs); 422 break; 423 424 case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */ 425 declare_global_desc_pointers(ctx); 426 declare_per_stage_desc_pointers(ctx, true); 427 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); 428 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets); 429 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout); 430 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); 431 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset); 432 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tcs_factor_offset); 433 434 /* VGPRs */ 435 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id); 436 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids); 437 438 /* param_tcs_offchip_offset and param_tcs_factor_offset are 439 * placed after the user SGPRs. 440 */ 441 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++) 442 ac_add_return(&ctx->args, AC_ARG_SGPR); 443 for (i = 0; i < 11; i++) 444 ac_add_return(&ctx->args, AC_ARG_VGPR); 445 break; 446 447 case SI_SHADER_MERGED_VERTEX_TESSCTRL: 448 /* Merged stages have 8 system SGPRs at the beginning. 
*/ 449 /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */ 450 declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_TESS_CTRL); 451 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset); 452 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.merged_wave_info); 453 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tcs_factor_offset); 454 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.scratch_offset); 455 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ 456 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ 457 458 declare_global_desc_pointers(ctx); 459 declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_VERTEX); 460 declare_vs_specific_input_sgprs(ctx); 461 462 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); 463 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets); 464 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout); 465 if (ctx->stage == MESA_SHADER_VERTEX) 466 declare_vb_descriptor_input_sgprs(ctx); 467 468 /* VGPRs (first TCS, then VS) */ 469 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id); 470 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids); 471 472 if (ctx->stage == MESA_SHADER_VERTEX) { 473 declare_vs_input_vgprs(ctx, &num_prolog_vgprs); 474 475 /* LS return values are inputs to the TCS main shader part. */ 476 for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++) 477 ac_add_return(&ctx->args, AC_ARG_SGPR); 478 for (i = 0; i < 2; i++) 479 ac_add_return(&ctx->args, AC_ARG_VGPR); 480 481 /* VS outputs passed via VGPRs to TCS. */ 482 if (shader->key.opt.same_patch_vertices) { 483 unsigned num_outputs = util_last_bit64(shader->selector->outputs_written); 484 for (i = 0; i < num_outputs * 4; i++) 485 ac_add_return(&ctx->args, AC_ARG_VGPR); 486 } 487 } else { 488 /* TCS inputs are passed via VGPRs from VS. 
*/ 489 if (shader->key.opt.same_patch_vertices) { 490 unsigned num_inputs = util_last_bit64(shader->previous_stage_sel->outputs_written); 491 for (i = 0; i < num_inputs * 4; i++) 492 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL); 493 } 494 495 /* TCS return values are inputs to the TCS epilog. 496 * 497 * param_tcs_offchip_offset, param_tcs_factor_offset, 498 * param_tcs_offchip_layout, and internal_bindings 499 * should be passed to the epilog. 500 */ 501 for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++) 502 ac_add_return(&ctx->args, AC_ARG_SGPR); 503 for (i = 0; i < 11; i++) 504 ac_add_return(&ctx->args, AC_ARG_VGPR); 505 } 506 break; 507 508 case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY: 509 /* Merged stages have 8 system SGPRs at the beginning. */ 510 /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */ 511 declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_GEOMETRY); 512 513 if (ctx->shader->key.as_ngg) 514 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs_tg_info); 515 else 516 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs2vs_offset); 517 518 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.merged_wave_info); 519 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset); 520 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.scratch_offset); 521 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, 522 &ctx->small_prim_cull_info); /* SPI_SHADER_PGM_LO_GS << 8 */ 523 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, 524 NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */ 525 526 declare_global_desc_pointers(ctx); 527 if (ctx->stage != MESA_SHADER_VERTEX || !shader->selector->info.base.vs.blit_sgprs_amd) { 528 declare_per_stage_desc_pointers( 529 ctx, (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL)); 530 } 531 532 if (ctx->stage == MESA_SHADER_VERTEX) { 533 if (shader->selector->info.base.vs.blit_sgprs_amd) 534 
declare_vs_blit_inputs(ctx, shader->selector->info.base.vs.blit_sgprs_amd); 535 else 536 declare_vs_specific_input_sgprs(ctx); 537 } else { 538 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); 539 540 if (ctx->stage == MESA_SHADER_TESS_EVAL) { 541 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); 542 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr); 543 } 544 } 545 546 if (ctx->stage == MESA_SHADER_VERTEX) 547 declare_vb_descriptor_input_sgprs(ctx); 548 549 /* VGPRs (first GS, then VS/TES) */ 550 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[0]); 551 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[1]); 552 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id); 553 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id); 554 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[2]); 555 556 if (ctx->stage == MESA_SHADER_VERTEX) { 557 declare_vs_input_vgprs(ctx, &num_prolog_vgprs); 558 } else if (ctx->stage == MESA_SHADER_TESS_EVAL) { 559 declare_tes_input_vgprs(ctx); 560 } 561 562 if ((ctx->shader->key.as_es || ngg_cull_shader) && 563 (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL)) { 564 unsigned num_user_sgprs, num_vgprs; 565 566 if (ctx->stage == MESA_SHADER_VERTEX && ngg_cull_shader) { 567 /* For the NGG cull shader, add 1 SGPR to hold 568 * the vertex buffer pointer. 
569 */ 570 num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR + 1; 571 572 if (shader->selector->num_vbos_in_user_sgprs) { 573 assert(num_user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST); 574 num_user_sgprs = 575 SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->num_vbos_in_user_sgprs * 4; 576 } 577 } else if (ctx->stage == MESA_SHADER_TESS_EVAL && ngg_cull_shader) { 578 num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR; 579 } else { 580 num_user_sgprs = SI_NUM_VS_STATE_RESOURCE_SGPRS; 581 } 582 583 /* The NGG cull shader has to return all 9 VGPRs. 584 * 585 * The normal merged ESGS shader only has to return the 5 VGPRs 586 * for the GS stage. 587 */ 588 num_vgprs = ngg_cull_shader ? 9 : 5; 589 590 /* ES return values are inputs to GS. */ 591 for (i = 0; i < 8 + num_user_sgprs; i++) 592 ac_add_return(&ctx->args, AC_ARG_SGPR); 593 for (i = 0; i < num_vgprs; i++) 594 ac_add_return(&ctx->args, AC_ARG_VGPR); 595 } 596 break; 597 598 case MESA_SHADER_TESS_EVAL: 599 declare_global_desc_pointers(ctx); 600 declare_per_stage_desc_pointers(ctx, true); 601 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); 602 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); 603 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr); 604 605 if (shader->key.as_es) { 606 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset); 607 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); 608 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.es2gs_offset); 609 } else { 610 declare_streamout_params(ctx, &shader->selector->so); 611 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset); 612 } 613 614 /* VGPRs */ 615 declare_tes_input_vgprs(ctx); 616 break; 617 618 case MESA_SHADER_GEOMETRY: 619 declare_global_desc_pointers(ctx); 620 declare_per_stage_desc_pointers(ctx, true); 621 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs2vs_offset); 622 
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs_wave_id); 623 624 /* VGPRs */ 625 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[0]); 626 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[1]); 627 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id); 628 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[2]); 629 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[3]); 630 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[4]); 631 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[5]); 632 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id); 633 break; 634 635 case MESA_SHADER_FRAGMENT: 636 declare_global_desc_pointers(ctx); 637 declare_per_stage_desc_pointers(ctx, true); 638 si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL, SI_PARAM_ALPHA_REF); 639 si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.prim_mask, 640 SI_PARAM_PRIM_MASK); 641 642 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_sample, 643 SI_PARAM_PERSP_SAMPLE); 644 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_center, 645 SI_PARAM_PERSP_CENTER); 646 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_centroid, 647 SI_PARAM_PERSP_CENTROID); 648 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, NULL, SI_PARAM_PERSP_PULL_MODEL); 649 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_sample, 650 SI_PARAM_LINEAR_SAMPLE); 651 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_center, 652 SI_PARAM_LINEAR_CENTER); 653 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_centroid, 654 SI_PARAM_LINEAR_CENTROID); 655 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL, 
SI_PARAM_LINE_STIPPLE_TEX); 656 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[0], 657 SI_PARAM_POS_X_FLOAT); 658 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[1], 659 SI_PARAM_POS_Y_FLOAT); 660 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[2], 661 SI_PARAM_POS_Z_FLOAT); 662 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[3], 663 SI_PARAM_POS_W_FLOAT); 664 shader->info.face_vgpr_index = ctx->args.num_vgprs_used; 665 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.front_face, 666 SI_PARAM_FRONT_FACE); 667 shader->info.ancillary_vgpr_index = ctx->args.num_vgprs_used; 668 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.ancillary, 669 SI_PARAM_ANCILLARY); 670 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.sample_coverage, 671 SI_PARAM_SAMPLE_COVERAGE); 672 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->pos_fixed_pt, 673 SI_PARAM_POS_FIXED_PT); 674 675 /* Color inputs from the prolog. */ 676 if (shader->selector->info.colors_read) { 677 unsigned num_color_elements = util_bitcount(shader->selector->info.colors_read); 678 679 for (i = 0; i < num_color_elements; i++) 680 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL); 681 682 num_prolog_vgprs += num_color_elements; 683 } 684 685 /* Outputs for the epilog. 
*/ 686 num_return_sgprs = SI_SGPR_ALPHA_REF + 1; 687 num_returns = num_return_sgprs + util_bitcount(shader->selector->info.colors_written) * 4 + 688 shader->selector->info.writes_z + shader->selector->info.writes_stencil + 689 shader->selector->info.writes_samplemask + 1 /* SampleMaskIn */; 690 691 num_returns = MAX2(num_returns, num_return_sgprs + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); 692 693 for (i = 0; i < num_return_sgprs; i++) 694 ac_add_return(&ctx->args, AC_ARG_SGPR); 695 for (; i < num_returns; i++) 696 ac_add_return(&ctx->args, AC_ARG_VGPR); 697 break; 698 699 case MESA_SHADER_COMPUTE: 700 declare_global_desc_pointers(ctx); 701 declare_per_stage_desc_pointers(ctx, true); 702 if (shader->selector->info.uses_grid_size) 703 ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->args.num_work_groups); 704 if (shader->selector->info.uses_variable_block_size) 705 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->block_size); 706 707 unsigned cs_user_data_dwords = 708 shader->selector->info.base.cs.user_data_components_amd; 709 if (cs_user_data_dwords) { 710 ac_add_arg(&ctx->args, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT, &ctx->cs_user_data); 711 } 712 713 /* Some descriptors can be in user SGPRs. */ 714 /* Shader buffers in user SGPRs. */ 715 for (unsigned i = 0; i < shader->selector->cs_num_shaderbufs_in_user_sgprs; i++) { 716 while (ctx->args.num_sgprs_used % 4 != 0) 717 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); 718 719 ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->cs_shaderbuf[i]); 720 } 721 /* Images in user SGPRs. */ 722 for (unsigned i = 0; i < shader->selector->cs_num_images_in_user_sgprs; i++) { 723 unsigned num_sgprs = shader->selector->info.base.image_buffers & (1 << i) ? 4 : 8; 724 725 while (ctx->args.num_sgprs_used % num_sgprs != 0) 726 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); 727 728 ac_add_arg(&ctx->args, AC_ARG_SGPR, num_sgprs, AC_ARG_INT, &ctx->cs_image[i]); 729 } 730 731 /* Hardware SGPRs. 
    */
   /* Workgroup IDs: one SGPR per used dimension. */
   for (i = 0; i < 3; i++) {
      if (shader->selector->info.uses_block_id[i]) {
         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.workgroup_ids[i]);
      }
   }
   if (shader->selector->info.uses_subgroup_info)
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tg_size);

   /* Hardware VGPRs. */
   /* NOTE(review): compute-only Aldebaran-class chips get 1 VGPR here instead of 3 —
    * presumably the local invocation IDs are packed into a single VGPR; confirm
    * against the hardware docs before relying on the layout. */
   if (!ctx->screen->info.has_graphics && ctx->screen->info.family >= CHIP_ALDEBARAN)
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.local_invocation_ids);
   else
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, &ctx->args.local_invocation_ids);
   break;
default:
   assert(0 && "unimplemented shader");
   return;
}

shader->info.num_input_sgprs = ctx->args.num_sgprs_used;
shader->info.num_input_vgprs = ctx->args.num_vgprs_used;

/* The prolog supplies some of the counted VGPRs; the main part only sees the rest. */
assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
shader->info.num_input_vgprs -= num_prolog_vgprs;
}

/* For the UMR disassembler. */
#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
#define DEBUGGER_NUM_MARKERS 5

/**
 * Link all ELF parts of a shader (prolog(s), previous merged stage, main part,
 * epilog) into a single runtime-linked binary via ac_rtld_open().
 *
 * Shared LDS symbols ("esgs_ring", "ngg_emit") are declared when the shader is
 * a GFX9+ GS or NGG shader, and shader->config.lds_size is derived from the
 * linked LDS usage (allocation granularity: 512 dw GFX7+, else 256).
 *
 * \return true on success; the caller must close \p rtld with ac_rtld_close().
 */
static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
                                  struct ac_rtld_binary *rtld)
{
   const struct si_shader_selector *sel = shader->selector;
   const char *part_elfs[5];
   size_t part_sizes[5];
   unsigned num_parts = 0;

#define add_part(shader_or_part)                                                                   \
   if (shader_or_part) {                                                                           \
      part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer;                                  \
      part_sizes[num_parts] = (shader_or_part)->binary.elf_size;                                   \
      num_parts++;                                                                                 \
   }

   add_part(shader->prolog);
   add_part(shader->previous_stage);
   add_part(shader->prolog2);
   add_part(shader);
   add_part(shader->epilog);

#undef add_part

   struct ac_rtld_symbol lds_symbols[2];
   unsigned num_lds_symbols = 0;

   if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader &&
       (sel->info.stage == MESA_SHADER_GEOMETRY || shader->key.as_ngg)) {
      struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
      sym->name = "esgs_ring";
      sym->size = shader->gs_info.esgs_ring_size * 4;
      sym->align = 64 * 1024;
   }

   if (shader->key.as_ngg && sel->info.stage == MESA_SHADER_GEOMETRY) {
      struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
      sym->name = "ngg_emit";
      sym->size = shader->ngg.ngg_emit_size * 4;
      sym->align = 4;
   }

   bool ok = ac_rtld_open(
      rtld, (struct ac_rtld_open_info){.info = &screen->info,
                                       .options =
                                          {
                                             .halt_at_entry = screen->options.halt_shaders,
                                          },
                                       .shader_type = sel->info.stage,
                                       .wave_size = si_get_shader_wave_size(shader),
                                       .num_parts = num_parts,
                                       .elf_ptrs = part_elfs,
                                       .elf_sizes = part_sizes,
                                       .num_shared_lds_symbols = num_lds_symbols,
                                       .shared_lds_symbols = lds_symbols});

   if (rtld->lds_size > 0) {
      unsigned alloc_granularity = screen->info.chip_class >= GFX7 ? 512 : 256;
      shader->config.lds_size = align(rtld->lds_size, alloc_granularity) / alloc_granularity;
   }

   return ok;
}

/**
 * Return the size in bytes of the executable portion of the linked binary.
 *
 * NOTE(review): the return value of si_shader_binary_open() is ignored here,
 * so on a failed open rtld.exec_size may be unreliable — confirm callers
 * tolerate a bogus size in that case.
 */
static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader)
{
   struct ac_rtld_binary rtld;
   si_shader_binary_open(screen, shader, &rtld);
   uint64_t size = rtld.exec_size;
   ac_rtld_close(&rtld);
   return size;
}

/**
 * ac_rtld upload callback: resolve the two scratch-resource descriptor dwords.
 * \param data  pointer to the 64-bit scratch virtual address
 */
static bool si_get_external_symbol(void *data, const char *name, uint64_t *value)
{
   uint64_t *scratch_va = data;

   if (!strcmp(scratch_rsrc_dword0_symbol, name)) {
      *value = (uint32_t)*scratch_va;
      return true;
   }
   if (!strcmp(scratch_rsrc_dword1_symbol, name)) {
      /* Enable scratch coalescing.
*/ 844 *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32) | S_008F04_SWIZZLE_ENABLE(1); 845 return true; 846 } 847 848 return false; 849} 850 851bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader, 852 uint64_t scratch_va) 853{ 854 struct ac_rtld_binary binary; 855 if (!si_shader_binary_open(sscreen, shader, &binary)) 856 return false; 857 858 si_resource_reference(&shader->bo, NULL); 859 shader->bo = si_aligned_buffer_create( 860 &sscreen->b, 861 (sscreen->info.cpdma_prefetch_writes_memory ? 0 : SI_RESOURCE_FLAG_READ_ONLY) | 862 SI_RESOURCE_FLAG_DRIVER_INTERNAL | SI_RESOURCE_FLAG_32BIT, 863 PIPE_USAGE_IMMUTABLE, align(binary.rx_size, SI_CPDMA_ALIGNMENT), 256); 864 if (!shader->bo) 865 return false; 866 867 /* Upload. */ 868 struct ac_rtld_upload_info u = {}; 869 u.binary = &binary; 870 u.get_external_symbol = si_get_external_symbol; 871 u.cb_data = &scratch_va; 872 u.rx_va = shader->bo->gpu_address; 873 u.rx_ptr = sscreen->ws->buffer_map(sscreen->ws, 874 shader->bo->buf, NULL, 875 PIPE_MAP_READ_WRITE | PIPE_MAP_UNSYNCHRONIZED | RADEON_MAP_TEMPORARY); 876 if (!u.rx_ptr) 877 return false; 878 879 int size = ac_rtld_upload(&u); 880 881 if (sscreen->debug_flags & DBG(SQTT)) { 882 /* Remember the uploaded code */ 883 shader->binary.uploaded_code_size = size; 884 shader->binary.uploaded_code = malloc(size); 885 memcpy(shader->binary.uploaded_code, u.rx_ptr, size); 886 } 887 888 sscreen->ws->buffer_unmap(sscreen->ws, shader->bo->buf); 889 ac_rtld_close(&binary); 890 891 return size >= 0; 892} 893 894static void si_shader_dump_disassembly(struct si_screen *screen, 895 const struct si_shader_binary *binary, 896 gl_shader_stage stage, unsigned wave_size, 897 struct pipe_debug_callback *debug, const char *name, 898 FILE *file) 899{ 900 struct ac_rtld_binary rtld_binary; 901 902 if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){ 903 .info = &screen->info, 904 .shader_type = stage, 905 .wave_size = wave_size, 906 .num_parts = 1, 907 
.elf_ptrs = &binary->elf_buffer, 908 .elf_sizes = &binary->elf_size})) 909 return; 910 911 const char *disasm; 912 size_t nbytes; 913 914 if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes)) 915 goto out; 916 917 if (nbytes > INT_MAX) 918 goto out; 919 920 if (debug && debug->debug_message) { 921 /* Very long debug messages are cut off, so send the 922 * disassembly one line at a time. This causes more 923 * overhead, but on the plus side it simplifies 924 * parsing of resulting logs. 925 */ 926 pipe_debug_message(debug, SHADER_INFO, "Shader Disassembly Begin"); 927 928 uint64_t line = 0; 929 while (line < nbytes) { 930 int count = nbytes - line; 931 const char *nl = memchr(disasm + line, '\n', nbytes - line); 932 if (nl) 933 count = nl - (disasm + line); 934 935 if (count) { 936 pipe_debug_message(debug, SHADER_INFO, "%.*s", count, disasm + line); 937 } 938 939 line += count + 1; 940 } 941 942 pipe_debug_message(debug, SHADER_INFO, "Shader Disassembly End"); 943 } 944 945 if (file) { 946 fprintf(file, "Shader %s disassembly:\n", name); 947 fprintf(file, "%*s", (int)nbytes, disasm); 948 } 949 950out: 951 ac_rtld_close(&rtld_binary); 952} 953 954static void si_calculate_max_simd_waves(struct si_shader *shader) 955{ 956 struct si_screen *sscreen = shader->selector->screen; 957 struct ac_shader_config *conf = &shader->config; 958 unsigned num_inputs = shader->selector->info.num_inputs; 959 unsigned lds_increment = sscreen->info.chip_class >= GFX7 ? 512 : 256; 960 unsigned lds_per_wave = 0; 961 unsigned max_simd_waves; 962 963 max_simd_waves = sscreen->info.max_wave64_per_simd; 964 965 /* Compute LDS usage for PS. */ 966 switch (shader->selector->info.stage) { 967 case MESA_SHADER_FRAGMENT: 968 /* The minimum usage per wave is (num_inputs * 48). The maximum 969 * usage is (num_inputs * 48 * 16). 970 * We can get anything in between and it varies between waves. 
       *
       * The 48 bytes per input for a single primitive is equal to
       * 4 bytes/component * 4 components/input * 3 points.
       *
       * Other stages don't know the size at compile time or don't
       * allocate LDS per wave, but instead they do it per thread group.
       */
      lds_per_wave = conf->lds_size * lds_increment + align(num_inputs * 48, lds_increment);
      break;
   case MESA_SHADER_COMPUTE: {
      unsigned max_workgroup_size = si_get_max_workgroup_size(shader);
      /* LDS is allocated per thread group; divide it among the waves of the group. */
      lds_per_wave = (conf->lds_size * lds_increment) /
                     DIV_ROUND_UP(max_workgroup_size, sscreen->compute_wave_size);
   }
      break;
   default:;
   }

   /* Compute the per-SIMD wave counts. */
   if (conf->num_sgprs) {
      max_simd_waves =
         MIN2(max_simd_waves, sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs);
   }

   if (conf->num_vgprs) {
      /* Always print wave limits as Wave64, so that we can compare
       * Wave32 and Wave64 with shader-db fairly. */
      unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd;
      max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs);
   }

   /* NOTE(review): the "/ 4" presumably shares the workgroup LDS among 4 SIMDs —
    * confirm against the hardware docs. */
   unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4;
   if (lds_per_wave)
      max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);

   shader->info.max_simd_waves = max_simd_waves;
}

/**
 * Emit a one-line statistics summary (registers, code size, LDS, scratch,
 * max waves, spills) through the debug callback, for shader-db consumption.
 * Optionally also dumps the main part's disassembly when
 * options.debug_disassembly is set.
 */
void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
                                        struct pipe_debug_callback *debug)
{
   const struct ac_shader_config *conf = &shader->config;

   if (screen->options.debug_disassembly)
      si_shader_dump_disassembly(screen, &shader->binary, shader->selector->info.stage,
                                 si_get_shader_wave_size(shader), debug, "main", NULL);

   pipe_debug_message(debug, SHADER_INFO,
                      "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
                      "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
                      "Spilled VGPRs: %d PrivMem VGPRs: %d",
                      conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader),
                      conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
                      conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs);
}

/**
 * Print human-readable shader config/statistics to \p file.
 * When \p check_debug_option is set, only prints if dumping is enabled for
 * this shader stage.
 */
static void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader *shader, FILE *file,
                                 bool check_debug_option)
{
   const struct ac_shader_config *conf = &shader->config;

   if (!check_debug_option || si_can_dump_shader(sscreen, shader->selector->info.stage)) {
      if (shader->selector->info.stage == MESA_SHADER_FRAGMENT) {
         fprintf(file,
                 "*** SHADER CONFIG ***\n"
                 "SPI_PS_INPUT_ADDR = 0x%04x\n"
                 "SPI_PS_INPUT_ENA = 0x%04x\n",
                 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
      }

      fprintf(file,
              "*** SHADER STATS ***\n"
              "SGPRS: %d\n"
              "VGPRS: %d\n"
              "Spilled SGPRs: %d\n"
              "Spilled VGPRs: %d\n"
              "Private memory VGPRs: %d\n"
              "Code Size: %d bytes\n"
              "LDS: %d blocks\n"
              "Scratch: %d bytes per wave\n"
              "Max Waves: %d\n"
              "********************\n\n\n",
              conf->num_sgprs, conf->num_vgprs, conf->spilled_sgprs, conf->spilled_vgprs,
              shader->info.private_mem_vgprs, si_get_shader_binary_size(sscreen, shader),
              conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves);
   }
}

/**
 * Return a human-readable name for the shader, including the HW stage it
 * runs as (derived from the as_ls/as_es/as_ngg key bits).
 */
const char *si_get_shader_name(const struct si_shader *shader)
{
   switch (shader->selector->info.stage) {
   case MESA_SHADER_VERTEX:
      if (shader->key.as_es)
         return "Vertex Shader as ES";
      else if (shader->key.as_ls)
         return "Vertex Shader as LS";
      else if (shader->key.as_ngg)
         return "Vertex Shader as ESGS";
      else
         return "Vertex Shader as VS";
   case MESA_SHADER_TESS_CTRL:
      return "Tessellation Control Shader";
   case MESA_SHADER_TESS_EVAL:
      if (shader->key.as_es)
         return "Tessellation Evaluation Shader as ES";
      else if (shader->key.as_ngg)
         return
"Tessellation Evaluation Shader as ESGS";
      else
         return "Tessellation Evaluation Shader as VS";
   case MESA_SHADER_GEOMETRY:
      if (shader->is_gs_copy_shader)
         return "GS Copy Shader as VS";
      else
         return "Geometry Shader";
   case MESA_SHADER_FRAGMENT:
      return "Pixel Shader";
   case MESA_SHADER_COMPUTE:
      return "Compute Shader";
   default:
      return "Unknown Shader";
   }
}

/**
 * Dump everything known about a shader: key, LLVM IR (if kept), disassembly
 * of every part (prolog(s), previous stage, main, epilog), and statistics.
 *
 * \param check_debug_option  when true, honor the per-stage dump debug flags
 *                            (and DBG(NO_ASM)); when false, dump unconditionally.
 */
void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
                    struct pipe_debug_callback *debug, FILE *file, bool check_debug_option)
{
   gl_shader_stage stage = shader->selector->info.stage;

   if (!check_debug_option || si_can_dump_shader(sscreen, stage))
      si_dump_shader_key(shader, file);

   if (!check_debug_option && shader->binary.llvm_ir_string) {
      if (shader->previous_stage && shader->previous_stage->binary.llvm_ir_string) {
         fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", si_get_shader_name(shader));
         fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
      }

      fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", si_get_shader_name(shader));
      fprintf(file, "%s\n", shader->binary.llvm_ir_string);
   }

   if (!check_debug_option ||
       (si_can_dump_shader(sscreen, stage) && !(sscreen->debug_flags & DBG(NO_ASM)))) {
      unsigned wave_size = si_get_shader_wave_size(shader);

      fprintf(file, "\n%s:\n", si_get_shader_name(shader));

      if (shader->prolog)
         si_shader_dump_disassembly(sscreen, &shader->prolog->binary, stage, wave_size, debug,
                                    "prolog", file);
      if (shader->previous_stage)
         si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, stage,
                                    wave_size, debug, "previous stage", file);
      if (shader->prolog2)
         si_shader_dump_disassembly(sscreen, &shader->prolog2->binary, stage, wave_size,
                                    debug, "prolog2", file);

      si_shader_dump_disassembly(sscreen, &shader->binary, stage, wave_size, debug, "main",
                                 file);

      if (shader->epilog)
         si_shader_dump_disassembly(sscreen, &shader->epilog->binary, stage, wave_size, debug,
                                    "epilog", file);
      fprintf(file, "\n");
   }

   si_shader_dump_stats(sscreen, shader, file, check_debug_option);
}

/**
 * Print the VS-prolog-related bits of a shader key (shared by VS, and by the
 * LS/ES prologs of merged TCS/GS shaders — hence the \p prefix).
 */
static void si_dump_shader_key_vs(const struct si_shader_key *key,
                                  const struct si_vs_prolog_bits *prolog, const char *prefix,
                                  FILE *f)
{
   fprintf(f, " %s.instance_divisor_is_one = %u\n", prefix, prolog->instance_divisor_is_one);
   fprintf(f, " %s.instance_divisor_is_fetched = %u\n", prefix,
           prolog->instance_divisor_is_fetched);
   fprintf(f, " %s.ls_vgpr_fix = %u\n", prefix, prolog->ls_vgpr_fix);

   fprintf(f, " mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode);
   fprintf(f, " mono.vs.fix_fetch = {");
   for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
      union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i];
      if (i)
         fprintf(f, ", ");
      if (!fix.bits)
         fprintf(f, "0");
      else
         fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, fix.u.num_channels_m1,
                 fix.u.format);
   }
   fprintf(f, "}\n");
}

/**
 * Print the full shader key for debugging, stage by stage.
 */
static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
{
   const struct si_shader_key *key = &shader->key;
   gl_shader_stage stage = shader->selector->info.stage;

   fprintf(f, "SHADER KEY\n");

   switch (stage) {
   case MESA_SHADER_VERTEX:
      si_dump_shader_key_vs(key, &key->part.vs.prolog, "part.vs.prolog", f);
      fprintf(f, " as_es = %u\n", key->as_es);
      fprintf(f, " as_ls = %u\n", key->as_ls);
      fprintf(f, " as_ngg = %u\n", key->as_ngg);
      fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id);
      break;

   case MESA_SHADER_TESS_CTRL:
      /* On GFX9+ the VS runs merged into the TCS, so its prolog key is part of ours. */
      if (shader->selector->screen->info.chip_class >= GFX9) {
         si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
"part.tcs.ls_prolog", f);
      }
      fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
      fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%" PRIx64 "\n",
              key->mono.u.ff_tcs_inputs_to_copy);
      fprintf(f, " opt.prefer_mono = %u\n", key->opt.prefer_mono);
      fprintf(f, " opt.same_patch_vertices = %u\n", key->opt.same_patch_vertices);
      break;

   case MESA_SHADER_TESS_EVAL:
      fprintf(f, " as_es = %u\n", key->as_es);
      fprintf(f, " as_ngg = %u\n", key->as_ngg);
      fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id);
      break;

   case MESA_SHADER_GEOMETRY:
      if (shader->is_gs_copy_shader)
         break;

      /* On GFX9+ a VS runs merged into the GS, so its prolog key is part of ours. */
      if (shader->selector->screen->info.chip_class >= GFX9 &&
          key->part.gs.es->info.stage == MESA_SHADER_VERTEX) {
         si_dump_shader_key_vs(key, &key->part.gs.vs_prolog, "part.gs.vs_prolog", f);
      }
      fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n",
              key->part.gs.prolog.tri_strip_adj_fix);
      fprintf(f, " as_ngg = %u\n", key->as_ngg);
      break;

   case MESA_SHADER_COMPUTE:
      break;

   case MESA_SHADER_FRAGMENT:
      fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
      fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
      fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
      fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n",
              key->part.ps.prolog.force_persp_sample_interp);
      fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n",
              key->part.ps.prolog.force_linear_sample_interp);
      fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n",
              key->part.ps.prolog.force_persp_center_interp);
      fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n",
              key->part.ps.prolog.force_linear_center_interp);
      fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n",
              key->part.ps.prolog.bc_optimize_for_persp);
      fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n",
              key->part.ps.prolog.bc_optimize_for_linear);
      fprintf(f, " part.ps.prolog.samplemask_log_ps_iter = %u\n",
              key->part.ps.prolog.samplemask_log_ps_iter);
      fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n",
              key->part.ps.epilog.spi_shader_col_format);
      fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
      fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
      fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
      fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
      fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
      fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n",
              key->part.ps.epilog.poly_line_smoothing);
      fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
      fprintf(f, " mono.u.ps.interpolate_at_sample_force_center = %u\n",
              key->mono.u.ps.interpolate_at_sample_force_center);
      fprintf(f, " mono.u.ps.fbfetch_msaa = %u\n", key->mono.u.ps.fbfetch_msaa);
      fprintf(f, " mono.u.ps.fbfetch_is_1D = %u\n", key->mono.u.ps.fbfetch_is_1D);
      fprintf(f, " mono.u.ps.fbfetch_layered = %u\n", key->mono.u.ps.fbfetch_layered);
      break;

   default:
      assert(0);
   }

   /* Common output-optimization bits for HW-VS-capable stages. */
   if ((stage == MESA_SHADER_GEOMETRY || stage == MESA_SHADER_TESS_EVAL ||
        stage == MESA_SHADER_VERTEX) &&
       !key->as_es && !key->as_ls) {
      fprintf(f, " opt.kill_outputs = 0x%" PRIx64 "\n", key->opt.kill_outputs);
      fprintf(f, " opt.kill_pointsize = 0x%x\n", key->opt.kill_pointsize);
      fprintf(f, " opt.kill_clip_distances = 0x%x\n", key->opt.kill_clip_distances);
      if (stage != MESA_SHADER_GEOMETRY)
         fprintf(f, " opt.ngg_culling = 0x%x\n", key->opt.ngg_culling);
   }

   fprintf(f, " opt.prefer_mono = %u\n", key->opt.prefer_mono);
   fprintf(f, " opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n",
           key->opt.inline_uniforms,
           key->opt.inlined_uniform_values[0],
           key->opt.inlined_uniform_values[1],
           key->opt.inlined_uniform_values[2],
           key->opt.inlined_uniform_values[3]);
}

/**
 * Return whether the VS needs a separate prolog part.
 */
bool si_vs_needs_prolog(const struct si_shader_selector *sel,
                        const struct si_vs_prolog_bits *prolog_key,
                        const struct si_shader_key *key, bool ngg_cull_shader)
{
   /* VGPR initialization fixup for Vega10 and Raven is always done in the
    * VS prolog. */
   return sel->vs_needs_prolog || prolog_key->ls_vgpr_fix ||
          /* The 2nd VS prolog loads input VGPRs from LDS */
          (key->opt.ngg_culling && !ngg_cull_shader);
}

/**
 * Compute the VS prolog key, which contains all the information needed to
 * build the VS prolog function, and set shader->info bits where needed.
 *
 * \param info             Shader info of the vertex shader.
 * \param num_input_sgprs  Number of input SGPRs for the vertex shader.
 * \param ngg_cull_shader  Whether the preceding shader part is the NGG cull shader.
 * \param prolog_key       Key of the VS prolog
 * \param shader_out       The vertex shader, or the next shader if merging LS+HS or ES+GS.
 * \param key              Output shader part key.
 */
void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_sgprs,
                          bool ngg_cull_shader, const struct si_vs_prolog_bits *prolog_key,
                          struct si_shader *shader_out, union si_shader_part_key *key)
{
   memset(key, 0, sizeof(*key));
   key->vs_prolog.states = *prolog_key;
   key->vs_prolog.num_input_sgprs = num_input_sgprs;
   key->vs_prolog.num_inputs = info->num_inputs;
   key->vs_prolog.as_ls = shader_out->key.as_ls;
   key->vs_prolog.as_es = shader_out->key.as_es;
   key->vs_prolog.as_ngg = shader_out->key.as_ngg;

   /* The prolog placed after the NGG cull shader reloads input VGPRs from LDS. */
   if (!ngg_cull_shader && shader_out->key.opt.ngg_culling)
      key->vs_prolog.load_vgprs_after_culling = 1;

   /* When merged into the next stage, force the matching HW stage bits. */
   if (shader_out->selector->info.stage == MESA_SHADER_TESS_CTRL) {
      key->vs_prolog.as_ls = 1;
      key->vs_prolog.num_merged_next_stage_vgprs = 2;
   } else if (shader_out->selector->info.stage == MESA_SHADER_GEOMETRY) {
      key->vs_prolog.as_es = 1;
      key->vs_prolog.num_merged_next_stage_vgprs = 5;
   } else if (shader_out->key.as_ngg) {
      key->vs_prolog.num_merged_next_stage_vgprs = 5;
   }

   /* Only one of these combinations can be set. as_ngg can be set with as_es. */
   assert(key->vs_prolog.as_ls + key->vs_prolog.as_ngg +
             (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) <= 1);

   /* Enable loading the InstanceID VGPR. */
   uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);

   if ((key->vs_prolog.states.instance_divisor_is_one |
        key->vs_prolog.states.instance_divisor_is_fetched) &
       input_mask)
      shader_out->info.uses_instanceid = true;
}

/**
 * Return the NIR for a selector: either the live sel->nir, or a fresh
 * deserialization of sel->nir_binary (then *free_nir is set and the caller
 * owns the returned shader). When \p key requests uniform inlining, also
 * runs nir_inline_uniforms plus follow-up optimizations on the private copy.
 *
 * \return NULL if the selector has no NIR in either form.
 */
struct nir_shader *si_get_nir_shader(struct si_shader_selector *sel,
                                     const struct si_shader_key *key,
                                     bool *free_nir)
{
   nir_shader *nir;
   *free_nir = false;

   if (sel->nir) {
      nir = sel->nir;
   } else if (sel->nir_binary) {
      struct pipe_screen *screen = &sel->screen->b;
      const void *options = screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR,
                                                         pipe_shader_type_from_mesa(sel->info.stage));

      struct blob_reader blob_reader;
      blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size);
      *free_nir = true;
      nir = nir_deserialize(NULL, options, &blob_reader);
   } else {
      return NULL;
   }

   if (key && key->opt.inline_uniforms) {
      /* Inlining mutates the NIR, so it must be our own copy. */
      assert(*free_nir);

      /* Most places use shader information from the default variant, not
       * the optimized variant. These are the things that the driver looks at
       * in optimized variants and the list of things that we need to do.
       *
       * The driver takes into account these things if they suddenly disappear
       * from the shader code:
       * - Register usage and code size decrease (obvious)
       * - Eliminated PS system values are disabled by LLVM
       *   (FragCoord, FrontFace, barycentrics)
       * - VS/TES/GS outputs feeding PS are eliminated if outputs are undef.
       *   (thanks to an LLVM pass in Mesa - TODO: move it to NIR)
       *   The storage for eliminated outputs is also not allocated.
       * - VS/TCS/TES/GS/PS input loads are eliminated (VS relies on DCE in LLVM)
       * - TCS output stores are eliminated
       *
       * TODO: These are things the driver ignores in the final shader code
       * and relies on the default shader info.
       *
       * - Other system values are not eliminated
       * - PS.NUM_INTERP = bitcount64(inputs_read), renumber inputs
       *   to remove holes
       * - uses_discard - if it changed to false
       * - writes_memory - if it changed to false
       * - VS->TCS, VS->GS, TES->GS output stores for the former stage are not
       *   eliminated
       * - Eliminated VS/TCS/TES outputs are still allocated. (except when feeding PS)
       *   GS outputs are eliminated except for the temporary LDS.
       *   Clip distances, gl_PointSize, and PS outputs are eliminated based
       *   on current states, so we don't care about the shader code.
       *
       * TODO: Merged shaders don't inline uniforms for the first stage.
       * VS-GS: only GS inlines uniforms; VS-TCS: only TCS; TES-GS: only GS.
       * (key == NULL for the first stage here)
       *
       * TODO: Compute shaders don't support inlinable uniforms, because they
       * don't have shader variants.
       *
       * TODO: The driver uses a linear search to find a shader variant. This
       * can be really slow if we get too many variants due to uniform inlining.
       */
      NIR_PASS_V(nir, nir_inline_uniforms,
                 nir->info.num_inlinable_uniforms,
                 key->opt.inlined_uniform_values,
                 nir->info.inlinable_uniform_dw_offsets);

      si_nir_opts(sel->screen, nir, true);
      si_nir_late_opts(nir);

      /* This must be done again. */
      NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in |
                 nir_var_shader_out);
   }

   return nir;
}

/**
 * Compile the main part of a shader variant: NIR -> LLVM -> binary, then fill
 * in derived info (vs_output_ps_input_cntl, input SGPR/VGPR counts, max waves)
 * and report stats through \p debug.
 *
 * \return true on success.
 */
bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                       struct si_shader *shader, struct pipe_debug_callback *debug)
{
   struct si_shader_selector *sel = shader->selector;
   bool free_nir;
   struct nir_shader *nir = si_get_nir_shader(sel, &shader->key, &free_nir);

   /* Dump NIR before doing NIR->LLVM conversion in case the
    * conversion fails. */
   if (si_can_dump_shader(sscreen, sel->info.stage) &&
       !(sscreen->debug_flags & DBG(NO_NIR))) {
      nir_print_shader(nir, stderr);
      si_dump_streamout(&sel->so);
   }

   /* Initialize vs_output_ps_input_cntl to default. */
   for (unsigned i = 0; i < ARRAY_SIZE(shader->info.vs_output_ps_input_cntl); i++)
      shader->info.vs_output_ps_input_cntl[i] = SI_PS_INPUT_CNTL_UNUSED;
   shader->info.vs_output_ps_input_cntl[VARYING_SLOT_COL0] = SI_PS_INPUT_CNTL_UNUSED_COLOR0;

   shader->info.uses_instanceid = sel->info.uses_instanceid;

   /* TODO: ACO could compile non-monolithic shaders here (starting
    * with PS and NGG VS), but monolithic shaders should be compiled
    * by LLVM due to more complicated compilation.
    */
   if (!si_llvm_compile_shader(sscreen, compiler, shader, debug, nir, free_nir))
      return false;

   /* Compute vs_output_ps_input_cntl. */
   if ((sel->info.stage == MESA_SHADER_VERTEX ||
        sel->info.stage == MESA_SHADER_TESS_EVAL ||
        sel->info.stage == MESA_SHADER_GEOMETRY) &&
       !shader->key.as_ls && !shader->key.as_es) {
      ubyte *vs_output_param_offset = shader->info.vs_output_param_offset;

      /* Legacy GS exports through its copy shader, so use that one's offsets. */
      if (sel->info.stage == MESA_SHADER_GEOMETRY && !shader->key.as_ngg)
         vs_output_param_offset = sel->gs_copy_shader->info.vs_output_param_offset;

      /* VS and TES should also set primitive ID output if it's used. */
      unsigned num_outputs_with_prim_id = sel->info.num_outputs +
                                          shader->key.mono.u.vs_export_prim_id;

      for (unsigned i = 0; i < num_outputs_with_prim_id; i++) {
         unsigned semantic = sel->info.output_semantic[i];
         unsigned offset = vs_output_param_offset[i];
         unsigned ps_input_cntl;

         if (offset <= AC_EXP_PARAM_OFFSET_31) {
            /* The input is loaded from parameter memory. */
            ps_input_cntl = S_028644_OFFSET(offset);
         } else {
            /* The input is a DEFAULT_VAL constant. */
            assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
                   offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
            offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;

            /* OFFSET=0x20 means that DEFAULT_VAL is used. */
            ps_input_cntl = S_028644_OFFSET(0x20) |
                            S_028644_DEFAULT_VAL(offset);
         }

         shader->info.vs_output_ps_input_cntl[semantic] = ps_input_cntl;
      }
   }

   /* Validate SGPR and VGPR usage for compute to detect compiler bugs. */
   if (sel->info.stage == MESA_SHADER_COMPUTE) {
      unsigned wave_size = sscreen->compute_wave_size;
      unsigned max_vgprs =
         sscreen->info.num_physical_wave64_vgprs_per_simd * (wave_size == 32 ? 2 : 1);
      unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd;
      unsigned max_sgprs_per_wave = 128;
      unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
      unsigned threads_per_tg = si_get_max_workgroup_size(shader);
      unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, wave_size);
      unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);

      max_vgprs = max_vgprs / waves_per_simd;
      max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);

      if (shader->config.num_sgprs > max_sgprs || shader->config.num_vgprs > max_vgprs) {
         fprintf(stderr,
                 "LLVM failed to compile a shader correctly: "
                 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
                 shader->config.num_sgprs, shader->config.num_vgprs, max_sgprs, max_vgprs);

         /* Just terminate the process, because dependent
          * shaders can hang due to bad input data, but use
          * the env var to allow shader-db to work.
          */
         if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
            abort();
      }
   }

   /* Add the scratch offset to input SGPRs.
    */
   if (shader->config.scratch_bytes_per_wave && !si_is_merged_shader(shader))
      shader->info.num_input_sgprs += 1; /* scratch byte offset */

   /* Calculate the number of fragment input VGPRs. */
   if (sel->info.stage == MESA_SHADER_FRAGMENT) {
      shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(
         &shader->config, &shader->info.face_vgpr_index, &shader->info.ancillary_vgpr_index);
   }

   si_calculate_max_simd_waves(shader);
   si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
   return true;
}

/**
 * Create, compile and return a shader part (prolog or epilog).
 *
 * Results are cached in \p list under sscreen->shader_parts_mutex, keyed by
 * \p key, so identical parts are compiled only once.
 *
 * \param sscreen  screen
 * \param list     list of shader parts of the same category
 * \param stage    shader stage
 * \param prolog   whether the part being requested is a prolog
 * \param key      shader part key
 * \param compiler LLVM compiler handle
 * \param debug    debug callback
 * \param build    the callback responsible for building the main function
 * \param name     human-readable name for dumps/debug
 * \return non-NULL on success
 */
static struct si_shader_part *
si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
                   gl_shader_stage stage, bool prolog, union si_shader_part_key *key,
                   struct ac_llvm_compiler *compiler, struct pipe_debug_callback *debug,
                   void (*build)(struct si_shader_context *, union si_shader_part_key *),
                   const char *name)
{
   struct si_shader_part *result;

   simple_mtx_lock(&sscreen->shader_parts_mutex);

   /* Find existing. */
   for (result = *list; result; result = result->next) {
      if (memcmp(&result->key, key, sizeof(*key)) == 0) {
         simple_mtx_unlock(&sscreen->shader_parts_mutex);
         return result;
      }
   }

   /* Compile a new one. */
   /* NOTE(review): CALLOC_STRUCT result is dereferenced without a NULL check —
    * confirm whether OOM-aborting allocation is the convention here. */
   result = CALLOC_STRUCT(si_shader_part);
   result->key = *key;

   /* Build a dummy selector/shader pair carrying just the key bits the
    * part builders look at. */
   struct si_shader_selector sel = {};
   sel.screen = sscreen;

   struct si_shader shader = {};
   shader.selector = &sel;

   switch (stage) {
   case MESA_SHADER_VERTEX:
      shader.key.as_ls = key->vs_prolog.as_ls;
      shader.key.as_es = key->vs_prolog.as_es;
      shader.key.as_ngg = key->vs_prolog.as_ngg;
      break;
   case MESA_SHADER_TESS_CTRL:
      assert(!prolog);
      shader.key.part.tcs.epilog = key->tcs_epilog.states;
      break;
   case MESA_SHADER_GEOMETRY:
      assert(prolog);
      shader.key.as_ngg = key->gs_prolog.as_ngg;
      break;
   case MESA_SHADER_FRAGMENT:
      if (prolog)
         shader.key.part.ps.prolog = key->ps_prolog.states;
      else
         shader.key.part.ps.epilog = key->ps_epilog.states;
      break;
   default:
      unreachable("bad shader part");
   }

   struct si_shader_context ctx;
   si_llvm_context_init(&ctx, sscreen, compiler,
                        si_get_wave_size(sscreen, stage,
                                         shader.key.as_ngg, shader.key.as_es));
   ctx.shader = &shader;
   ctx.stage = stage;

   build(&ctx, key);

   /* Compile. */
   si_llvm_optimize_module(&ctx);

   if (!si_compile_llvm(sscreen, &result->binary, &result->config, compiler, &ctx.ac, debug,
                        ctx.stage, name, false)) {
      FREE(result);
      result = NULL;
      goto out;
   }

   result->next = *list;
   *list = result;

out:
   si_llvm_dispose(&ctx);
   simple_mtx_unlock(&sscreen->shader_parts_mutex);
   return result;
}

/**
 * Select (or build) the VS prolog for \p main_part and store it in
 * shader->prolog. Returns true when no prolog is needed or it was obtained.
 */
static bool si_get_vs_prolog(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                             struct si_shader *shader, struct pipe_debug_callback *debug,
                             struct si_shader *main_part, const struct si_vs_prolog_bits *key)
{
   struct si_shader_selector *vs = main_part->selector;

   if (!si_vs_needs_prolog(vs, key, &shader->key, false))
      return true;

   /* Get the prolog.
    */
   union si_shader_part_key prolog_key;
   si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, false, key, shader,
                        &prolog_key);

   shader->prolog =
      si_get_shader_part(sscreen, &sscreen->vs_prologs, MESA_SHADER_VERTEX, true, &prolog_key,
                         compiler, debug, si_llvm_build_vs_prolog, "Vertex Shader Prolog");
   return shader->prolog != NULL;
}

/**
 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
 */
static bool si_shader_select_vs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                                      struct si_shader *shader, struct pipe_debug_callback *debug)
{
   return si_get_vs_prolog(sscreen, compiler, shader, debug, shader, &shader->key.part.vs.prolog);
}

/**
 * Select and compile (or reuse) TCS parts (epilog).
 */
static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                                       struct si_shader *shader, struct pipe_debug_callback *debug)
{
   /* On GFX9+ the LS (vertex) stage is merged in, so it may need a VS prolog. */
   if (sscreen->info.chip_class >= GFX9) {
      struct si_shader *ls_main_part = shader->key.part.tcs.ls->main_shader_part_ls;

      if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part,
                            &shader->key.part.tcs.ls_prolog))
         return false;

      shader->previous_stage = ls_main_part;
   }

   /* Get the epilog. */
   union si_shader_part_key epilog_key;
   memset(&epilog_key, 0, sizeof(epilog_key));
   epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;

   shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, MESA_SHADER_TESS_CTRL, false,
                                       &epilog_key, compiler, debug, si_llvm_build_tcs_epilog,
                                       "Tessellation Control Shader Epilog");
   return shader->epilog != NULL;
}

/**
 * Select and compile (or reuse) GS parts (prolog).
 */
static bool si_shader_select_gs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                                      struct si_shader *shader, struct pipe_debug_callback *debug)
{
   /* On GFX9+ the ES stage is merged in, so it may need a VS prolog. */
   if (sscreen->info.chip_class >= GFX9) {
      struct si_shader *es_main_part;

      if (shader->key.as_ngg)
         es_main_part = shader->key.part.gs.es->main_shader_part_ngg_es;
      else
         es_main_part = shader->key.part.gs.es->main_shader_part_es;

      if (shader->key.part.gs.es->info.stage == MESA_SHADER_VERTEX &&
          !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part,
                            &shader->key.part.gs.vs_prolog))
         return false;

      shader->previous_stage = es_main_part;
   }

   if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
      return true;

   union si_shader_part_key prolog_key;
   memset(&prolog_key, 0, sizeof(prolog_key));
   prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
   prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;

   shader->prolog2 =
      si_get_shader_part(sscreen, &sscreen->gs_prologs, MESA_SHADER_GEOMETRY, true, &prolog_key,
                         compiler, debug, si_llvm_build_gs_prolog, "Geometry Shader Prolog");
   return shader->prolog2 != NULL;
}

/**
 * Compute the PS prolog key, which contains all the information needed to
 * build the PS prolog function, and set related bits in shader->config.
1711 */ 1712void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key, 1713 bool separate_prolog) 1714{ 1715 struct si_shader_info *info = &shader->selector->info; 1716 1717 memset(key, 0, sizeof(*key)); 1718 key->ps_prolog.states = shader->key.part.ps.prolog; 1719 key->ps_prolog.colors_read = info->colors_read; 1720 key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs; 1721 key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs; 1722 key->ps_prolog.wqm = 1723 info->base.fs.needs_quad_helper_invocations && 1724 (key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp || 1725 key->ps_prolog.states.force_linear_sample_interp || 1726 key->ps_prolog.states.force_persp_center_interp || 1727 key->ps_prolog.states.force_linear_center_interp || 1728 key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear); 1729 key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index; 1730 1731 if (info->colors_read) { 1732 ubyte *color = shader->selector->color_attr_index; 1733 1734 if (shader->key.part.ps.prolog.color_two_side) { 1735 /* BCOLORs are stored after the last input. 
*/ 1736 key->ps_prolog.num_interp_inputs = info->num_inputs; 1737 key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index; 1738 if (separate_prolog) 1739 shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1); 1740 } 1741 1742 for (unsigned i = 0; i < 2; i++) { 1743 unsigned interp = info->color_interpolate[i]; 1744 unsigned location = info->color_interpolate_loc[i]; 1745 1746 if (!(info->colors_read & (0xf << i * 4))) 1747 continue; 1748 1749 key->ps_prolog.color_attr_index[i] = color[i]; 1750 1751 if (shader->key.part.ps.prolog.flatshade_colors && interp == INTERP_MODE_COLOR) 1752 interp = INTERP_MODE_FLAT; 1753 1754 switch (interp) { 1755 case INTERP_MODE_FLAT: 1756 key->ps_prolog.color_interp_vgpr_index[i] = -1; 1757 break; 1758 case INTERP_MODE_SMOOTH: 1759 case INTERP_MODE_COLOR: 1760 /* Force the interpolation location for colors here. */ 1761 if (shader->key.part.ps.prolog.force_persp_sample_interp) 1762 location = TGSI_INTERPOLATE_LOC_SAMPLE; 1763 if (shader->key.part.ps.prolog.force_persp_center_interp) 1764 location = TGSI_INTERPOLATE_LOC_CENTER; 1765 1766 switch (location) { 1767 case TGSI_INTERPOLATE_LOC_SAMPLE: 1768 key->ps_prolog.color_interp_vgpr_index[i] = 0; 1769 if (separate_prolog) { 1770 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1); 1771 } 1772 break; 1773 case TGSI_INTERPOLATE_LOC_CENTER: 1774 key->ps_prolog.color_interp_vgpr_index[i] = 2; 1775 if (separate_prolog) { 1776 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); 1777 } 1778 break; 1779 case TGSI_INTERPOLATE_LOC_CENTROID: 1780 key->ps_prolog.color_interp_vgpr_index[i] = 4; 1781 if (separate_prolog) { 1782 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTROID_ENA(1); 1783 } 1784 break; 1785 default: 1786 assert(0); 1787 } 1788 break; 1789 case INTERP_MODE_NOPERSPECTIVE: 1790 /* Force the interpolation location for colors here. 
*/ 1791 if (shader->key.part.ps.prolog.force_linear_sample_interp) 1792 location = TGSI_INTERPOLATE_LOC_SAMPLE; 1793 if (shader->key.part.ps.prolog.force_linear_center_interp) 1794 location = TGSI_INTERPOLATE_LOC_CENTER; 1795 1796 /* The VGPR assignment for non-monolithic shaders 1797 * works because InitialPSInputAddr is set on the 1798 * main shader and PERSP_PULL_MODEL is never used. 1799 */ 1800 switch (location) { 1801 case TGSI_INTERPOLATE_LOC_SAMPLE: 1802 key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 6 : 9; 1803 if (separate_prolog) { 1804 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1); 1805 } 1806 break; 1807 case TGSI_INTERPOLATE_LOC_CENTER: 1808 key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 8 : 11; 1809 if (separate_prolog) { 1810 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); 1811 } 1812 break; 1813 case TGSI_INTERPOLATE_LOC_CENTROID: 1814 key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 10 : 13; 1815 if (separate_prolog) { 1816 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTROID_ENA(1); 1817 } 1818 break; 1819 default: 1820 assert(0); 1821 } 1822 break; 1823 default: 1824 assert(0); 1825 } 1826 } 1827 } 1828} 1829 1830/** 1831 * Check whether a PS prolog is required based on the key. 1832 */ 1833bool si_need_ps_prolog(const union si_shader_part_key *key) 1834{ 1835 return key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp || 1836 key->ps_prolog.states.force_linear_sample_interp || 1837 key->ps_prolog.states.force_persp_center_interp || 1838 key->ps_prolog.states.force_linear_center_interp || 1839 key->ps_prolog.states.bc_optimize_for_persp || 1840 key->ps_prolog.states.bc_optimize_for_linear || key->ps_prolog.states.poly_stipple || 1841 key->ps_prolog.states.samplemask_log_ps_iter; 1842} 1843 1844/** 1845 * Compute the PS epilog key, which contains all the information needed to 1846 * build the PS epilog function. 
1847 */ 1848void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key) 1849{ 1850 struct si_shader_info *info = &shader->selector->info; 1851 memset(key, 0, sizeof(*key)); 1852 key->ps_epilog.colors_written = info->colors_written; 1853 key->ps_epilog.color_types = info->output_color_types; 1854 key->ps_epilog.writes_z = info->writes_z; 1855 key->ps_epilog.writes_stencil = info->writes_stencil; 1856 key->ps_epilog.writes_samplemask = info->writes_samplemask; 1857 key->ps_epilog.states = shader->key.part.ps.epilog; 1858} 1859 1860/** 1861 * Select and compile (or reuse) pixel shader parts (prolog & epilog). 1862 */ 1863static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, 1864 struct si_shader *shader, struct pipe_debug_callback *debug) 1865{ 1866 union si_shader_part_key prolog_key; 1867 union si_shader_part_key epilog_key; 1868 1869 /* Get the prolog. */ 1870 si_get_ps_prolog_key(shader, &prolog_key, true); 1871 1872 /* The prolog is a no-op if these aren't set. */ 1873 if (si_need_ps_prolog(&prolog_key)) { 1874 shader->prolog = 1875 si_get_shader_part(sscreen, &sscreen->ps_prologs, MESA_SHADER_FRAGMENT, true, &prolog_key, 1876 compiler, debug, si_llvm_build_ps_prolog, "Fragment Shader Prolog"); 1877 if (!shader->prolog) 1878 return false; 1879 } 1880 1881 /* Get the epilog. */ 1882 si_get_ps_epilog_key(shader, &epilog_key); 1883 1884 shader->epilog = 1885 si_get_shader_part(sscreen, &sscreen->ps_epilogs, MESA_SHADER_FRAGMENT, false, &epilog_key, 1886 compiler, debug, si_llvm_build_ps_epilog, "Fragment Shader Epilog"); 1887 if (!shader->epilog) 1888 return false; 1889 1890 /* Enable POS_FIXED_PT if polygon stippling is enabled. 
*/ 1891 if (shader->key.part.ps.prolog.poly_stipple) { 1892 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1); 1893 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr)); 1894 } 1895 1896 /* Set up the enable bits for per-sample shading if needed. */ 1897 if (shader->key.part.ps.prolog.force_persp_sample_interp && 1898 (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) || 1899 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) { 1900 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA; 1901 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; 1902 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1); 1903 } 1904 if (shader->key.part.ps.prolog.force_linear_sample_interp && 1905 (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) || 1906 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) { 1907 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA; 1908 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; 1909 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1); 1910 } 1911 if (shader->key.part.ps.prolog.force_persp_center_interp && 1912 (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) || 1913 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) { 1914 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA; 1915 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; 1916 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); 1917 } 1918 if (shader->key.part.ps.prolog.force_linear_center_interp && 1919 (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) || 1920 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) { 1921 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA; 1922 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; 1923 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); 1924 } 1925 1926 /* POW_W_FLOAT requires that one 
of the perspective weights is enabled. */ 1927 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) && 1928 !(shader->config.spi_ps_input_ena & 0xf)) { 1929 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); 1930 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr)); 1931 } 1932 1933 /* At least one pair of interpolation weights must be enabled. */ 1934 if (!(shader->config.spi_ps_input_ena & 0x7f)) { 1935 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); 1936 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr)); 1937 } 1938 1939 /* Samplemask fixup requires the sample ID. */ 1940 if (shader->key.part.ps.prolog.samplemask_log_ps_iter) { 1941 shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1); 1942 assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr)); 1943 } 1944 1945 /* The sample mask input is always enabled, because the API shader always 1946 * passes it through to the epilog. Disable it here if it's unused. 1947 */ 1948 if (!shader->key.part.ps.epilog.poly_line_smoothing && !shader->selector->info.reads_samplemask) 1949 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA; 1950 1951 return true; 1952} 1953 1954void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size) 1955{ 1956 /* If tessellation is all offchip and on-chip GS isn't used, this 1957 * workaround is not needed. 1958 */ 1959 return; 1960 1961 /* SPI barrier management bug: 1962 * Make sure we have at least 4k of LDS in use to avoid the bug. 1963 * It applies to workgroup sizes of more than one wavefront. 
1964 */ 1965 if (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_KABINI) 1966 *lds_size = MAX2(*lds_size, 8); 1967} 1968 1969void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader) 1970{ 1971 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */ 1972 1973 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs); 1974 1975 if (shader->selector->info.stage == MESA_SHADER_COMPUTE && 1976 si_get_max_workgroup_size(shader) > sscreen->compute_wave_size) { 1977 si_multiwave_lds_size_workaround(sscreen, &shader->config.lds_size); 1978 } 1979} 1980 1981bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, 1982 struct si_shader *shader, struct pipe_debug_callback *debug) 1983{ 1984 struct si_shader_selector *sel = shader->selector; 1985 struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key); 1986 1987 /* LS, ES, VS are compiled on demand if the main part hasn't been 1988 * compiled for that stage. 1989 * 1990 * GS are compiled on demand if the main part hasn't been compiled 1991 * for the chosen NGG-ness. 1992 * 1993 * Vertex shaders are compiled on demand when a vertex fetch 1994 * workaround must be applied. 1995 */ 1996 if (shader->is_monolithic) { 1997 /* Monolithic shader (compiled as a whole, has many variants, 1998 * may take a long time to compile). 1999 */ 2000 if (!si_compile_shader(sscreen, compiler, shader, debug)) 2001 return false; 2002 } else { 2003 /* The shader consists of several parts: 2004 * 2005 * - the middle part is the user shader, it has 1 variant only 2006 * and it was compiled during the creation of the shader 2007 * selector 2008 * - the prolog part is inserted at the beginning 2009 * - the epilog part is inserted at the end 2010 * 2011 * The prolog and epilog have many (but simple) variants. 
2012 * 2013 * Starting with gfx9, geometry and tessellation control 2014 * shaders also contain the prolog and user shader parts of 2015 * the previous shader stage. 2016 */ 2017 2018 if (!mainp) 2019 return false; 2020 2021 /* Copy the compiled shader data over. */ 2022 shader->is_binary_shared = true; 2023 shader->binary = mainp->binary; 2024 shader->config = mainp->config; 2025 shader->info.num_input_sgprs = mainp->info.num_input_sgprs; 2026 shader->info.num_input_vgprs = mainp->info.num_input_vgprs; 2027 shader->info.face_vgpr_index = mainp->info.face_vgpr_index; 2028 shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index; 2029 memcpy(shader->info.vs_output_ps_input_cntl, mainp->info.vs_output_ps_input_cntl, 2030 sizeof(mainp->info.vs_output_ps_input_cntl)); 2031 shader->info.uses_instanceid = mainp->info.uses_instanceid; 2032 shader->info.nr_pos_exports = mainp->info.nr_pos_exports; 2033 shader->info.nr_param_exports = mainp->info.nr_param_exports; 2034 2035 /* Select prologs and/or epilogs. */ 2036 switch (sel->info.stage) { 2037 case MESA_SHADER_VERTEX: 2038 if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug)) 2039 return false; 2040 break; 2041 case MESA_SHADER_TESS_CTRL: 2042 if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug)) 2043 return false; 2044 break; 2045 case MESA_SHADER_TESS_EVAL: 2046 break; 2047 case MESA_SHADER_GEOMETRY: 2048 if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug)) 2049 return false; 2050 break; 2051 case MESA_SHADER_FRAGMENT: 2052 if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug)) 2053 return false; 2054 2055 /* Make sure we have at least as many VGPRs as there 2056 * are allocated inputs. 2057 */ 2058 shader->config.num_vgprs = MAX2(shader->config.num_vgprs, shader->info.num_input_vgprs); 2059 break; 2060 default:; 2061 } 2062 2063 /* Update SGPR and VGPR counts. 
*/ 2064 if (shader->prolog) { 2065 shader->config.num_sgprs = 2066 MAX2(shader->config.num_sgprs, shader->prolog->config.num_sgprs); 2067 shader->config.num_vgprs = 2068 MAX2(shader->config.num_vgprs, shader->prolog->config.num_vgprs); 2069 } 2070 if (shader->previous_stage) { 2071 shader->config.num_sgprs = 2072 MAX2(shader->config.num_sgprs, shader->previous_stage->config.num_sgprs); 2073 shader->config.num_vgprs = 2074 MAX2(shader->config.num_vgprs, shader->previous_stage->config.num_vgprs); 2075 shader->config.spilled_sgprs = 2076 MAX2(shader->config.spilled_sgprs, shader->previous_stage->config.spilled_sgprs); 2077 shader->config.spilled_vgprs = 2078 MAX2(shader->config.spilled_vgprs, shader->previous_stage->config.spilled_vgprs); 2079 shader->info.private_mem_vgprs = 2080 MAX2(shader->info.private_mem_vgprs, shader->previous_stage->info.private_mem_vgprs); 2081 shader->config.scratch_bytes_per_wave = 2082 MAX2(shader->config.scratch_bytes_per_wave, 2083 shader->previous_stage->config.scratch_bytes_per_wave); 2084 shader->info.uses_instanceid |= shader->previous_stage->info.uses_instanceid; 2085 } 2086 if (shader->prolog2) { 2087 shader->config.num_sgprs = 2088 MAX2(shader->config.num_sgprs, shader->prolog2->config.num_sgprs); 2089 shader->config.num_vgprs = 2090 MAX2(shader->config.num_vgprs, shader->prolog2->config.num_vgprs); 2091 } 2092 if (shader->epilog) { 2093 shader->config.num_sgprs = 2094 MAX2(shader->config.num_sgprs, shader->epilog->config.num_sgprs); 2095 shader->config.num_vgprs = 2096 MAX2(shader->config.num_vgprs, shader->epilog->config.num_vgprs); 2097 } 2098 si_calculate_max_simd_waves(shader); 2099 } 2100 2101 if (shader->key.as_ngg) { 2102 assert(!shader->key.as_es && !shader->key.as_ls); 2103 if (!gfx10_ngg_calculate_subgroup_info(shader)) { 2104 fprintf(stderr, "Failed to compute subgroup info\n"); 2105 return false; 2106 } 2107 } else if (sscreen->info.chip_class >= GFX9 && sel->info.stage == MESA_SHADER_GEOMETRY) { 2108 
gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info); 2109 } 2110 2111 shader->uses_vs_state_provoking_vertex = 2112 sscreen->use_ngg && 2113 /* Used to convert triangle strips from GS to triangles. */ 2114 ((sel->info.stage == MESA_SHADER_GEOMETRY && 2115 util_rast_prim_is_triangles(sel->info.base.gs.output_primitive)) || 2116 (sel->info.stage == MESA_SHADER_VERTEX && 2117 /* Used to export PrimitiveID from the correct vertex. */ 2118 shader->key.mono.u.vs_export_prim_id)); 2119 2120 shader->uses_vs_state_outprim = sscreen->use_ngg && 2121 /* Only used by streamout in vertex shaders. */ 2122 sel->info.stage == MESA_SHADER_VERTEX && 2123 sel->so.num_outputs; 2124 2125 if (sel->info.stage == MESA_SHADER_VERTEX) { 2126 shader->uses_base_instance = sel->info.uses_base_instance || 2127 shader->key.part.vs.prolog.instance_divisor_is_one || 2128 shader->key.part.vs.prolog.instance_divisor_is_fetched; 2129 } else if (sel->info.stage == MESA_SHADER_TESS_CTRL) { 2130 shader->uses_base_instance = shader->previous_stage_sel && 2131 (shader->previous_stage_sel->info.uses_base_instance || 2132 shader->key.part.tcs.ls_prolog.instance_divisor_is_one || 2133 shader->key.part.tcs.ls_prolog.instance_divisor_is_fetched); 2134 } else if (sel->info.stage == MESA_SHADER_GEOMETRY) { 2135 shader->uses_base_instance = shader->previous_stage_sel && 2136 (shader->previous_stage_sel->info.uses_base_instance || 2137 shader->key.part.gs.vs_prolog.instance_divisor_is_one || 2138 shader->key.part.gs.vs_prolog.instance_divisor_is_fetched); 2139 } 2140 2141 si_fix_resource_usage(sscreen, shader); 2142 si_shader_dump(sscreen, shader, debug, stderr, true); 2143 2144 /* Upload. 
*/ 2145 if (!si_shader_binary_upload(sscreen, shader, 0)) { 2146 fprintf(stderr, "LLVM failed to upload shader\n"); 2147 return false; 2148 } 2149 2150 return true; 2151} 2152 2153void si_shader_binary_clean(struct si_shader_binary *binary) 2154{ 2155 free((void *)binary->elf_buffer); 2156 binary->elf_buffer = NULL; 2157 2158 free(binary->llvm_ir_string); 2159 binary->llvm_ir_string = NULL; 2160 2161 free(binary->uploaded_code); 2162 binary->uploaded_code = NULL; 2163 binary->uploaded_code_size = 0; 2164} 2165 2166void si_shader_destroy(struct si_shader *shader) 2167{ 2168 if (shader->scratch_bo) 2169 si_resource_reference(&shader->scratch_bo, NULL); 2170 2171 si_resource_reference(&shader->bo, NULL); 2172 2173 if (!shader->is_binary_shared) 2174 si_shader_binary_clean(&shader->binary); 2175 2176 free(shader->shader_log); 2177} 2178