1/* 2 * Copyright 2012 Advanced Micro Devices, Inc. 3 * All Rights Reserved. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * on the rights to use, copy, modify, merge, publish, distribute, sub 9 * license, and/or sell copies of the Software, and to permit persons to whom 10 * the Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 22 * USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/

#include "util/u_memory.h"
#include "util/u_string.h"
#include "tgsi/tgsi_build.h"
#include "tgsi/tgsi_util.h"
#include "tgsi/tgsi_dump.h"

#include "ac_exp_param.h"
#include "ac_shader_util.h"
#include "ac_llvm_util.h"
#include "si_shader_internal.h"
#include "si_pipe.h"
#include "sid.h"

#include "compiler/nir/nir.h"

/* Names of the two scratch-resource dword symbols.
 * NOTE(review): presumably looked up in the compiled shader binary; the
 * code that consumes them is outside this part of the file -- confirm.
 */
static const char *scratch_rsrc_dword0_symbol =
	"SCRATCH_RSRC_DWORD0";

static const char *scratch_rsrc_dword1_symbol =
	"SCRATCH_RSRC_DWORD1";

/**
 * One shader output: four channel values plus the semantic it belongs to.
 * NOTE(review): field meanings are taken from their names; the code that
 * fills this structure is outside this part of the file.
 */
struct si_shader_output_values
{
	LLVMValueRef values[4];
	unsigned semantic_name;
	unsigned semantic_index;
	ubyte vertex_stream[4];
};

/**
 * Used to collect types and other info about arguments of the LLVM function
 * before the function is created.
 */
struct si_function_info {
	/* LLVM type of each argument, indexed by argument position. */
	LLVMTypeRef types[100];
	/* Optional per-argument pointer supplied by the caller; may be NULL. */
	LLVMValueRef *assign[100];
	/* Number of leading arguments that were added as SGPRs. */
	unsigned num_sgpr_params;
	/* Total number of arguments added so far. */
	unsigned num_params;
};

/* Register file an argument is passed in. */
enum si_arg_regfile {
	ARG_SGPR,
	ARG_VGPR
};

static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       struct ac_llvm_compiler *compiler);

static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
				 struct lp_build_tgsi_context *bld_base,
				 struct lp_build_emit_data *emit_data);

static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
			       FILE *f);

static void si_build_vs_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
					 union si_shader_part_key *key);
static void si_build_ps_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_build_ps_epilog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_fix_resource_usage(struct si_screen *sscreen,
				  struct si_shader *shader);

/* Ideally pass the sample mask input to the PS epilog as
v14, which 93 * is its usual location, so that the shader doesn't have to add v_mov. 94 */ 95#define PS_EPILOG_SAMPLEMASK_MIN_LOC 14 96 97static bool llvm_type_is_64bit(struct si_shader_context *ctx, 98 LLVMTypeRef type) 99{ 100 if (type == ctx->ac.i64 || type == ctx->ac.f64) 101 return true; 102 103 return false; 104} 105 106static bool is_merged_shader(struct si_shader_context *ctx) 107{ 108 if (ctx->screen->info.chip_class <= VI) 109 return false; 110 111 return ctx->shader->key.as_ls || 112 ctx->shader->key.as_es || 113 ctx->type == PIPE_SHADER_TESS_CTRL || 114 ctx->type == PIPE_SHADER_GEOMETRY; 115} 116 117static void si_init_function_info(struct si_function_info *fninfo) 118{ 119 fninfo->num_params = 0; 120 fninfo->num_sgpr_params = 0; 121} 122 123static unsigned add_arg_assign(struct si_function_info *fninfo, 124 enum si_arg_regfile regfile, LLVMTypeRef type, 125 LLVMValueRef *assign) 126{ 127 assert(regfile != ARG_SGPR || fninfo->num_sgpr_params == fninfo->num_params); 128 129 unsigned idx = fninfo->num_params++; 130 assert(idx < ARRAY_SIZE(fninfo->types)); 131 132 if (regfile == ARG_SGPR) 133 fninfo->num_sgpr_params = fninfo->num_params; 134 135 fninfo->types[idx] = type; 136 fninfo->assign[idx] = assign; 137 return idx; 138} 139 140static unsigned add_arg(struct si_function_info *fninfo, 141 enum si_arg_regfile regfile, LLVMTypeRef type) 142{ 143 return add_arg_assign(fninfo, regfile, type, NULL); 144} 145 146static void add_arg_assign_checked(struct si_function_info *fninfo, 147 enum si_arg_regfile regfile, LLVMTypeRef type, 148 LLVMValueRef *assign, unsigned idx) 149{ 150 MAYBE_UNUSED unsigned actual = add_arg_assign(fninfo, regfile, type, assign); 151 assert(actual == idx); 152} 153 154static void add_arg_checked(struct si_function_info *fninfo, 155 enum si_arg_regfile regfile, LLVMTypeRef type, 156 unsigned idx) 157{ 158 add_arg_assign_checked(fninfo, regfile, type, NULL, idx); 159} 160 161/** 162 * Returns a unique index for a per-patch semantic name 
and index. The index 163 * must be less than 32, so that a 32-bit bitmask of used inputs or outputs 164 * can be calculated. 165 */ 166unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index) 167{ 168 switch (semantic_name) { 169 case TGSI_SEMANTIC_TESSOUTER: 170 return 0; 171 case TGSI_SEMANTIC_TESSINNER: 172 return 1; 173 case TGSI_SEMANTIC_PATCH: 174 assert(index < 30); 175 return 2 + index; 176 177 default: 178 assert(!"invalid semantic name"); 179 return 0; 180 } 181} 182 183/** 184 * Returns a unique index for a semantic name and index. The index must be 185 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be 186 * calculated. 187 */ 188unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index, 189 unsigned is_varying) 190{ 191 switch (semantic_name) { 192 case TGSI_SEMANTIC_POSITION: 193 return 0; 194 case TGSI_SEMANTIC_GENERIC: 195 /* Since some shader stages use the the highest used IO index 196 * to determine the size to allocate for inputs/outputs 197 * (in LDS, tess and GS rings). GENERIC should be placed right 198 * after POSITION to make that size as small as possible. 199 */ 200 if (index < SI_MAX_IO_GENERIC) 201 return 1 + index; 202 203 assert(!"invalid generic index"); 204 return 0; 205 case TGSI_SEMANTIC_PSIZE: 206 return SI_MAX_IO_GENERIC + 1; 207 case TGSI_SEMANTIC_CLIPDIST: 208 assert(index <= 1); 209 return SI_MAX_IO_GENERIC + 2 + index; 210 case TGSI_SEMANTIC_FOG: 211 return SI_MAX_IO_GENERIC + 4; 212 case TGSI_SEMANTIC_LAYER: 213 return SI_MAX_IO_GENERIC + 5; 214 case TGSI_SEMANTIC_VIEWPORT_INDEX: 215 return SI_MAX_IO_GENERIC + 6; 216 case TGSI_SEMANTIC_PRIMID: 217 return SI_MAX_IO_GENERIC + 7; 218 case TGSI_SEMANTIC_COLOR: 219 assert(index < 2); 220 return SI_MAX_IO_GENERIC + 8 + index; 221 case TGSI_SEMANTIC_BCOLOR: 222 assert(index < 2); 223 /* If it's a varying, COLOR and BCOLOR alias. 
*/ 224 if (is_varying) 225 return SI_MAX_IO_GENERIC + 8 + index; 226 else 227 return SI_MAX_IO_GENERIC + 10 + index; 228 case TGSI_SEMANTIC_TEXCOORD: 229 assert(index < 8); 230 STATIC_ASSERT(SI_MAX_IO_GENERIC + 12 + 8 <= 63); 231 return SI_MAX_IO_GENERIC + 12 + index; 232 case TGSI_SEMANTIC_CLIPVERTEX: 233 return 63; 234 default: 235 fprintf(stderr, "invalid semantic name = %u\n", semantic_name); 236 assert(!"invalid semantic name"); 237 return 0; 238 } 239} 240 241/** 242 * Get the value of a shader input parameter and extract a bitfield. 243 */ 244static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx, 245 LLVMValueRef value, unsigned rshift, 246 unsigned bitwidth) 247{ 248 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind) 249 value = ac_to_integer(&ctx->ac, value); 250 251 if (rshift) 252 value = LLVMBuildLShr(ctx->ac.builder, value, 253 LLVMConstInt(ctx->i32, rshift, 0), ""); 254 255 if (rshift + bitwidth < 32) { 256 unsigned mask = (1 << bitwidth) - 1; 257 value = LLVMBuildAnd(ctx->ac.builder, value, 258 LLVMConstInt(ctx->i32, mask, 0), ""); 259 } 260 261 return value; 262} 263 264LLVMValueRef si_unpack_param(struct si_shader_context *ctx, 265 unsigned param, unsigned rshift, 266 unsigned bitwidth) 267{ 268 LLVMValueRef value = LLVMGetParam(ctx->main_fn, param); 269 270 return unpack_llvm_param(ctx, value, rshift, bitwidth); 271} 272 273static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx) 274{ 275 switch (ctx->type) { 276 case PIPE_SHADER_TESS_CTRL: 277 return unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 0, 8); 278 279 case PIPE_SHADER_TESS_EVAL: 280 return LLVMGetParam(ctx->main_fn, 281 ctx->param_tes_rel_patch_id); 282 283 default: 284 assert(0); 285 return NULL; 286 } 287} 288 289/* Tessellation shaders pass outputs to the next shader using LDS. 
*
 * LS outputs = TCS inputs
 * TCS outputs = TES inputs
 *
 * The LDS layout is:
 * - TCS inputs for patch 0
 * - TCS inputs for patch 1
 * - TCS inputs for patch 2             = get_tcs_in_current_patch_offset (if RelPatchID==2)
 * - ...
 * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
 * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
 * - TCS outputs for patch 1
 * - Per-patch TCS outputs for patch 1
 * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
 * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
 * - ...
 *
 * All three shaders VS(LS), TCS, TES share the same LDS space.
 */

/* TCS input patch stride: bits [8..20] of the vs_state_bits parameter.
 * NOTE(review): the unit is not visible here; it is multiplied directly
 * with the relative patch ID in get_tcs_in_current_patch_offset.
 */
static LLVMValueRef
get_tcs_in_patch_stride(struct si_shader_context *ctx)
{
	return si_unpack_param(ctx, ctx->param_vs_state_bits, 8, 13);
}

/* Per-vertex TCS output stride as a compile-time constant: four times the
 * position of the highest set bit in the relevant 64-bit mask. The mask is
 * ff_tcs_inputs_to_copy when that is non-zero, otherwise the selector's
 * outputs_written. Only valid for TCS.
 */
static unsigned get_tcs_out_vertex_dw_stride_constant(struct si_shader_context *ctx)
{
	assert(ctx->type == PIPE_SHADER_TESS_CTRL);

	if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
		return util_last_bit64(ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) * 4;

	return util_last_bit64(ctx->shader->selector->outputs_written) * 4;
}

/* Same as get_tcs_out_vertex_dw_stride_constant, as an i32 LLVM constant. */
static LLVMValueRef get_tcs_out_vertex_dw_stride(struct si_shader_context *ctx)
{
	unsigned stride = get_tcs_out_vertex_dw_stride_constant(ctx);

	return LLVMConstInt(ctx->i32, stride, 0);
}

/* TCS output patch stride. With ff_tcs_inputs_to_copy set it is read from
 * the low 13 bits of the tcs_out_lds_layout parameter; otherwise it is
 * computed at compile time from the number of output vertices, the
 * per-vertex stride and the per-patch outputs (4 dwords each).
 */
static LLVMValueRef get_tcs_out_patch_stride(struct si_shader_context *ctx)
{
	if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
		return si_unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13);

	const struct tgsi_shader_info *info = &ctx->shader->selector->info;
	unsigned tcs_out_vertices = info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
	unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx);
	unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->patch_outputs_written);
	unsigned patch_dw_stride = tcs_out_vertices * vertex_dw_stride +
				   num_patch_outputs * 4;
	return LLVMConstInt(ctx->i32, patch_dw_stride, 0);
}

/* Low 16 bits of the tcs_out_lds_offsets parameter, multiplied by 4. */
static LLVMValueRef
get_tcs_out_patch0_offset(struct si_shader_context *ctx)
{
	return LLVMBuildMul(ctx->ac.builder,
			    si_unpack_param(ctx,
					    ctx->param_tcs_out_lds_offsets,
					    0, 16),
			    LLVMConstInt(ctx->i32, 4, 0), "");
}

/* High 16 bits of the tcs_out_lds_offsets parameter, multiplied by 4. */
static LLVMValueRef
get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
{
	return LLVMBuildMul(ctx->ac.builder,
			    si_unpack_param(ctx,
					    ctx->param_tcs_out_lds_offsets,
					    16, 16),
			    LLVMConstInt(ctx->i32, 4, 0), "");
}

/* Offset of the current patch's TCS inputs: patch stride * RelPatchID. */
static LLVMValueRef
get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
{
	LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);

	return LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, "");
}

/* Offset of the current patch's TCS outputs:
 * patch0 offset + patch stride * RelPatchID.
 */
static LLVMValueRef
get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
{
	LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
	LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);

	return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_offset);
}

/* Offset of the current patch's per-patch TCS outputs:
 * patch0 per-patch data offset + patch stride * RelPatchID.
 */
static LLVMValueRef
get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
{
	LLVMValueRef patch0_patch_data_offset =
		get_tcs_out_patch0_patch_data_offset(ctx);
	LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);

	return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_patch_data_offset);
}

/* Number of TCS output vertices: a constant when a TCS with a selector
 * declares it, otherwise bits [6..11] of the tcs_offchip_layout parameter.
 */
static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx)
{
	unsigned tcs_out_vertices =
		ctx->shader->selector ?
		ctx->shader->selector->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] : 0;

	/* If !tcs_out_vertices, it's either the fixed-func TCS or the TCS epilog. */
	if (ctx->type == PIPE_SHADER_TESS_CTRL && tcs_out_vertices)
		return LLVMConstInt(ctx->i32, tcs_out_vertices, 0);

	return si_unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
}

/* Per-vertex stride of the TCS inputs.
 *
 * For a VS it is the selector's lshs_vertex_stride / 4. For a TCS it is
 * the LS part's lshs_vertex_stride / 4 when the shader is monolithic on
 * GFX9+, otherwise bits [24..31] of the vs_state_bits parameter. Other
 * shader types are not supported (asserts, returns NULL).
 */
static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
{
	unsigned stride;

	switch (ctx->type) {
	case PIPE_SHADER_VERTEX:
		stride = ctx->shader->selector->lshs_vertex_stride / 4;
		return LLVMConstInt(ctx->i32, stride, 0);

	case PIPE_SHADER_TESS_CTRL:
		if (ctx->screen->info.chip_class >= GFX9 &&
		    ctx->shader->is_monolithic) {
			stride = ctx->shader->key.part.tcs.ls->lshs_vertex_stride / 4;
			return LLVMConstInt(ctx->i32, stride, 0);
		}
		return si_unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);

	default:
		assert(0);
		return NULL;
	}
}

/* Bitcast <4 x float> to <2 x double>, extract the component, and convert
 * to float.
*/
static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
					    LLVMValueRef vec4,
					    unsigned double_index)
{
	LLVMBuilderRef builder = ctx->ac.builder;
	LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->ac.context);
	LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
					      LLVMVectorType(f64, 2), "");
	LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0);
	LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, "");
	return LLVMBuildFPTrunc(builder, value, ctx->f32, "");
}

/* Extract one signed 16-bit half of a 32-bit value as a sign-extended i32.
 * index 0 selects the low half (truncate + sign-extend), index 1 the high
 * half (arithmetic shift right by 16).
 */
static LLVMValueRef unpack_sint16(struct si_shader_context *ctx,
				 LLVMValueRef i32, unsigned index)
{
	assert(index <= 1);

	if (index == 1)
		return LLVMBuildAShr(ctx->ac.builder, i32,
				     LLVMConstInt(ctx->i32, 16, 0), "");

	return LLVMBuildSExt(ctx->ac.builder,
			     LLVMBuildTrunc(ctx->ac.builder, i32,
					    ctx->ac.i16, ""),
			     ctx->i32, "");
}

/* Load vertex shader input number input_index into out[0..3].
 *
 * If the shader has the VS_BLIT_SGPRS property, the values come straight
 * from the function parameters starting at param_vs_blit_inputs (input 0
 * is the position, input 1 a color or texture coordinate). Otherwise the
 * input is fetched with ac_build_buffer_load_format through a descriptor
 * taken from the vertex buffer list, and then adjusted according to
 * key.mono.vs_fix_fetch[input_index].
 */
void si_llvm_load_input_vs(
	struct si_shader_context *ctx,
	unsigned input_index,
	LLVMValueRef out[4])
{
	const struct tgsi_shader_info *info = &ctx->shader->selector->info;
	unsigned vs_blit_property = info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS];

	if (vs_blit_property) {
		LLVMValueRef vertex_id = ctx->abi.vertex_id;
		LLVMValueRef sel_x1 = LLVMBuildICmp(ctx->ac.builder,
						    LLVMIntULE, vertex_id,
						    ctx->i32_1, "");
		/* Use LLVMIntNE, because we have 3 vertices and only
		 * the middle one should use y2.
		 */
		LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder,
						    LLVMIntNE, vertex_id,
						    ctx->i32_1, "");

		if (input_index == 0) {
			/* Position: */
			LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn,
							 ctx->param_vs_blit_inputs);
			LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn,
							 ctx->param_vs_blit_inputs + 1);

			/* Each parameter packs two signed 16-bit coordinates. */
			LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0);
			LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1);
			LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0);
			LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1);

			LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1,
							 x1, x2, "");
			LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1,
							 y1, y2, "");

			out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->f32, "");
			out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->f32, "");
			out[2] = LLVMGetParam(ctx->main_fn,
					      ctx->param_vs_blit_inputs + 2);
			out[3] = ctx->ac.f32_1;
			return;
		}

		/* Color or texture coordinates: */
		assert(input_index == 1);

		if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
			for (int i = 0; i < 4; i++) {
				out[i] = LLVMGetParam(ctx->main_fn,
						      ctx->param_vs_blit_inputs + 3 + i);
			}
		} else {
			assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD);
			LLVMValueRef x1 = LLVMGetParam(ctx->main_fn,
						       ctx->param_vs_blit_inputs + 3);
			LLVMValueRef y1 = LLVMGetParam(ctx->main_fn,
						       ctx->param_vs_blit_inputs + 4);
			LLVMValueRef x2 = LLVMGetParam(ctx->main_fn,
						       ctx->param_vs_blit_inputs + 5);
			LLVMValueRef y2 = LLVMGetParam(ctx->main_fn,
						       ctx->param_vs_blit_inputs + 6);

			out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1,
						 x1, x2, "");
			out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1,
						 y1, y2, "");
			out[2] = LLVMGetParam(ctx->main_fn,
					      ctx->param_vs_blit_inputs + 7);
			out[3] = LLVMGetParam(ctx->main_fn,
					      ctx->param_vs_blit_inputs + 8);
		}
		return;
	}

	unsigned chan;
	unsigned fix_fetch;
	unsigned num_fetches;
	unsigned fetch_stride;
	unsigned num_channels;

	LLVMValueRef t_list_ptr;
	LLVMValueRef t_offset;
	LLVMValueRef t_list;
	LLVMValueRef vertex_index;
	/* At most 3 fetches are issued below (num_fetches <= 3). */
	LLVMValueRef input[3];

	/* Load the T list */
	t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);

	t_offset = LLVMConstInt(ctx->i32, input_index, 0);

	t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);

	vertex_index = LLVMGetParam(ctx->main_fn,
				    ctx->param_vertex_index0 +
				    input_index);

	fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index];

	/* Do multiple loads for special formats. */
	switch (fix_fetch) {
	case SI_FIX_FETCH_RG_64_FLOAT:
		num_fetches = 1; /* 1 2-dword or 4-dword load */
		fetch_stride = 0;
		if (util_last_bit(info->input_usage_mask[input_index]) >= 2)
			num_channels = 4; /* 2 doubles in 4 dwords */
		else
			num_channels = 2; /* 1 double in 2 dwords */
		break;
	case SI_FIX_FETCH_RGB_64_FLOAT:
		num_fetches = 3; /* 3 2-dword loads */
		fetch_stride = 8;
		num_channels = 2;
		break;
	case SI_FIX_FETCH_RGBA_64_FLOAT:
		num_fetches = 2; /* 2 4-dword loads */
		fetch_stride = 16;
		num_channels = 4;
		break;
	case SI_FIX_FETCH_RGB_8:
	case SI_FIX_FETCH_RGB_8_INT:
		num_fetches = 3;
		fetch_stride = 1;
		num_channels = 1;
		break;
	case SI_FIX_FETCH_RGB_16:
	case SI_FIX_FETCH_RGB_16_INT:
		num_fetches = 3;
		fetch_stride = 2;
		num_channels = 1;
		break;
	default:
		num_fetches = 1;
		fetch_stride = 0;
		num_channels = util_last_bit(info->input_usage_mask[input_index]);
	}

	for (unsigned i = 0; i < num_fetches; i++) {
		LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);

		input[i] = ac_build_buffer_load_format(&ctx->ac, t_list,
						       vertex_index, voffset,
						       num_channels, false, true);
		input[i] = ac_build_expand_to_vec4(&ctx->ac, input[i], num_channels);
	}

	/* Break up the vec4 into individual components */
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
		out[chan] = LLVMBuildExtractElement(ctx->ac.builder,
						    input[0], llvm_chan, "");
	}

	/* Format-specific fixups; formats not listed keep the raw fetch. */
	switch (fix_fetch) {
	case SI_FIX_FETCH_A2_SNORM:
	case SI_FIX_FETCH_A2_SSCALED:
	case SI_FIX_FETCH_A2_SINT: {
		/* The hardware returns an unsigned value; convert it to a
		 * signed one.
		 */
		LLVMValueRef tmp = out[3];
		LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);

		/* First, recover the sign-extended signed integer value. */
		if (fix_fetch == SI_FIX_FETCH_A2_SSCALED)
			tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->i32, "");
		else
			tmp = ac_to_integer(&ctx->ac, tmp);

		/* For the integer-like cases, do a natural sign extension.
		 *
		 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
		 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
		 * exponent.
		 */
		tmp = LLVMBuildShl(ctx->ac.builder, tmp,
				   fix_fetch == SI_FIX_FETCH_A2_SNORM ?
				   LLVMConstInt(ctx->i32, 7, 0) : c30, "");
		tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");

		/* Convert back to the right type. */
		if (fix_fetch == SI_FIX_FETCH_A2_SNORM) {
			LLVMValueRef clamp;
			LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
			tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, "");
			/* Clamp to -1.0 from below. */
			clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, "");
			tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, "");
		} else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) {
			tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, "");
		}

		out[3] = tmp;
		break;
	}
	case SI_FIX_FETCH_RGBA_32_UNORM:
	case SI_FIX_FETCH_RGBX_32_UNORM:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = ac_to_integer(&ctx->ac, out[chan]);
			out[chan] = LLVMBuildUIToFP(ctx->ac.builder,
						    out[chan], ctx->f32, "");
			out[chan] = LLVMBuildFMul(ctx->ac.builder, out[chan],
						  LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), "");
		}
		/* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
		if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM)
			out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGBA_32_SNORM:
	case SI_FIX_FETCH_RGBX_32_SNORM:
	case SI_FIX_FETCH_RGBA_32_FIXED:
	case SI_FIX_FETCH_RGBX_32_FIXED: {
		double scale;
		/* NOTE(review): this comparison relies on the FIXED values
		 * being numbered after the SNORM ones -- confirm in the
		 * SI_FIX_FETCH_* definition.
		 */
		if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED)
			scale = 1.0 / 0x10000;
		else
			scale = 1.0 / INT_MAX;

		for (chan = 0; chan < 4; chan++) {
			out[chan] = ac_to_integer(&ctx->ac, out[chan]);
			out[chan] = LLVMBuildSIToFP(ctx->ac.builder,
						    out[chan], ctx->f32, "");
			out[chan] = LLVMBuildFMul(ctx->ac.builder, out[chan],
						  LLVMConstReal(ctx->f32, scale), "");
		}
		/* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
		if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM ||
		    fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED)
			out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	}
	case SI_FIX_FETCH_RGBA_32_USCALED:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = ac_to_integer(&ctx->ac, out[chan]);
			out[chan] = LLVMBuildUIToFP(ctx->ac.builder,
						    out[chan], ctx->f32, "");
		}
		break;
	case SI_FIX_FETCH_RGBA_32_SSCALED:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = ac_to_integer(&ctx->ac, out[chan]);
			out[chan] = LLVMBuildSIToFP(ctx->ac.builder,
						    out[chan], ctx->f32, "");
		}
		break;
	case SI_FIX_FETCH_RG_64_FLOAT:
		/* Both doubles come from the single fetch. */
		for (chan = 0; chan < 2; chan++)
			out[chan] = extract_double_to_float(ctx, input[0], chan);

		out[2] = LLVMConstReal(ctx->f32, 0);
		out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGB_64_FLOAT:
		/* One double per fetch. */
		for (chan = 0; chan < 3; chan++)
			out[chan] = extract_double_to_float(ctx, input[chan], 0);

		out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGBA_64_FLOAT:
		/* Two doubles per fetch. */
		for (chan = 0; chan < 4; chan++) {
			out[chan] = extract_double_to_float(ctx, input[chan / 2],
							    chan % 2);
		}
		break;
	case SI_FIX_FETCH_RGB_8:
	case SI_FIX_FETCH_RGB_8_INT:
	case SI_FIX_FETCH_RGB_16:
	case SI_FIX_FETCH_RGB_16_INT:
		/* One channel per fetch; take element 0 of each. */
		for (chan = 0; chan < 3; chan++) {
			out[chan] = LLVMBuildExtractElement(ctx->ac.builder,
							    input[chan],
							    ctx->i32_0, "");
		}
		/* Alpha is 1.0 for the non-INT formats and integer 1 for
		 * the INT formats.
		 */
		if (fix_fetch == SI_FIX_FETCH_RGB_8 ||
		    fix_fetch == SI_FIX_FETCH_RGB_16) {
			out[3] = LLVMConstReal(ctx->f32, 1);
		} else {
			out[3] = ac_to_float(&ctx->ac, ctx->i32_1);
		}
		break;
	}
}

/* Thin wrapper around si_llvm_load_input_vs; decl is not used. */
static void declare_input_vs(
	struct si_shader_context *ctx,
	unsigned input_index,
	const struct tgsi_full_declaration *decl,
	LLVMValueRef out[4])
{
	si_llvm_load_input_vs(ctx, input_index, out);
}

/* Return the primitive ID for the current shader type. Only swizzle 0
 * carries the ID; any other swizzle yields the constant 0.
 */
static LLVMValueRef get_primitive_id(struct si_shader_context *ctx,
				     unsigned swizzle)
{
	if (swizzle > 0)
		return ctx->i32_0;

	switch (ctx->type) {
	case PIPE_SHADER_VERTEX:
		return LLVMGetParam(ctx->main_fn,
				    ctx->param_vs_prim_id);
	case PIPE_SHADER_TESS_CTRL:
		return ctx->abi.tcs_patch_id;
	case PIPE_SHADER_TESS_EVAL:
		return ctx->abi.tes_patch_id;
	case PIPE_SHADER_GEOMETRY:
		return ctx->abi.gs_prim_id;
	default:
		assert(0);
		return ctx->i32_0;
	}
}

/**
 * Return the value of tgsi_ind_register for indexing.
 * This is the indirect index with the constant offset added to it.
 *
 * The result is index * addr_mul + rel_index.
 */
LLVMValueRef si_get_indirect_index(struct si_shader_context *ctx,
				   const struct tgsi_ind_register *ind,
				   unsigned addr_mul,
				   int rel_index)
{
	LLVMValueRef result;

	if (ind->File == TGSI_FILE_ADDRESS) {
		/* Address registers are loaded from ctx->addrs. */
		result = ctx->addrs[ind->Index][ind->Swizzle];
		result = LLVMBuildLoad(ctx->ac.builder, result, "");
	} else {
		/* Any other file goes through that file's fetch callback. */
		struct tgsi_full_src_register src = {};

		src.Register.File = ind->File;
		src.Register.Index = ind->Index;

		/* Set the second index to 0 for constants. */
		if (ind->File == TGSI_FILE_CONSTANT)
			src.Register.Dimension = 1;

		result = ctx->bld_base.emit_fetch_funcs[ind->File](&ctx->bld_base, &src,
								   TGSI_TYPE_SIGNED,
								   ind->Swizzle);
		result = ac_to_integer(&ctx->ac, result);
	}

	return ac_build_imad(&ctx->ac, result, LLVMConstInt(ctx->i32, addr_mul, 0),
			     LLVMConstInt(ctx->i32, rel_index, 0));
}

/**
 * Like si_get_indirect_index, but restricts the return value to a (possibly
 * undefined) value inside [0..num).
815 */ 816LLVMValueRef si_get_bounded_indirect_index(struct si_shader_context *ctx, 817 const struct tgsi_ind_register *ind, 818 int rel_index, unsigned num) 819{ 820 LLVMValueRef result = si_get_indirect_index(ctx, ind, 1, rel_index); 821 822 return si_llvm_bound_index(ctx, result, num); 823} 824 825static LLVMValueRef get_dw_address_from_generic_indices(struct si_shader_context *ctx, 826 LLVMValueRef vertex_dw_stride, 827 LLVMValueRef base_addr, 828 LLVMValueRef vertex_index, 829 LLVMValueRef param_index, 830 unsigned input_index, 831 ubyte *name, 832 ubyte *index, 833 bool is_patch) 834{ 835 if (vertex_dw_stride) { 836 base_addr = ac_build_imad(&ctx->ac, vertex_index, 837 vertex_dw_stride, base_addr); 838 } 839 840 if (param_index) { 841 base_addr = ac_build_imad(&ctx->ac, param_index, 842 LLVMConstInt(ctx->i32, 4, 0), base_addr); 843 } 844 845 int param = is_patch ? 846 si_shader_io_get_unique_index_patch(name[input_index], 847 index[input_index]) : 848 si_shader_io_get_unique_index(name[input_index], 849 index[input_index], false); 850 851 /* Add the base address of the element. */ 852 return LLVMBuildAdd(ctx->ac.builder, base_addr, 853 LLVMConstInt(ctx->i32, param * 4, 0), ""); 854} 855 856/** 857 * Calculate a dword address given an input or output register and a stride. 858 */ 859static LLVMValueRef get_dw_address(struct si_shader_context *ctx, 860 const struct tgsi_full_dst_register *dst, 861 const struct tgsi_full_src_register *src, 862 LLVMValueRef vertex_dw_stride, 863 LLVMValueRef base_addr) 864{ 865 struct tgsi_shader_info *info = &ctx->shader->selector->info; 866 ubyte *name, *index, *array_first; 867 int input_index; 868 struct tgsi_full_dst_register reg; 869 LLVMValueRef vertex_index = NULL; 870 LLVMValueRef ind_index = NULL; 871 872 /* Set the register description. The address computation is the same 873 * for sources and destinations. 
*/ 874 if (src) { 875 reg.Register.File = src->Register.File; 876 reg.Register.Index = src->Register.Index; 877 reg.Register.Indirect = src->Register.Indirect; 878 reg.Register.Dimension = src->Register.Dimension; 879 reg.Indirect = src->Indirect; 880 reg.Dimension = src->Dimension; 881 reg.DimIndirect = src->DimIndirect; 882 } else 883 reg = *dst; 884 885 /* If the register is 2-dimensional (e.g. an array of vertices 886 * in a primitive), calculate the base address of the vertex. */ 887 if (reg.Register.Dimension) { 888 if (reg.Dimension.Indirect) 889 vertex_index = si_get_indirect_index(ctx, ®.DimIndirect, 890 1, reg.Dimension.Index); 891 else 892 vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0); 893 } 894 895 /* Get information about the register. */ 896 if (reg.Register.File == TGSI_FILE_INPUT) { 897 name = info->input_semantic_name; 898 index = info->input_semantic_index; 899 array_first = info->input_array_first; 900 } else if (reg.Register.File == TGSI_FILE_OUTPUT) { 901 name = info->output_semantic_name; 902 index = info->output_semantic_index; 903 array_first = info->output_array_first; 904 } else { 905 assert(0); 906 return NULL; 907 } 908 909 if (reg.Register.Indirect) { 910 /* Add the relative address of the element. */ 911 if (reg.Indirect.ArrayID) 912 input_index = array_first[reg.Indirect.ArrayID]; 913 else 914 input_index = reg.Register.Index; 915 916 ind_index = si_get_indirect_index(ctx, ®.Indirect, 917 1, reg.Register.Index - input_index); 918 } else { 919 input_index = reg.Register.Index; 920 } 921 922 return get_dw_address_from_generic_indices(ctx, vertex_dw_stride, 923 base_addr, vertex_index, 924 ind_index, input_index, 925 name, index, 926 !reg.Register.Dimension); 927} 928 929/* The offchip buffer layout for TCS->TES is 930 * 931 * - attribute 0 of patch 0 vertex 0 932 * - attribute 0 of patch 0 vertex 1 933 * - attribute 0 of patch 0 vertex 2 934 * ... 
935 * - attribute 0 of patch 1 vertex 0 936 * - attribute 0 of patch 1 vertex 1 937 * ... 938 * - attribute 1 of patch 0 vertex 0 939 * - attribute 1 of patch 0 vertex 1 940 * ... 941 * - per patch attribute 0 of patch 0 942 * - per patch attribute 0 of patch 1 943 * ... 944 * 945 * Note that every attribute has 4 components. 946 */ 947static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx, 948 LLVMValueRef rel_patch_id, 949 LLVMValueRef vertex_index, 950 LLVMValueRef param_index) 951{ 952 LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices; 953 LLVMValueRef param_stride, constant16; 954 955 vertices_per_patch = get_num_tcs_out_vertices(ctx); 956 num_patches = si_unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 6); 957 total_vertices = LLVMBuildMul(ctx->ac.builder, vertices_per_patch, 958 num_patches, ""); 959 960 constant16 = LLVMConstInt(ctx->i32, 16, 0); 961 if (vertex_index) { 962 base_addr = ac_build_imad(&ctx->ac, rel_patch_id, 963 vertices_per_patch, vertex_index); 964 param_stride = total_vertices; 965 } else { 966 base_addr = rel_patch_id; 967 param_stride = num_patches; 968 } 969 970 base_addr = ac_build_imad(&ctx->ac, param_index, param_stride, base_addr); 971 base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, ""); 972 973 if (!vertex_index) { 974 LLVMValueRef patch_data_offset = 975 si_unpack_param(ctx, ctx->param_tcs_offchip_layout, 12, 20); 976 977 base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, 978 patch_data_offset, ""); 979 } 980 return base_addr; 981} 982 983/* This is a generic helper that can be shared by the NIR and TGSI backends */ 984static LLVMValueRef get_tcs_tes_buffer_address_from_generic_indices( 985 struct si_shader_context *ctx, 986 LLVMValueRef vertex_index, 987 LLVMValueRef param_index, 988 unsigned param_base, 989 ubyte *name, 990 ubyte *index, 991 bool is_patch) 992{ 993 unsigned param_index_base; 994 995 param_index_base = is_patch ? 
996 si_shader_io_get_unique_index_patch(name[param_base], index[param_base]) : 997 si_shader_io_get_unique_index(name[param_base], index[param_base], false); 998 999 if (param_index) { 1000 param_index = LLVMBuildAdd(ctx->ac.builder, param_index, 1001 LLVMConstInt(ctx->i32, param_index_base, 0), 1002 ""); 1003 } else { 1004 param_index = LLVMConstInt(ctx->i32, param_index_base, 0); 1005 } 1006 1007 return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), 1008 vertex_index, param_index); 1009} 1010 1011static LLVMValueRef get_tcs_tes_buffer_address_from_reg( 1012 struct si_shader_context *ctx, 1013 const struct tgsi_full_dst_register *dst, 1014 const struct tgsi_full_src_register *src) 1015{ 1016 struct tgsi_shader_info *info = &ctx->shader->selector->info; 1017 ubyte *name, *index, *array_first; 1018 struct tgsi_full_src_register reg; 1019 LLVMValueRef vertex_index = NULL; 1020 LLVMValueRef param_index = NULL; 1021 unsigned param_base; 1022 1023 reg = src ? *src : tgsi_full_src_register_from_dst(dst); 1024 1025 if (reg.Register.Dimension) { 1026 1027 if (reg.Dimension.Indirect) 1028 vertex_index = si_get_indirect_index(ctx, ®.DimIndirect, 1029 1, reg.Dimension.Index); 1030 else 1031 vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0); 1032 } 1033 1034 /* Get information about the register. 
*/ 1035 if (reg.Register.File == TGSI_FILE_INPUT) { 1036 name = info->input_semantic_name; 1037 index = info->input_semantic_index; 1038 array_first = info->input_array_first; 1039 } else if (reg.Register.File == TGSI_FILE_OUTPUT) { 1040 name = info->output_semantic_name; 1041 index = info->output_semantic_index; 1042 array_first = info->output_array_first; 1043 } else { 1044 assert(0); 1045 return NULL; 1046 } 1047 1048 if (reg.Register.Indirect) { 1049 if (reg.Indirect.ArrayID) 1050 param_base = array_first[reg.Indirect.ArrayID]; 1051 else 1052 param_base = reg.Register.Index; 1053 1054 param_index = si_get_indirect_index(ctx, ®.Indirect, 1055 1, reg.Register.Index - param_base); 1056 1057 } else { 1058 param_base = reg.Register.Index; 1059 } 1060 1061 return get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, 1062 param_index, param_base, 1063 name, index, !reg.Register.Dimension); 1064} 1065 1066static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base, 1067 LLVMTypeRef type, unsigned swizzle, 1068 LLVMValueRef buffer, LLVMValueRef offset, 1069 LLVMValueRef base, bool can_speculate) 1070{ 1071 struct si_shader_context *ctx = si_shader_context(bld_base); 1072 LLVMValueRef value, value2; 1073 LLVMTypeRef vec_type = LLVMVectorType(type, 4); 1074 1075 if (swizzle == ~0) { 1076 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, 1077 0, 1, 0, can_speculate, false); 1078 1079 return LLVMBuildBitCast(ctx->ac.builder, value, vec_type, ""); 1080 } 1081 1082 if (!llvm_type_is_64bit(ctx, type)) { 1083 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, 1084 0, 1, 0, can_speculate, false); 1085 1086 value = LLVMBuildBitCast(ctx->ac.builder, value, vec_type, ""); 1087 return LLVMBuildExtractElement(ctx->ac.builder, value, 1088 LLVMConstInt(ctx->i32, swizzle, 0), ""); 1089 } 1090 1091 value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, 1092 swizzle * 4, 1, 0, can_speculate, false); 1093 1094 
value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, 1095 swizzle * 4 + 4, 1, 0, can_speculate, false); 1096 1097 return si_llvm_emit_fetch_64bit(bld_base, type, value, value2); 1098} 1099 1100/** 1101 * Load from LDS. 1102 * 1103 * \param type output value type 1104 * \param swizzle offset (typically 0..3); it can be ~0, which loads a vec4 1105 * \param dw_addr address in dwords 1106 */ 1107static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base, 1108 LLVMTypeRef type, unsigned swizzle, 1109 LLVMValueRef dw_addr) 1110{ 1111 struct si_shader_context *ctx = si_shader_context(bld_base); 1112 LLVMValueRef value; 1113 1114 if (swizzle == ~0) { 1115 LLVMValueRef values[TGSI_NUM_CHANNELS]; 1116 1117 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++) 1118 values[chan] = lds_load(bld_base, type, chan, dw_addr); 1119 1120 return ac_build_gather_values(&ctx->ac, values, 1121 TGSI_NUM_CHANNELS); 1122 } 1123 1124 /* Split 64-bit loads. */ 1125 if (llvm_type_is_64bit(ctx, type)) { 1126 LLVMValueRef lo, hi; 1127 1128 lo = lds_load(bld_base, ctx->i32, swizzle, dw_addr); 1129 hi = lds_load(bld_base, ctx->i32, swizzle + 1, dw_addr); 1130 return si_llvm_emit_fetch_64bit(bld_base, type, lo, hi); 1131 } 1132 1133 dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, 1134 LLVMConstInt(ctx->i32, swizzle, 0), ""); 1135 1136 value = ac_lds_load(&ctx->ac, dw_addr); 1137 1138 return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); 1139} 1140 1141/** 1142 * Store to LDS. 
1143 * 1144 * \param swizzle offset (typically 0..3) 1145 * \param dw_addr address in dwords 1146 * \param value value to store 1147 */ 1148static void lds_store(struct si_shader_context *ctx, 1149 unsigned dw_offset_imm, LLVMValueRef dw_addr, 1150 LLVMValueRef value) 1151{ 1152 dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, 1153 LLVMConstInt(ctx->i32, dw_offset_imm, 0), ""); 1154 1155 ac_lds_store(&ctx->ac, dw_addr, value); 1156} 1157 1158enum si_tess_ring { 1159 TCS_FACTOR_RING, 1160 TESS_OFFCHIP_RING_TCS, 1161 TESS_OFFCHIP_RING_TES, 1162}; 1163 1164static LLVMValueRef get_tess_ring_descriptor(struct si_shader_context *ctx, 1165 enum si_tess_ring ring) 1166{ 1167 LLVMBuilderRef builder = ctx->ac.builder; 1168 unsigned param = ring == TESS_OFFCHIP_RING_TES ? ctx->param_tes_offchip_addr : 1169 ctx->param_tcs_out_lds_layout; 1170 LLVMValueRef addr = LLVMGetParam(ctx->main_fn, param); 1171 1172 /* TCS only receives high 13 bits of the address. */ 1173 if (ring == TESS_OFFCHIP_RING_TCS || ring == TCS_FACTOR_RING) { 1174 addr = LLVMBuildAnd(builder, addr, 1175 LLVMConstInt(ctx->i32, 0xfff80000, 0), ""); 1176 } 1177 1178 if (ring == TCS_FACTOR_RING) { 1179 unsigned tf_offset = ctx->screen->tess_offchip_ring_size; 1180 addr = LLVMBuildAdd(builder, addr, 1181 LLVMConstInt(ctx->i32, tf_offset, 0), ""); 1182 } 1183 1184 LLVMValueRef desc[4]; 1185 desc[0] = addr; 1186 desc[1] = LLVMConstInt(ctx->i32, 1187 S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0); 1188 desc[2] = LLVMConstInt(ctx->i32, 0xffffffff, 0); 1189 desc[3] = LLVMConstInt(ctx->i32, 1190 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | 1191 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 1192 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | 1193 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | 1194 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | 1195 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0); 1196 1197 return ac_build_gather_values(&ctx->ac, desc, 4); 1198} 1199 1200static LLVMValueRef fetch_input_tcs( 1201 struct 
lp_build_tgsi_context *bld_base, 1202 const struct tgsi_full_src_register *reg, 1203 enum tgsi_opcode_type type, unsigned swizzle_in) 1204{ 1205 struct si_shader_context *ctx = si_shader_context(bld_base); 1206 LLVMValueRef dw_addr, stride; 1207 unsigned swizzle = swizzle_in & 0xffff; 1208 stride = get_tcs_in_vertex_dw_stride(ctx); 1209 dw_addr = get_tcs_in_current_patch_offset(ctx); 1210 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr); 1211 1212 return lds_load(bld_base, tgsi2llvmtype(bld_base, type), swizzle, dw_addr); 1213} 1214 1215static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi, 1216 LLVMTypeRef type, 1217 LLVMValueRef vertex_index, 1218 LLVMValueRef param_index, 1219 unsigned const_index, 1220 unsigned location, 1221 unsigned driver_location, 1222 unsigned component, 1223 unsigned num_components, 1224 bool is_patch, 1225 bool is_compact, 1226 bool load_input) 1227{ 1228 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 1229 struct tgsi_shader_info *info = &ctx->shader->selector->info; 1230 struct lp_build_tgsi_context *bld_base = &ctx->bld_base; 1231 LLVMValueRef dw_addr, stride; 1232 1233 driver_location = driver_location / 4; 1234 1235 if (load_input) { 1236 stride = get_tcs_in_vertex_dw_stride(ctx); 1237 dw_addr = get_tcs_in_current_patch_offset(ctx); 1238 } else { 1239 if (is_patch) { 1240 stride = NULL; 1241 dw_addr = get_tcs_out_current_patch_data_offset(ctx); 1242 } else { 1243 stride = get_tcs_out_vertex_dw_stride(ctx); 1244 dw_addr = get_tcs_out_current_patch_offset(ctx); 1245 } 1246 } 1247 1248 if (param_index) { 1249 /* Add the constant index to the indirect index */ 1250 param_index = LLVMBuildAdd(ctx->ac.builder, param_index, 1251 LLVMConstInt(ctx->i32, const_index, 0), ""); 1252 } else { 1253 param_index = LLVMConstInt(ctx->i32, const_index, 0); 1254 } 1255 1256 ubyte *names; 1257 ubyte *indices; 1258 if (load_input) { 1259 names = info->input_semantic_name; 1260 indices = 
info->input_semantic_index; 1261 } else { 1262 names = info->output_semantic_name; 1263 indices = info->output_semantic_index; 1264 } 1265 1266 dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, 1267 vertex_index, param_index, 1268 driver_location, 1269 names, indices, 1270 is_patch); 1271 1272 LLVMValueRef value[4]; 1273 for (unsigned i = 0; i < num_components; i++) { 1274 unsigned offset = i; 1275 if (llvm_type_is_64bit(ctx, type)) 1276 offset *= 2; 1277 1278 offset += component; 1279 value[i + component] = lds_load(bld_base, type, offset, dw_addr); 1280 } 1281 1282 return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); 1283} 1284 1285static LLVMValueRef fetch_output_tcs( 1286 struct lp_build_tgsi_context *bld_base, 1287 const struct tgsi_full_src_register *reg, 1288 enum tgsi_opcode_type type, unsigned swizzle_in) 1289{ 1290 struct si_shader_context *ctx = si_shader_context(bld_base); 1291 LLVMValueRef dw_addr, stride; 1292 unsigned swizzle = (swizzle_in & 0xffff); 1293 1294 if (reg->Register.Dimension) { 1295 stride = get_tcs_out_vertex_dw_stride(ctx); 1296 dw_addr = get_tcs_out_current_patch_offset(ctx); 1297 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr); 1298 } else { 1299 dw_addr = get_tcs_out_current_patch_data_offset(ctx); 1300 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr); 1301 } 1302 1303 return lds_load(bld_base, tgsi2llvmtype(bld_base, type), swizzle, dw_addr); 1304} 1305 1306static LLVMValueRef fetch_input_tes( 1307 struct lp_build_tgsi_context *bld_base, 1308 const struct tgsi_full_src_register *reg, 1309 enum tgsi_opcode_type type, unsigned swizzle_in) 1310{ 1311 struct si_shader_context *ctx = si_shader_context(bld_base); 1312 LLVMValueRef base, addr; 1313 unsigned swizzle = (swizzle_in & 0xffff); 1314 1315 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); 1316 addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg); 1317 1318 return buffer_load(bld_base, 
tgsi2llvmtype(bld_base, type), swizzle, 1319 ctx->tess_offchip_ring, base, addr, true); 1320} 1321 1322LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi, 1323 LLVMTypeRef type, 1324 LLVMValueRef vertex_index, 1325 LLVMValueRef param_index, 1326 unsigned const_index, 1327 unsigned location, 1328 unsigned driver_location, 1329 unsigned component, 1330 unsigned num_components, 1331 bool is_patch, 1332 bool is_compact, 1333 bool load_input) 1334{ 1335 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 1336 struct tgsi_shader_info *info = &ctx->shader->selector->info; 1337 LLVMValueRef base, addr; 1338 1339 driver_location = driver_location / 4; 1340 1341 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); 1342 1343 if (param_index) { 1344 /* Add the constant index to the indirect index */ 1345 param_index = LLVMBuildAdd(ctx->ac.builder, param_index, 1346 LLVMConstInt(ctx->i32, const_index, 0), ""); 1347 } else { 1348 param_index = LLVMConstInt(ctx->i32, const_index, 0); 1349 } 1350 1351 addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, 1352 param_index, driver_location, 1353 info->input_semantic_name, 1354 info->input_semantic_index, 1355 is_patch); 1356 1357 /* TODO: This will generate rather ordinary llvm code, although it 1358 * should be easy for the optimiser to fix up. In future we might want 1359 * to refactor buffer_load(), but for now this maximises code sharing 1360 * between the NIR and TGSI backends. 
1361 */ 1362 LLVMValueRef value[4]; 1363 for (unsigned i = 0; i < num_components; i++) { 1364 unsigned offset = i; 1365 if (llvm_type_is_64bit(ctx, type)) 1366 offset *= 2; 1367 1368 offset += component; 1369 value[i + component] = buffer_load(&ctx->bld_base, type, offset, 1370 ctx->tess_offchip_ring, base, addr, true); 1371 } 1372 1373 return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); 1374} 1375 1376static void store_output_tcs(struct lp_build_tgsi_context *bld_base, 1377 const struct tgsi_full_instruction *inst, 1378 const struct tgsi_opcode_info *info, 1379 unsigned index, 1380 LLVMValueRef dst[4]) 1381{ 1382 struct si_shader_context *ctx = si_shader_context(bld_base); 1383 const struct tgsi_full_dst_register *reg = &inst->Dst[index]; 1384 const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info; 1385 unsigned chan_index; 1386 LLVMValueRef dw_addr, stride; 1387 LLVMValueRef buffer, base, buf_addr; 1388 LLVMValueRef values[4]; 1389 bool skip_lds_store; 1390 bool is_tess_factor = false, is_tess_inner = false; 1391 1392 /* Only handle per-patch and per-vertex outputs here. 1393 * Vectors will be lowered to scalars and this function will be called again. 
1394 */ 1395 if (reg->Register.File != TGSI_FILE_OUTPUT || 1396 (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) { 1397 si_llvm_emit_store(bld_base, inst, info, index, dst); 1398 return; 1399 } 1400 1401 if (reg->Register.Dimension) { 1402 stride = get_tcs_out_vertex_dw_stride(ctx); 1403 dw_addr = get_tcs_out_current_patch_offset(ctx); 1404 dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr); 1405 skip_lds_store = !sh_info->reads_pervertex_outputs; 1406 } else { 1407 dw_addr = get_tcs_out_current_patch_data_offset(ctx); 1408 dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr); 1409 skip_lds_store = !sh_info->reads_perpatch_outputs; 1410 1411 if (!reg->Register.Indirect) { 1412 int name = sh_info->output_semantic_name[reg->Register.Index]; 1413 1414 /* Always write tess factors into LDS for the TCS epilog. */ 1415 if (name == TGSI_SEMANTIC_TESSINNER || 1416 name == TGSI_SEMANTIC_TESSOUTER) { 1417 /* The epilog doesn't read LDS if invocation 0 defines tess factors. */ 1418 skip_lds_store = !sh_info->reads_tessfactor_outputs && 1419 ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs; 1420 is_tess_factor = true; 1421 is_tess_inner = name == TGSI_SEMANTIC_TESSINNER; 1422 } 1423 } 1424 } 1425 1426 buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS); 1427 1428 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); 1429 buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL); 1430 1431 uint32_t writemask = reg->Register.WriteMask; 1432 while (writemask) { 1433 chan_index = u_bit_scan(&writemask); 1434 LLVMValueRef value = dst[chan_index]; 1435 1436 if (inst->Instruction.Saturate) 1437 value = ac_build_clamp(&ctx->ac, value); 1438 1439 /* Skip LDS stores if there is no LDS read of this output. 
*/ 1440 if (!skip_lds_store) 1441 lds_store(ctx, chan_index, dw_addr, value); 1442 1443 value = ac_to_integer(&ctx->ac, value); 1444 values[chan_index] = value; 1445 1446 if (reg->Register.WriteMask != 0xF && !is_tess_factor) { 1447 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1, 1448 buf_addr, base, 1449 4 * chan_index, 1, 0, true, false); 1450 } 1451 1452 /* Write tess factors into VGPRs for the epilog. */ 1453 if (is_tess_factor && 1454 ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) { 1455 if (!is_tess_inner) { 1456 LLVMBuildStore(ctx->ac.builder, value, /* outer */ 1457 ctx->invoc0_tess_factors[chan_index]); 1458 } else if (chan_index < 2) { 1459 LLVMBuildStore(ctx->ac.builder, value, /* inner */ 1460 ctx->invoc0_tess_factors[4 + chan_index]); 1461 } 1462 } 1463 } 1464 1465 if (reg->Register.WriteMask == 0xF && !is_tess_factor) { 1466 LLVMValueRef value = ac_build_gather_values(&ctx->ac, 1467 values, 4); 1468 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr, 1469 base, 0, 1, 0, true, false); 1470 } 1471} 1472 1473static void si_nir_store_output_tcs(struct ac_shader_abi *abi, 1474 const struct nir_variable *var, 1475 LLVMValueRef vertex_index, 1476 LLVMValueRef param_index, 1477 unsigned const_index, 1478 LLVMValueRef src, 1479 unsigned writemask) 1480{ 1481 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 1482 struct tgsi_shader_info *info = &ctx->shader->selector->info; 1483 const unsigned component = var->data.location_frac; 1484 const bool is_patch = var->data.patch; 1485 unsigned driver_location = var->data.driver_location; 1486 LLVMValueRef dw_addr, stride; 1487 LLVMValueRef buffer, base, addr; 1488 LLVMValueRef values[4]; 1489 bool skip_lds_store; 1490 bool is_tess_factor = false, is_tess_inner = false; 1491 1492 driver_location = driver_location / 4; 1493 1494 if (param_index) { 1495 /* Add the constant index to the indirect index */ 1496 param_index = LLVMBuildAdd(ctx->ac.builder, param_index, 
1497 LLVMConstInt(ctx->i32, const_index, 0), ""); 1498 } else { 1499 if (const_index != 0) 1500 param_index = LLVMConstInt(ctx->i32, const_index, 0); 1501 } 1502 1503 if (!is_patch) { 1504 stride = get_tcs_out_vertex_dw_stride(ctx); 1505 dw_addr = get_tcs_out_current_patch_offset(ctx); 1506 dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, 1507 vertex_index, param_index, 1508 driver_location, 1509 info->output_semantic_name, 1510 info->output_semantic_index, 1511 is_patch); 1512 1513 skip_lds_store = !info->reads_pervertex_outputs; 1514 } else { 1515 dw_addr = get_tcs_out_current_patch_data_offset(ctx); 1516 dw_addr = get_dw_address_from_generic_indices(ctx, NULL, dw_addr, 1517 vertex_index, param_index, 1518 driver_location, 1519 info->output_semantic_name, 1520 info->output_semantic_index, 1521 is_patch); 1522 1523 skip_lds_store = !info->reads_perpatch_outputs; 1524 1525 if (!param_index) { 1526 int name = info->output_semantic_name[driver_location]; 1527 1528 /* Always write tess factors into LDS for the TCS epilog. */ 1529 if (name == TGSI_SEMANTIC_TESSINNER || 1530 name == TGSI_SEMANTIC_TESSOUTER) { 1531 /* The epilog doesn't read LDS if invocation 0 defines tess factors. 
*/ 1532 skip_lds_store = !info->reads_tessfactor_outputs && 1533 ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs; 1534 is_tess_factor = true; 1535 is_tess_inner = name == TGSI_SEMANTIC_TESSINNER; 1536 } 1537 } 1538 } 1539 1540 buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS); 1541 1542 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); 1543 1544 addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, 1545 param_index, driver_location, 1546 info->output_semantic_name, 1547 info->output_semantic_index, 1548 is_patch); 1549 1550 for (unsigned chan = 0; chan < 4; chan++) { 1551 if (!(writemask & (1 << chan))) 1552 continue; 1553 LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component); 1554 1555 /* Skip LDS stores if there is no LDS read of this output. */ 1556 if (!skip_lds_store) 1557 lds_store(ctx, chan, dw_addr, value); 1558 1559 value = ac_to_integer(&ctx->ac, value); 1560 values[chan] = value; 1561 1562 if (writemask != 0xF && !is_tess_factor) { 1563 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1, 1564 addr, base, 1565 4 * chan, 1, 0, true, false); 1566 } 1567 1568 /* Write tess factors into VGPRs for the epilog. 
*/ 1569 if (is_tess_factor && 1570 ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) { 1571 if (!is_tess_inner) { 1572 LLVMBuildStore(ctx->ac.builder, value, /* outer */ 1573 ctx->invoc0_tess_factors[chan]); 1574 } else if (chan < 2) { 1575 LLVMBuildStore(ctx->ac.builder, value, /* inner */ 1576 ctx->invoc0_tess_factors[4 + chan]); 1577 } 1578 } 1579 } 1580 1581 if (writemask == 0xF && !is_tess_factor) { 1582 LLVMValueRef value = ac_build_gather_values(&ctx->ac, 1583 values, 4); 1584 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, addr, 1585 base, 0, 1, 0, true, false); 1586 } 1587} 1588 1589LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, 1590 unsigned input_index, 1591 unsigned vtx_offset_param, 1592 LLVMTypeRef type, 1593 unsigned swizzle) 1594{ 1595 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 1596 struct lp_build_tgsi_context *bld_base = &ctx->bld_base; 1597 struct si_shader *shader = ctx->shader; 1598 LLVMValueRef vtx_offset, soffset; 1599 struct tgsi_shader_info *info = &shader->selector->info; 1600 unsigned semantic_name = info->input_semantic_name[input_index]; 1601 unsigned semantic_index = info->input_semantic_index[input_index]; 1602 unsigned param; 1603 LLVMValueRef value; 1604 1605 param = si_shader_io_get_unique_index(semantic_name, semantic_index, false); 1606 1607 /* GFX9 has the ESGS ring in LDS. */ 1608 if (ctx->screen->info.chip_class >= GFX9) { 1609 unsigned index = vtx_offset_param; 1610 1611 switch (index / 2) { 1612 case 0: 1613 vtx_offset = si_unpack_param(ctx, ctx->param_gs_vtx01_offset, 1614 index % 2 ? 16 : 0, 16); 1615 break; 1616 case 1: 1617 vtx_offset = si_unpack_param(ctx, ctx->param_gs_vtx23_offset, 1618 index % 2 ? 16 : 0, 16); 1619 break; 1620 case 2: 1621 vtx_offset = si_unpack_param(ctx, ctx->param_gs_vtx45_offset, 1622 index % 2 ? 
16 : 0, 16); 1623 break; 1624 default: 1625 assert(0); 1626 return NULL; 1627 } 1628 1629 vtx_offset = LLVMBuildAdd(ctx->ac.builder, vtx_offset, 1630 LLVMConstInt(ctx->i32, param * 4, 0), ""); 1631 return lds_load(bld_base, type, swizzle, vtx_offset); 1632 } 1633 1634 /* GFX6: input load from the ESGS ring in memory. */ 1635 if (swizzle == ~0) { 1636 LLVMValueRef values[TGSI_NUM_CHANNELS]; 1637 unsigned chan; 1638 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { 1639 values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param, 1640 type, chan); 1641 } 1642 return ac_build_gather_values(&ctx->ac, values, 1643 TGSI_NUM_CHANNELS); 1644 } 1645 1646 /* Get the vertex offset parameter on GFX6. */ 1647 LLVMValueRef gs_vtx_offset = ctx->gs_vtx_offset[vtx_offset_param]; 1648 1649 vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset, 1650 LLVMConstInt(ctx->i32, 4, 0), ""); 1651 1652 soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0); 1653 1654 value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0, 1655 vtx_offset, soffset, 0, 1, 0, true, false); 1656 if (llvm_type_is_64bit(ctx, type)) { 1657 LLVMValueRef value2; 1658 soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0); 1659 1660 value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, 1661 ctx->i32_0, vtx_offset, soffset, 1662 0, 1, 0, true, false); 1663 return si_llvm_emit_fetch_64bit(bld_base, type, value, value2); 1664 } 1665 return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); 1666} 1667 1668static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi, 1669 unsigned location, 1670 unsigned driver_location, 1671 unsigned component, 1672 unsigned num_components, 1673 unsigned vertex_index, 1674 unsigned const_index, 1675 LLVMTypeRef type) 1676{ 1677 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 1678 1679 LLVMValueRef value[4]; 1680 for (unsigned i = 0; i < num_components; i++) { 1681 unsigned offset = i; 1682 if 
(llvm_type_is_64bit(ctx, type)) 1683 offset *= 2; 1684 1685 offset += component; 1686 value[i + component] = si_llvm_load_input_gs(&ctx->abi, driver_location / 4, 1687 vertex_index, type, offset); 1688 } 1689 1690 return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); 1691} 1692 1693static LLVMValueRef fetch_input_gs( 1694 struct lp_build_tgsi_context *bld_base, 1695 const struct tgsi_full_src_register *reg, 1696 enum tgsi_opcode_type type, 1697 unsigned swizzle_in) 1698{ 1699 struct si_shader_context *ctx = si_shader_context(bld_base); 1700 struct tgsi_shader_info *info = &ctx->shader->selector->info; 1701 unsigned swizzle = swizzle_in & 0xffff; 1702 1703 unsigned semantic_name = info->input_semantic_name[reg->Register.Index]; 1704 if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID) 1705 return get_primitive_id(ctx, swizzle); 1706 1707 if (!reg->Register.Dimension) 1708 return NULL; 1709 1710 return si_llvm_load_input_gs(&ctx->abi, reg->Register.Index, 1711 reg->Dimension.Index, 1712 tgsi2llvmtype(bld_base, type), 1713 swizzle); 1714} 1715 1716static int lookup_interp_param_index(unsigned interpolate, unsigned location) 1717{ 1718 switch (interpolate) { 1719 case TGSI_INTERPOLATE_CONSTANT: 1720 return 0; 1721 1722 case TGSI_INTERPOLATE_LINEAR: 1723 if (location == TGSI_INTERPOLATE_LOC_SAMPLE) 1724 return SI_PARAM_LINEAR_SAMPLE; 1725 else if (location == TGSI_INTERPOLATE_LOC_CENTROID) 1726 return SI_PARAM_LINEAR_CENTROID; 1727 else 1728 return SI_PARAM_LINEAR_CENTER; 1729 break; 1730 case TGSI_INTERPOLATE_COLOR: 1731 case TGSI_INTERPOLATE_PERSPECTIVE: 1732 if (location == TGSI_INTERPOLATE_LOC_SAMPLE) 1733 return SI_PARAM_PERSP_SAMPLE; 1734 else if (location == TGSI_INTERPOLATE_LOC_CENTROID) 1735 return SI_PARAM_PERSP_CENTROID; 1736 else 1737 return SI_PARAM_PERSP_CENTER; 1738 break; 1739 default: 1740 fprintf(stderr, "Warning: Unhandled interpolation mode.\n"); 1741 return -1; 1742 } 1743} 1744 1745static LLVMValueRef 
si_build_fs_interp(struct si_shader_context *ctx, 1746 unsigned attr_index, unsigned chan, 1747 LLVMValueRef prim_mask, 1748 LLVMValueRef i, LLVMValueRef j) 1749{ 1750 if (i || j) { 1751 return ac_build_fs_interp(&ctx->ac, 1752 LLVMConstInt(ctx->i32, chan, 0), 1753 LLVMConstInt(ctx->i32, attr_index, 0), 1754 prim_mask, i, j); 1755 } 1756 return ac_build_fs_interp_mov(&ctx->ac, 1757 LLVMConstInt(ctx->i32, 2, 0), /* P0 */ 1758 LLVMConstInt(ctx->i32, chan, 0), 1759 LLVMConstInt(ctx->i32, attr_index, 0), 1760 prim_mask); 1761} 1762 1763/** 1764 * Interpolate a fragment shader input. 1765 * 1766 * @param ctx context 1767 * @param input_index index of the input in hardware 1768 * @param semantic_name TGSI_SEMANTIC_* 1769 * @param semantic_index semantic index 1770 * @param num_interp_inputs number of all interpolated inputs (= BCOLOR offset) 1771 * @param colors_read_mask color components read (4 bits for each color, 8 bits in total) 1772 * @param interp_param interpolation weights (i,j) 1773 * @param prim_mask SI_PARAM_PRIM_MASK 1774 * @param face SI_PARAM_FRONT_FACE 1775 * @param result the return value (4 components) 1776 */ 1777static void interp_fs_input(struct si_shader_context *ctx, 1778 unsigned input_index, 1779 unsigned semantic_name, 1780 unsigned semantic_index, 1781 unsigned num_interp_inputs, 1782 unsigned colors_read_mask, 1783 LLVMValueRef interp_param, 1784 LLVMValueRef prim_mask, 1785 LLVMValueRef face, 1786 LLVMValueRef result[4]) 1787{ 1788 LLVMValueRef i = NULL, j = NULL; 1789 unsigned chan; 1790 1791 /* fs.constant returns the param from the middle vertex, so it's not 1792 * really useful for flat shading. It's meant to be used for custom 1793 * interpolation (but the intrinsic can't fetch from the other two 1794 * vertices). 1795 * 1796 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state 1797 * to do the right thing. 
The only reason we use fs.constant is that 1798 * fs.interp cannot be used on integers, because they can be equal 1799 * to NaN. 1800 * 1801 * When interp is false we will use fs.constant or for newer llvm, 1802 * amdgcn.interp.mov. 1803 */ 1804 bool interp = interp_param != NULL; 1805 1806 if (interp) { 1807 interp_param = LLVMBuildBitCast(ctx->ac.builder, interp_param, 1808 LLVMVectorType(ctx->f32, 2), ""); 1809 1810 i = LLVMBuildExtractElement(ctx->ac.builder, interp_param, 1811 ctx->i32_0, ""); 1812 j = LLVMBuildExtractElement(ctx->ac.builder, interp_param, 1813 ctx->i32_1, ""); 1814 } 1815 1816 if (semantic_name == TGSI_SEMANTIC_COLOR && 1817 ctx->shader->key.part.ps.prolog.color_two_side) { 1818 LLVMValueRef is_face_positive; 1819 1820 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1", 1821 * otherwise it's at offset "num_inputs". 1822 */ 1823 unsigned back_attr_offset = num_interp_inputs; 1824 if (semantic_index == 1 && colors_read_mask & 0xf) 1825 back_attr_offset += 1; 1826 1827 is_face_positive = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, 1828 face, ctx->i32_0, ""); 1829 1830 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { 1831 LLVMValueRef front, back; 1832 1833 front = si_build_fs_interp(ctx, 1834 input_index, chan, 1835 prim_mask, i, j); 1836 back = si_build_fs_interp(ctx, 1837 back_attr_offset, chan, 1838 prim_mask, i, j); 1839 1840 result[chan] = LLVMBuildSelect(ctx->ac.builder, 1841 is_face_positive, 1842 front, 1843 back, 1844 ""); 1845 } 1846 } else if (semantic_name == TGSI_SEMANTIC_FOG) { 1847 result[0] = si_build_fs_interp(ctx, input_index, 1848 0, prim_mask, i, j); 1849 result[1] = 1850 result[2] = LLVMConstReal(ctx->f32, 0.0f); 1851 result[3] = LLVMConstReal(ctx->f32, 1.0f); 1852 } else { 1853 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { 1854 result[chan] = si_build_fs_interp(ctx, 1855 input_index, chan, 1856 prim_mask, i, j); 1857 } 1858 } 1859} 1860 1861void si_llvm_load_input_fs( 1862 struct si_shader_context *ctx, 1863 
unsigned input_index, 1864 LLVMValueRef out[4]) 1865{ 1866 struct si_shader *shader = ctx->shader; 1867 struct tgsi_shader_info *info = &shader->selector->info; 1868 LLVMValueRef main_fn = ctx->main_fn; 1869 LLVMValueRef interp_param = NULL; 1870 int interp_param_idx; 1871 enum tgsi_semantic semantic_name = info->input_semantic_name[input_index]; 1872 unsigned semantic_index = info->input_semantic_index[input_index]; 1873 enum tgsi_interpolate_mode interp_mode = info->input_interpolate[input_index]; 1874 enum tgsi_interpolate_loc interp_loc = info->input_interpolate_loc[input_index]; 1875 1876 /* Get colors from input VGPRs (set by the prolog). */ 1877 if (semantic_name == TGSI_SEMANTIC_COLOR) { 1878 unsigned colors_read = shader->selector->info.colors_read; 1879 unsigned mask = colors_read >> (semantic_index * 4); 1880 unsigned offset = SI_PARAM_POS_FIXED_PT + 1 + 1881 (semantic_index ? util_bitcount(colors_read & 0xf) : 0); 1882 LLVMValueRef undef = LLVMGetUndef(ctx->f32); 1883 1884 out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef; 1885 out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef; 1886 out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef; 1887 out[3] = mask & 0x8 ? 
LLVMGetParam(main_fn, offset++) : undef; 1888 return; 1889 } 1890 1891 interp_param_idx = lookup_interp_param_index(interp_mode, interp_loc); 1892 if (interp_param_idx == -1) 1893 return; 1894 else if (interp_param_idx) { 1895 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx); 1896 } 1897 1898 interp_fs_input(ctx, input_index, semantic_name, 1899 semantic_index, 0, /* this param is unused */ 1900 shader->selector->info.colors_read, interp_param, 1901 ctx->abi.prim_mask, 1902 LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE), 1903 &out[0]); 1904} 1905 1906static void declare_input_fs( 1907 struct si_shader_context *ctx, 1908 unsigned input_index, 1909 const struct tgsi_full_declaration *decl, 1910 LLVMValueRef out[4]) 1911{ 1912 si_llvm_load_input_fs(ctx, input_index, out); 1913} 1914 1915LLVMValueRef si_get_sample_id(struct si_shader_context *ctx) 1916{ 1917 return si_unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4); 1918} 1919 1920static LLVMValueRef get_base_vertex(struct ac_shader_abi *abi) 1921{ 1922 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 1923 1924 /* For non-indexed draws, the base vertex set by the driver 1925 * (for direct draws) or the CP (for indirect draws) is the 1926 * first vertex ID, but GLSL expects 0 to be returned. 
1927 */ 1928 LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn, 1929 ctx->param_vs_state_bits); 1930 LLVMValueRef indexed; 1931 1932 indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->i32_1, ""); 1933 indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->i1, ""); 1934 1935 return LLVMBuildSelect(ctx->ac.builder, indexed, ctx->abi.base_vertex, 1936 ctx->i32_0, ""); 1937} 1938 1939static LLVMValueRef get_block_size(struct ac_shader_abi *abi) 1940{ 1941 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 1942 1943 LLVMValueRef values[3]; 1944 LLVMValueRef result; 1945 unsigned i; 1946 unsigned *properties = ctx->shader->selector->info.properties; 1947 1948 if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) { 1949 unsigned sizes[3] = { 1950 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH], 1951 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT], 1952 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] 1953 }; 1954 1955 for (i = 0; i < 3; ++i) 1956 values[i] = LLVMConstInt(ctx->i32, sizes[i], 0); 1957 1958 result = ac_build_gather_values(&ctx->ac, values, 3); 1959 } else { 1960 result = LLVMGetParam(ctx->main_fn, ctx->param_block_size); 1961 } 1962 1963 return result; 1964} 1965 1966/** 1967 * Load a dword from a constant buffer. 
1968 */ 1969static LLVMValueRef buffer_load_const(struct si_shader_context *ctx, 1970 LLVMValueRef resource, 1971 LLVMValueRef offset) 1972{ 1973 return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL, 1974 0, 0, 0, true, true); 1975} 1976 1977static LLVMValueRef load_sample_position(struct ac_shader_abi *abi, LLVMValueRef sample_id) 1978{ 1979 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 1980 LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers); 1981 LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0); 1982 LLVMValueRef resource = ac_build_load_to_sgpr(&ctx->ac, desc, buf_index); 1983 1984 /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */ 1985 LLVMValueRef offset0 = LLVMBuildMul(ctx->ac.builder, sample_id, LLVMConstInt(ctx->i32, 8, 0), ""); 1986 LLVMValueRef offset1 = LLVMBuildAdd(ctx->ac.builder, offset0, LLVMConstInt(ctx->i32, 4, 0), ""); 1987 1988 LLVMValueRef pos[4] = { 1989 buffer_load_const(ctx, resource, offset0), 1990 buffer_load_const(ctx, resource, offset1), 1991 LLVMConstReal(ctx->f32, 0), 1992 LLVMConstReal(ctx->f32, 0) 1993 }; 1994 1995 return ac_build_gather_values(&ctx->ac, pos, 4); 1996} 1997 1998static LLVMValueRef load_sample_mask_in(struct ac_shader_abi *abi) 1999{ 2000 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 2001 return ac_to_integer(&ctx->ac, abi->sample_coverage); 2002} 2003 2004static LLVMValueRef si_load_tess_coord(struct ac_shader_abi *abi) 2005{ 2006 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 2007 LLVMValueRef coord[4] = { 2008 LLVMGetParam(ctx->main_fn, ctx->param_tes_u), 2009 LLVMGetParam(ctx->main_fn, ctx->param_tes_v), 2010 ctx->ac.f32_0, 2011 ctx->ac.f32_0 2012 }; 2013 2014 /* For triangles, the vector should be (u, v, 1-u-v). 
*/ 2015 if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == 2016 PIPE_PRIM_TRIANGLES) { 2017 coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1, 2018 LLVMBuildFAdd(ctx->ac.builder, 2019 coord[0], coord[1], ""), ""); 2020 } 2021 return ac_build_gather_values(&ctx->ac, coord, 4); 2022} 2023 2024static LLVMValueRef load_tess_level(struct si_shader_context *ctx, 2025 unsigned semantic_name) 2026{ 2027 LLVMValueRef base, addr; 2028 2029 int param = si_shader_io_get_unique_index_patch(semantic_name, 0); 2030 2031 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); 2032 addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL, 2033 LLVMConstInt(ctx->i32, param, 0)); 2034 2035 return buffer_load(&ctx->bld_base, ctx->f32, 2036 ~0, ctx->tess_offchip_ring, base, addr, true); 2037 2038} 2039 2040static LLVMValueRef si_load_tess_level(struct ac_shader_abi *abi, 2041 unsigned varying_id) 2042{ 2043 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 2044 unsigned semantic_name; 2045 2046 switch (varying_id) { 2047 case VARYING_SLOT_TESS_LEVEL_INNER: 2048 semantic_name = TGSI_SEMANTIC_TESSINNER; 2049 break; 2050 case VARYING_SLOT_TESS_LEVEL_OUTER: 2051 semantic_name = TGSI_SEMANTIC_TESSOUTER; 2052 break; 2053 default: 2054 unreachable("unknown tess level"); 2055 } 2056 2057 return load_tess_level(ctx, semantic_name); 2058 2059} 2060 2061static LLVMValueRef si_load_patch_vertices_in(struct ac_shader_abi *abi) 2062{ 2063 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 2064 if (ctx->type == PIPE_SHADER_TESS_CTRL) 2065 return si_unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 6); 2066 else if (ctx->type == PIPE_SHADER_TESS_EVAL) 2067 return get_num_tcs_out_vertices(ctx); 2068 else 2069 unreachable("invalid shader stage for TGSI_SEMANTIC_VERTICESIN"); 2070} 2071 2072void si_load_system_value(struct si_shader_context *ctx, 2073 unsigned index, 2074 const struct tgsi_full_declaration *decl) 2075{ 2076 
LLVMValueRef value = 0; 2077 2078 assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES); 2079 2080 switch (decl->Semantic.Name) { 2081 case TGSI_SEMANTIC_INSTANCEID: 2082 value = ctx->abi.instance_id; 2083 break; 2084 2085 case TGSI_SEMANTIC_VERTEXID: 2086 value = LLVMBuildAdd(ctx->ac.builder, 2087 ctx->abi.vertex_id, 2088 ctx->abi.base_vertex, ""); 2089 break; 2090 2091 case TGSI_SEMANTIC_VERTEXID_NOBASE: 2092 /* Unused. Clarify the meaning in indexed vs. non-indexed 2093 * draws if this is ever used again. */ 2094 assert(false); 2095 break; 2096 2097 case TGSI_SEMANTIC_BASEVERTEX: 2098 value = get_base_vertex(&ctx->abi); 2099 break; 2100 2101 case TGSI_SEMANTIC_BASEINSTANCE: 2102 value = ctx->abi.start_instance; 2103 break; 2104 2105 case TGSI_SEMANTIC_DRAWID: 2106 value = ctx->abi.draw_id; 2107 break; 2108 2109 case TGSI_SEMANTIC_INVOCATIONID: 2110 if (ctx->type == PIPE_SHADER_TESS_CTRL) 2111 value = unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 8, 5); 2112 else if (ctx->type == PIPE_SHADER_GEOMETRY) 2113 value = ctx->abi.gs_invocation_id; 2114 else 2115 assert(!"INVOCATIONID not implemented"); 2116 break; 2117 2118 case TGSI_SEMANTIC_POSITION: 2119 { 2120 LLVMValueRef pos[4] = { 2121 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT), 2122 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT), 2123 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Z_FLOAT), 2124 ac_build_fdiv(&ctx->ac, ctx->ac.f32_1, 2125 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_W_FLOAT)), 2126 }; 2127 value = ac_build_gather_values(&ctx->ac, pos, 4); 2128 break; 2129 } 2130 2131 case TGSI_SEMANTIC_FACE: 2132 value = ctx->abi.front_face; 2133 break; 2134 2135 case TGSI_SEMANTIC_SAMPLEID: 2136 value = si_get_sample_id(ctx); 2137 break; 2138 2139 case TGSI_SEMANTIC_SAMPLEPOS: { 2140 LLVMValueRef pos[4] = { 2141 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT), 2142 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT), 2143 LLVMConstReal(ctx->f32, 0), 2144 LLVMConstReal(ctx->f32, 0) 2145 }; 2146 pos[0] = 
ac_build_fract(&ctx->ac, pos[0], 32); 2147 pos[1] = ac_build_fract(&ctx->ac, pos[1], 32); 2148 value = ac_build_gather_values(&ctx->ac, pos, 4); 2149 break; 2150 } 2151 2152 case TGSI_SEMANTIC_SAMPLEMASK: 2153 /* This can only occur with the OpenGL Core profile, which 2154 * doesn't support smoothing. 2155 */ 2156 value = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE); 2157 break; 2158 2159 case TGSI_SEMANTIC_TESSCOORD: 2160 value = si_load_tess_coord(&ctx->abi); 2161 break; 2162 2163 case TGSI_SEMANTIC_VERTICESIN: 2164 value = si_load_patch_vertices_in(&ctx->abi); 2165 break; 2166 2167 case TGSI_SEMANTIC_TESSINNER: 2168 case TGSI_SEMANTIC_TESSOUTER: 2169 value = load_tess_level(ctx, decl->Semantic.Name); 2170 break; 2171 2172 case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI: 2173 case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI: 2174 { 2175 LLVMValueRef buf, slot, val[4]; 2176 int i, offset; 2177 2178 slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0); 2179 buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers); 2180 buf = ac_build_load_to_sgpr(&ctx->ac, buf, slot); 2181 offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 
4 : 0; 2182 2183 for (i = 0; i < 4; i++) 2184 val[i] = buffer_load_const(ctx, buf, 2185 LLVMConstInt(ctx->i32, (offset + i) * 4, 0)); 2186 value = ac_build_gather_values(&ctx->ac, val, 4); 2187 break; 2188 } 2189 2190 case TGSI_SEMANTIC_PRIMID: 2191 value = get_primitive_id(ctx, 0); 2192 break; 2193 2194 case TGSI_SEMANTIC_GRID_SIZE: 2195 value = ctx->abi.num_work_groups; 2196 break; 2197 2198 case TGSI_SEMANTIC_BLOCK_SIZE: 2199 value = get_block_size(&ctx->abi); 2200 break; 2201 2202 case TGSI_SEMANTIC_BLOCK_ID: 2203 { 2204 LLVMValueRef values[3]; 2205 2206 for (int i = 0; i < 3; i++) { 2207 values[i] = ctx->i32_0; 2208 if (ctx->abi.workgroup_ids[i]) { 2209 values[i] = ctx->abi.workgroup_ids[i]; 2210 } 2211 } 2212 value = ac_build_gather_values(&ctx->ac, values, 3); 2213 break; 2214 } 2215 2216 case TGSI_SEMANTIC_THREAD_ID: 2217 value = ctx->abi.local_invocation_ids; 2218 break; 2219 2220 case TGSI_SEMANTIC_HELPER_INVOCATION: 2221 value = ac_build_load_helper_invocation(&ctx->ac); 2222 break; 2223 2224 case TGSI_SEMANTIC_SUBGROUP_SIZE: 2225 value = LLVMConstInt(ctx->i32, 64, 0); 2226 break; 2227 2228 case TGSI_SEMANTIC_SUBGROUP_INVOCATION: 2229 value = ac_get_thread_id(&ctx->ac); 2230 break; 2231 2232 case TGSI_SEMANTIC_SUBGROUP_EQ_MASK: 2233 { 2234 LLVMValueRef id = ac_get_thread_id(&ctx->ac); 2235 id = LLVMBuildZExt(ctx->ac.builder, id, ctx->i64, ""); 2236 value = LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->i64, 1, 0), id, ""); 2237 value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->v2i32, ""); 2238 break; 2239 } 2240 2241 case TGSI_SEMANTIC_SUBGROUP_GE_MASK: 2242 case TGSI_SEMANTIC_SUBGROUP_GT_MASK: 2243 case TGSI_SEMANTIC_SUBGROUP_LE_MASK: 2244 case TGSI_SEMANTIC_SUBGROUP_LT_MASK: 2245 { 2246 LLVMValueRef id = ac_get_thread_id(&ctx->ac); 2247 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK || 2248 decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) { 2249 /* All bits set except LSB */ 2250 value = LLVMConstInt(ctx->i64, -2, 0); 2251 } else 
{ 2252 /* All bits set */ 2253 value = LLVMConstInt(ctx->i64, -1, 0); 2254 } 2255 id = LLVMBuildZExt(ctx->ac.builder, id, ctx->i64, ""); 2256 value = LLVMBuildShl(ctx->ac.builder, value, id, ""); 2257 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK || 2258 decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK) 2259 value = LLVMBuildNot(ctx->ac.builder, value, ""); 2260 value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->v2i32, ""); 2261 break; 2262 } 2263 2264 case TGSI_SEMANTIC_CS_USER_DATA: 2265 value = LLVMGetParam(ctx->main_fn, ctx->param_cs_user_data); 2266 break; 2267 2268 default: 2269 assert(!"unknown system value"); 2270 return; 2271 } 2272 2273 ctx->system_values[index] = value; 2274} 2275 2276void si_declare_compute_memory(struct si_shader_context *ctx) 2277{ 2278 struct si_shader_selector *sel = ctx->shader->selector; 2279 unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]; 2280 2281 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, AC_ADDR_SPACE_LDS); 2282 LLVMValueRef var; 2283 2284 assert(!ctx->ac.lds); 2285 2286 var = LLVMAddGlobalInAddressSpace(ctx->ac.module, 2287 LLVMArrayType(ctx->i8, lds_size), 2288 "compute_lds", 2289 AC_ADDR_SPACE_LDS); 2290 LLVMSetAlignment(var, 4); 2291 2292 ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, ""); 2293} 2294 2295void si_tgsi_declare_compute_memory(struct si_shader_context *ctx, 2296 const struct tgsi_full_declaration *decl) 2297{ 2298 assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED); 2299 assert(decl->Range.First == decl->Range.Last); 2300 2301 si_declare_compute_memory(ctx); 2302} 2303 2304static LLVMValueRef load_const_buffer_desc_fast_path(struct si_shader_context *ctx) 2305{ 2306 LLVMValueRef ptr = 2307 LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers); 2308 struct si_shader_selector *sel = ctx->shader->selector; 2309 2310 /* Do the bounds checking with a descriptor, because 2311 * doing computation and manual bounds checking of 64-bit 2312 * 
addresses generates horrible VALU code with very high 2313 * VGPR usage and very low SIMD occupancy. 2314 */ 2315 ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->ac.intptr, ""); 2316 2317 LLVMValueRef desc0, desc1; 2318 desc0 = ptr; 2319 desc1 = LLVMConstInt(ctx->i32, 2320 S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0); 2321 2322 LLVMValueRef desc_elems[] = { 2323 desc0, 2324 desc1, 2325 LLVMConstInt(ctx->i32, (sel->info.const_file_max[0] + 1) * 16, 0), 2326 LLVMConstInt(ctx->i32, 2327 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | 2328 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 2329 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | 2330 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | 2331 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | 2332 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0) 2333 }; 2334 2335 return ac_build_gather_values(&ctx->ac, desc_elems, 4); 2336} 2337 2338static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i) 2339{ 2340 LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn, 2341 ctx->param_const_and_shader_buffers); 2342 2343 return ac_build_load_to_sgpr(&ctx->ac, list_ptr, 2344 LLVMConstInt(ctx->i32, si_get_constbuf_slot(i), 0)); 2345} 2346 2347static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index) 2348{ 2349 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 2350 struct si_shader_selector *sel = ctx->shader->selector; 2351 2352 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers); 2353 2354 if (sel->info.const_buffers_declared == 1 && 2355 sel->info.shader_buffers_declared == 0) { 2356 return load_const_buffer_desc_fast_path(ctx); 2357 } 2358 2359 index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers); 2360 index = LLVMBuildAdd(ctx->ac.builder, index, 2361 LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), ""); 2362 2363 return ac_build_load_to_sgpr(&ctx->ac, ptr, index); 2364} 2365 2366static LLVMValueRef 2367load_ssbo(struct ac_shader_abi *abi, 
LLVMValueRef index, bool write) 2368{ 2369 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 2370 LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn, 2371 ctx->param_const_and_shader_buffers); 2372 2373 index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers); 2374 index = LLVMBuildSub(ctx->ac.builder, 2375 LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS - 1, 0), 2376 index, ""); 2377 2378 return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index); 2379} 2380 2381static LLVMValueRef fetch_constant( 2382 struct lp_build_tgsi_context *bld_base, 2383 const struct tgsi_full_src_register *reg, 2384 enum tgsi_opcode_type type, 2385 unsigned swizzle_in) 2386{ 2387 struct si_shader_context *ctx = si_shader_context(bld_base); 2388 struct si_shader_selector *sel = ctx->shader->selector; 2389 const struct tgsi_ind_register *ireg = ®->Indirect; 2390 unsigned buf, idx; 2391 unsigned swizzle = swizzle_in & 0xffff; 2392 2393 LLVMValueRef addr, bufp; 2394 2395 if (swizzle_in == LP_CHAN_ALL) { 2396 unsigned chan; 2397 LLVMValueRef values[4]; 2398 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) 2399 values[chan] = fetch_constant(bld_base, reg, type, chan); 2400 2401 return ac_build_gather_values(&ctx->ac, values, 4); 2402 } 2403 2404 /* Split 64-bit loads. */ 2405 if (tgsi_type_is_64bit(type)) { 2406 LLVMValueRef lo, hi; 2407 2408 lo = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, swizzle); 2409 hi = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, (swizzle_in >> 16)); 2410 return si_llvm_emit_fetch_64bit(bld_base, tgsi2llvmtype(bld_base, type), 2411 lo, hi); 2412 } 2413 2414 idx = reg->Register.Index * 4 + swizzle; 2415 if (reg->Register.Indirect) { 2416 addr = si_get_indirect_index(ctx, ireg, 16, idx * 4); 2417 } else { 2418 addr = LLVMConstInt(ctx->i32, idx * 4, 0); 2419 } 2420 2421 /* Fast path when user data SGPRs point to constant buffer 0 directly. 
*/ 2422 if (sel->info.const_buffers_declared == 1 && 2423 sel->info.shader_buffers_declared == 0) { 2424 LLVMValueRef desc = load_const_buffer_desc_fast_path(ctx); 2425 LLVMValueRef result = buffer_load_const(ctx, desc, addr); 2426 return bitcast(bld_base, type, result); 2427 } 2428 2429 assert(reg->Register.Dimension); 2430 buf = reg->Dimension.Index; 2431 2432 if (reg->Dimension.Indirect) { 2433 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers); 2434 LLVMValueRef index; 2435 index = si_get_bounded_indirect_index(ctx, ®->DimIndirect, 2436 reg->Dimension.Index, 2437 ctx->num_const_buffers); 2438 index = LLVMBuildAdd(ctx->ac.builder, index, 2439 LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), ""); 2440 bufp = ac_build_load_to_sgpr(&ctx->ac, ptr, index); 2441 } else 2442 bufp = load_const_buffer_desc(ctx, buf); 2443 2444 return bitcast(bld_base, type, buffer_load_const(ctx, bufp, addr)); 2445} 2446 2447/* Initialize arguments for the shader export intrinsic */ 2448static void si_llvm_init_export_args(struct si_shader_context *ctx, 2449 LLVMValueRef *values, 2450 unsigned target, 2451 struct ac_export_args *args) 2452{ 2453 LLVMValueRef f32undef = LLVMGetUndef(ctx->ac.f32); 2454 unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR; 2455 unsigned chan; 2456 bool is_int8, is_int10; 2457 2458 /* Default is 0xf. Adjusted below depending on the format. 
*/ 2459 args->enabled_channels = 0xf; /* writemask */ 2460 2461 /* Specify whether the EXEC mask represents the valid mask */ 2462 args->valid_mask = 0; 2463 2464 /* Specify whether this is the last export */ 2465 args->done = 0; 2466 2467 /* Specify the target we are exporting */ 2468 args->target = target; 2469 2470 if (ctx->type == PIPE_SHADER_FRAGMENT) { 2471 const struct si_shader_key *key = &ctx->shader->key; 2472 unsigned col_formats = key->part.ps.epilog.spi_shader_col_format; 2473 int cbuf = target - V_008DFC_SQ_EXP_MRT; 2474 2475 assert(cbuf >= 0 && cbuf < 8); 2476 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf; 2477 is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1; 2478 is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1; 2479 } 2480 2481 args->compr = false; 2482 args->out[0] = f32undef; 2483 args->out[1] = f32undef; 2484 args->out[2] = f32undef; 2485 args->out[3] = f32undef; 2486 2487 LLVMValueRef (*packf)(struct ac_llvm_context *ctx, LLVMValueRef args[2]) = NULL; 2488 LLVMValueRef (*packi)(struct ac_llvm_context *ctx, LLVMValueRef args[2], 2489 unsigned bits, bool hi) = NULL; 2490 2491 switch (spi_shader_col_format) { 2492 case V_028714_SPI_SHADER_ZERO: 2493 args->enabled_channels = 0; /* writemask */ 2494 args->target = V_008DFC_SQ_EXP_NULL; 2495 break; 2496 2497 case V_028714_SPI_SHADER_32_R: 2498 args->enabled_channels = 1; /* writemask */ 2499 args->out[0] = values[0]; 2500 break; 2501 2502 case V_028714_SPI_SHADER_32_GR: 2503 args->enabled_channels = 0x3; /* writemask */ 2504 args->out[0] = values[0]; 2505 args->out[1] = values[1]; 2506 break; 2507 2508 case V_028714_SPI_SHADER_32_AR: 2509 args->enabled_channels = 0x9; /* writemask */ 2510 args->out[0] = values[0]; 2511 args->out[3] = values[3]; 2512 break; 2513 2514 case V_028714_SPI_SHADER_FP16_ABGR: 2515 packf = ac_build_cvt_pkrtz_f16; 2516 break; 2517 2518 case V_028714_SPI_SHADER_UNORM16_ABGR: 2519 packf = ac_build_cvt_pknorm_u16; 2520 break; 2521 2522 case 
V_028714_SPI_SHADER_SNORM16_ABGR: 2523 packf = ac_build_cvt_pknorm_i16; 2524 break; 2525 2526 case V_028714_SPI_SHADER_UINT16_ABGR: 2527 packi = ac_build_cvt_pk_u16; 2528 break; 2529 2530 case V_028714_SPI_SHADER_SINT16_ABGR: 2531 packi = ac_build_cvt_pk_i16; 2532 break; 2533 2534 case V_028714_SPI_SHADER_32_ABGR: 2535 memcpy(&args->out[0], values, sizeof(values[0]) * 4); 2536 break; 2537 } 2538 2539 /* Pack f16 or norm_i16/u16. */ 2540 if (packf) { 2541 for (chan = 0; chan < 2; chan++) { 2542 LLVMValueRef pack_args[2] = { 2543 values[2 * chan], 2544 values[2 * chan + 1] 2545 }; 2546 LLVMValueRef packed; 2547 2548 packed = packf(&ctx->ac, pack_args); 2549 args->out[chan] = ac_to_float(&ctx->ac, packed); 2550 } 2551 args->compr = 1; /* COMPR flag */ 2552 } 2553 /* Pack i16/u16. */ 2554 if (packi) { 2555 for (chan = 0; chan < 2; chan++) { 2556 LLVMValueRef pack_args[2] = { 2557 ac_to_integer(&ctx->ac, values[2 * chan]), 2558 ac_to_integer(&ctx->ac, values[2 * chan + 1]) 2559 }; 2560 LLVMValueRef packed; 2561 2562 packed = packi(&ctx->ac, pack_args, 2563 is_int8 ? 8 : is_int10 ? 
10 : 16, 2564 chan == 1); 2565 args->out[chan] = ac_to_float(&ctx->ac, packed); 2566 } 2567 args->compr = 1; /* COMPR flag */ 2568 } 2569} 2570 2571static void si_alpha_test(struct lp_build_tgsi_context *bld_base, 2572 LLVMValueRef alpha) 2573{ 2574 struct si_shader_context *ctx = si_shader_context(bld_base); 2575 2576 if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) { 2577 static LLVMRealPredicate cond_map[PIPE_FUNC_ALWAYS + 1] = { 2578 [PIPE_FUNC_LESS] = LLVMRealOLT, 2579 [PIPE_FUNC_EQUAL] = LLVMRealOEQ, 2580 [PIPE_FUNC_LEQUAL] = LLVMRealOLE, 2581 [PIPE_FUNC_GREATER] = LLVMRealOGT, 2582 [PIPE_FUNC_NOTEQUAL] = LLVMRealONE, 2583 [PIPE_FUNC_GEQUAL] = LLVMRealOGE, 2584 }; 2585 LLVMRealPredicate cond = cond_map[ctx->shader->key.part.ps.epilog.alpha_func]; 2586 assert(cond); 2587 2588 LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn, 2589 SI_PARAM_ALPHA_REF); 2590 LLVMValueRef alpha_pass = 2591 LLVMBuildFCmp(ctx->ac.builder, cond, alpha, alpha_ref, ""); 2592 ac_build_kill_if_false(&ctx->ac, alpha_pass); 2593 } else { 2594 ac_build_kill_if_false(&ctx->ac, ctx->i1false); 2595 } 2596} 2597 2598static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base, 2599 LLVMValueRef alpha, 2600 unsigned samplemask_param) 2601{ 2602 struct si_shader_context *ctx = si_shader_context(bld_base); 2603 LLVMValueRef coverage; 2604 2605 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */ 2606 coverage = LLVMGetParam(ctx->main_fn, 2607 samplemask_param); 2608 coverage = ac_to_integer(&ctx->ac, coverage); 2609 2610 coverage = ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i32", 2611 ctx->i32, 2612 &coverage, 1, AC_FUNC_ATTR_READNONE); 2613 2614 coverage = LLVMBuildUIToFP(ctx->ac.builder, coverage, 2615 ctx->f32, ""); 2616 2617 coverage = LLVMBuildFMul(ctx->ac.builder, coverage, 2618 LLVMConstReal(ctx->f32, 2619 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), ""); 2620 2621 return LLVMBuildFMul(ctx->ac.builder, alpha, coverage, ""); 2622} 2623 
2624static void si_llvm_emit_clipvertex(struct si_shader_context *ctx, 2625 struct ac_export_args *pos, LLVMValueRef *out_elts) 2626{ 2627 unsigned reg_index; 2628 unsigned chan; 2629 unsigned const_chan; 2630 LLVMValueRef base_elt; 2631 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers); 2632 LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32, 2633 SI_VS_CONST_CLIP_PLANES, 0); 2634 LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index); 2635 2636 for (reg_index = 0; reg_index < 2; reg_index ++) { 2637 struct ac_export_args *args = &pos[2 + reg_index]; 2638 2639 args->out[0] = 2640 args->out[1] = 2641 args->out[2] = 2642 args->out[3] = LLVMConstReal(ctx->f32, 0.0f); 2643 2644 /* Compute dot products of position and user clip plane vectors */ 2645 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { 2646 for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) { 2647 LLVMValueRef addr = 2648 LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 + 2649 const_chan) * 4, 0); 2650 base_elt = buffer_load_const(ctx, const_resource, 2651 addr); 2652 args->out[chan] = ac_build_fmad(&ctx->ac, base_elt, 2653 out_elts[const_chan], args->out[chan]); 2654 } 2655 } 2656 2657 args->enabled_channels = 0xf; 2658 args->valid_mask = 0; 2659 args->done = 0; 2660 args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index; 2661 args->compr = 0; 2662 } 2663} 2664 2665static void si_dump_streamout(struct pipe_stream_output_info *so) 2666{ 2667 unsigned i; 2668 2669 if (so->num_outputs) 2670 fprintf(stderr, "STREAMOUT\n"); 2671 2672 for (i = 0; i < so->num_outputs; i++) { 2673 unsigned mask = ((1 << so->output[i].num_components) - 1) << 2674 so->output[i].start_component; 2675 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n", 2676 i, so->output[i].output_buffer, 2677 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1, 2678 so->output[i].register_index, 2679 mask & 1 ? "x" : "", 2680 mask & 2 ? 
"y" : "", 2681 mask & 4 ? "z" : "", 2682 mask & 8 ? "w" : ""); 2683 } 2684} 2685 2686static void emit_streamout_output(struct si_shader_context *ctx, 2687 LLVMValueRef const *so_buffers, 2688 LLVMValueRef const *so_write_offsets, 2689 struct pipe_stream_output *stream_out, 2690 struct si_shader_output_values *shader_out) 2691{ 2692 unsigned buf_idx = stream_out->output_buffer; 2693 unsigned start = stream_out->start_component; 2694 unsigned num_comps = stream_out->num_components; 2695 LLVMValueRef out[4]; 2696 2697 assert(num_comps && num_comps <= 4); 2698 if (!num_comps || num_comps > 4) 2699 return; 2700 2701 /* Load the output as int. */ 2702 for (int j = 0; j < num_comps; j++) { 2703 assert(stream_out->stream == shader_out->vertex_stream[start + j]); 2704 2705 out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]); 2706 } 2707 2708 /* Pack the output. */ 2709 LLVMValueRef vdata = NULL; 2710 2711 switch (num_comps) { 2712 case 1: /* as i32 */ 2713 vdata = out[0]; 2714 break; 2715 case 2: /* as v2i32 */ 2716 case 3: /* as v4i32 (aligned to 4) */ 2717 out[3] = LLVMGetUndef(ctx->i32); 2718 /* fall through */ 2719 case 4: /* as v4i32 */ 2720 vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps)); 2721 break; 2722 } 2723 2724 ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx], 2725 vdata, num_comps, 2726 so_write_offsets[buf_idx], 2727 ctx->i32_0, 2728 stream_out->dst_offset * 4, 1, 1, true, false); 2729} 2730 2731/** 2732 * Write streamout data to buffers for vertex stream @p stream (different 2733 * vertex streams can occur for GS copy shaders). 
2734 */ 2735static void si_llvm_emit_streamout(struct si_shader_context *ctx, 2736 struct si_shader_output_values *outputs, 2737 unsigned noutput, unsigned stream) 2738{ 2739 struct si_shader_selector *sel = ctx->shader->selector; 2740 struct pipe_stream_output_info *so = &sel->so; 2741 LLVMBuilderRef builder = ctx->ac.builder; 2742 int i; 2743 struct lp_build_if_state if_ctx; 2744 2745 /* Get bits [22:16], i.e. (so_param >> 16) & 127; */ 2746 LLVMValueRef so_vtx_count = 2747 si_unpack_param(ctx, ctx->param_streamout_config, 16, 7); 2748 2749 LLVMValueRef tid = ac_get_thread_id(&ctx->ac); 2750 2751 /* can_emit = tid < so_vtx_count; */ 2752 LLVMValueRef can_emit = 2753 LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, ""); 2754 2755 /* Emit the streamout code conditionally. This actually avoids 2756 * out-of-bounds buffer access. The hw tells us via the SGPR 2757 * (so_vtx_count) which threads are allowed to emit streamout data. */ 2758 lp_build_if(&if_ctx, &ctx->gallivm, can_emit); 2759 { 2760 /* The buffer offset is computed as follows: 2761 * ByteOffset = streamout_offset[buffer_id]*4 + 2762 * (streamout_write_index + thread_id)*stride[buffer_id] + 2763 * attrib_offset 2764 */ 2765 2766 LLVMValueRef so_write_index = 2767 LLVMGetParam(ctx->main_fn, 2768 ctx->param_streamout_write_index); 2769 2770 /* Compute (streamout_write_index + thread_id). */ 2771 so_write_index = LLVMBuildAdd(builder, so_write_index, tid, ""); 2772 2773 /* Load the descriptor and compute the write offset for each 2774 * enabled buffer. 
*/ 2775 LLVMValueRef so_write_offset[4] = {}; 2776 LLVMValueRef so_buffers[4]; 2777 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn, 2778 ctx->param_rw_buffers); 2779 2780 for (i = 0; i < 4; i++) { 2781 if (!so->stride[i]) 2782 continue; 2783 2784 LLVMValueRef offset = LLVMConstInt(ctx->i32, 2785 SI_VS_STREAMOUT_BUF0 + i, 0); 2786 2787 so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); 2788 2789 LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn, 2790 ctx->param_streamout_offset[i]); 2791 so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), ""); 2792 2793 so_write_offset[i] = ac_build_imad(&ctx->ac, so_write_index, 2794 LLVMConstInt(ctx->i32, so->stride[i]*4, 0), 2795 so_offset); 2796 } 2797 2798 /* Write streamout data. */ 2799 for (i = 0; i < so->num_outputs; i++) { 2800 unsigned reg = so->output[i].register_index; 2801 2802 if (reg >= noutput) 2803 continue; 2804 2805 if (stream != so->output[i].stream) 2806 continue; 2807 2808 emit_streamout_output(ctx, so_buffers, so_write_offset, 2809 &so->output[i], &outputs[reg]); 2810 } 2811 } 2812 lp_build_endif(&if_ctx); 2813} 2814 2815static void si_export_param(struct si_shader_context *ctx, unsigned index, 2816 LLVMValueRef *values) 2817{ 2818 struct ac_export_args args; 2819 2820 si_llvm_init_export_args(ctx, values, 2821 V_008DFC_SQ_EXP_PARAM + index, &args); 2822 ac_build_export(&ctx->ac, &args); 2823} 2824 2825static void si_build_param_exports(struct si_shader_context *ctx, 2826 struct si_shader_output_values *outputs, 2827 unsigned noutput) 2828{ 2829 struct si_shader *shader = ctx->shader; 2830 unsigned param_count = 0; 2831 2832 for (unsigned i = 0; i < noutput; i++) { 2833 unsigned semantic_name = outputs[i].semantic_name; 2834 unsigned semantic_index = outputs[i].semantic_index; 2835 2836 if (outputs[i].vertex_stream[0] != 0 && 2837 outputs[i].vertex_stream[1] != 0 && 2838 outputs[i].vertex_stream[2] != 0 && 2839 outputs[i].vertex_stream[3] != 0) 2840 continue; 2841 2842 
switch (semantic_name) { 2843 case TGSI_SEMANTIC_LAYER: 2844 case TGSI_SEMANTIC_VIEWPORT_INDEX: 2845 case TGSI_SEMANTIC_CLIPDIST: 2846 case TGSI_SEMANTIC_COLOR: 2847 case TGSI_SEMANTIC_BCOLOR: 2848 case TGSI_SEMANTIC_PRIMID: 2849 case TGSI_SEMANTIC_FOG: 2850 case TGSI_SEMANTIC_TEXCOORD: 2851 case TGSI_SEMANTIC_GENERIC: 2852 break; 2853 default: 2854 continue; 2855 } 2856 2857 if ((semantic_name != TGSI_SEMANTIC_GENERIC || 2858 semantic_index < SI_MAX_IO_GENERIC) && 2859 shader->key.opt.kill_outputs & 2860 (1ull << si_shader_io_get_unique_index(semantic_name, 2861 semantic_index, true))) 2862 continue; 2863 2864 si_export_param(ctx, param_count, outputs[i].values); 2865 2866 assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset)); 2867 shader->info.vs_output_param_offset[i] = param_count++; 2868 } 2869 2870 shader->info.nr_param_exports = param_count; 2871} 2872 2873/* Generate export instructions for hardware VS shader stage */ 2874static void si_llvm_export_vs(struct si_shader_context *ctx, 2875 struct si_shader_output_values *outputs, 2876 unsigned noutput) 2877{ 2878 struct si_shader *shader = ctx->shader; 2879 struct ac_export_args pos_args[4] = {}; 2880 LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL; 2881 unsigned pos_idx; 2882 int i; 2883 2884 /* Build position exports. 
*/ 2885 for (i = 0; i < noutput; i++) { 2886 switch (outputs[i].semantic_name) { 2887 case TGSI_SEMANTIC_POSITION: 2888 si_llvm_init_export_args(ctx, outputs[i].values, 2889 V_008DFC_SQ_EXP_POS, &pos_args[0]); 2890 break; 2891 case TGSI_SEMANTIC_PSIZE: 2892 psize_value = outputs[i].values[0]; 2893 break; 2894 case TGSI_SEMANTIC_LAYER: 2895 layer_value = outputs[i].values[0]; 2896 break; 2897 case TGSI_SEMANTIC_VIEWPORT_INDEX: 2898 viewport_index_value = outputs[i].values[0]; 2899 break; 2900 case TGSI_SEMANTIC_EDGEFLAG: 2901 edgeflag_value = outputs[i].values[0]; 2902 break; 2903 case TGSI_SEMANTIC_CLIPDIST: 2904 if (!shader->key.opt.clip_disable) { 2905 unsigned index = 2 + outputs[i].semantic_index; 2906 si_llvm_init_export_args(ctx, outputs[i].values, 2907 V_008DFC_SQ_EXP_POS + index, 2908 &pos_args[index]); 2909 } 2910 break; 2911 case TGSI_SEMANTIC_CLIPVERTEX: 2912 if (!shader->key.opt.clip_disable) { 2913 si_llvm_emit_clipvertex(ctx, pos_args, 2914 outputs[i].values); 2915 } 2916 break; 2917 } 2918 } 2919 2920 /* We need to add the position output manually if it's missing. */ 2921 if (!pos_args[0].out[0]) { 2922 pos_args[0].enabled_channels = 0xf; /* writemask */ 2923 pos_args[0].valid_mask = 0; /* EXEC mask */ 2924 pos_args[0].done = 0; /* last export? */ 2925 pos_args[0].target = V_008DFC_SQ_EXP_POS; 2926 pos_args[0].compr = 0; /* COMPR flag */ 2927 pos_args[0].out[0] = ctx->ac.f32_0; /* X */ 2928 pos_args[0].out[1] = ctx->ac.f32_0; /* Y */ 2929 pos_args[0].out[2] = ctx->ac.f32_0; /* Z */ 2930 pos_args[0].out[3] = ctx->ac.f32_1; /* W */ 2931 } 2932 2933 /* Write the misc vector (point size, edgeflag, layer, viewport). 
*/ 2934 if (shader->selector->info.writes_psize || 2935 shader->selector->info.writes_edgeflag || 2936 shader->selector->info.writes_viewport_index || 2937 shader->selector->info.writes_layer) { 2938 pos_args[1].enabled_channels = shader->selector->info.writes_psize | 2939 (shader->selector->info.writes_edgeflag << 1) | 2940 (shader->selector->info.writes_layer << 2); 2941 2942 pos_args[1].valid_mask = 0; /* EXEC mask */ 2943 pos_args[1].done = 0; /* last export? */ 2944 pos_args[1].target = V_008DFC_SQ_EXP_POS + 1; 2945 pos_args[1].compr = 0; /* COMPR flag */ 2946 pos_args[1].out[0] = ctx->ac.f32_0; /* X */ 2947 pos_args[1].out[1] = ctx->ac.f32_0; /* Y */ 2948 pos_args[1].out[2] = ctx->ac.f32_0; /* Z */ 2949 pos_args[1].out[3] = ctx->ac.f32_0; /* W */ 2950 2951 if (shader->selector->info.writes_psize) 2952 pos_args[1].out[0] = psize_value; 2953 2954 if (shader->selector->info.writes_edgeflag) { 2955 /* The output is a float, but the hw expects an integer 2956 * with the first bit containing the edge flag. */ 2957 edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder, 2958 edgeflag_value, 2959 ctx->i32, ""); 2960 edgeflag_value = ac_build_umin(&ctx->ac, 2961 edgeflag_value, 2962 ctx->i32_1); 2963 2964 /* The LLVM intrinsic expects a float. */ 2965 pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value); 2966 } 2967 2968 if (ctx->screen->info.chip_class >= GFX9) { 2969 /* GFX9 has the layer in out.z[10:0] and the viewport 2970 * index in out.z[19:16]. 
2971 */ 2972 if (shader->selector->info.writes_layer) 2973 pos_args[1].out[2] = layer_value; 2974 2975 if (shader->selector->info.writes_viewport_index) { 2976 LLVMValueRef v = viewport_index_value; 2977 2978 v = ac_to_integer(&ctx->ac, v); 2979 v = LLVMBuildShl(ctx->ac.builder, v, 2980 LLVMConstInt(ctx->i32, 16, 0), ""); 2981 v = LLVMBuildOr(ctx->ac.builder, v, 2982 ac_to_integer(&ctx->ac, pos_args[1].out[2]), ""); 2983 pos_args[1].out[2] = ac_to_float(&ctx->ac, v); 2984 pos_args[1].enabled_channels |= 1 << 2; 2985 } 2986 } else { 2987 if (shader->selector->info.writes_layer) 2988 pos_args[1].out[2] = layer_value; 2989 2990 if (shader->selector->info.writes_viewport_index) { 2991 pos_args[1].out[3] = viewport_index_value; 2992 pos_args[1].enabled_channels |= 1 << 3; 2993 } 2994 } 2995 } 2996 2997 for (i = 0; i < 4; i++) 2998 if (pos_args[i].out[0]) 2999 shader->info.nr_pos_exports++; 3000 3001 pos_idx = 0; 3002 for (i = 0; i < 4; i++) { 3003 if (!pos_args[i].out[0]) 3004 continue; 3005 3006 /* Specify the target we are exporting */ 3007 pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++; 3008 3009 if (pos_idx == shader->info.nr_pos_exports) 3010 /* Specify that this is the last export */ 3011 pos_args[i].done = 1; 3012 3013 ac_build_export(&ctx->ac, &pos_args[i]); 3014 } 3015 3016 /* Build parameter exports. */ 3017 si_build_param_exports(ctx, outputs, noutput); 3018} 3019 3020/** 3021 * Forward all outputs from the vertex shader to the TES. This is only used 3022 * for the fixed function TCS. 
3023 */ 3024static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base) 3025{ 3026 struct si_shader_context *ctx = si_shader_context(bld_base); 3027 LLVMValueRef invocation_id, buffer, buffer_offset; 3028 LLVMValueRef lds_vertex_stride, lds_base; 3029 uint64_t inputs; 3030 3031 invocation_id = unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 8, 5); 3032 buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS); 3033 buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); 3034 3035 lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx); 3036 lds_base = get_tcs_in_current_patch_offset(ctx); 3037 lds_base = ac_build_imad(&ctx->ac, invocation_id, lds_vertex_stride, 3038 lds_base); 3039 3040 inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy; 3041 while (inputs) { 3042 unsigned i = u_bit_scan64(&inputs); 3043 3044 LLVMValueRef lds_ptr = LLVMBuildAdd(ctx->ac.builder, lds_base, 3045 LLVMConstInt(ctx->i32, 4 * i, 0), 3046 ""); 3047 3048 LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx, 3049 get_rel_patch_id(ctx), 3050 invocation_id, 3051 LLVMConstInt(ctx->i32, i, 0)); 3052 3053 LLVMValueRef value = lds_load(bld_base, ctx->ac.i32, ~0, 3054 lds_ptr); 3055 3056 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr, 3057 buffer_offset, 0, 1, 0, true, false); 3058 } 3059} 3060 3061static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, 3062 LLVMValueRef rel_patch_id, 3063 LLVMValueRef invocation_id, 3064 LLVMValueRef tcs_out_current_patch_data_offset, 3065 LLVMValueRef invoc0_tf_outer[4], 3066 LLVMValueRef invoc0_tf_inner[2]) 3067{ 3068 struct si_shader_context *ctx = si_shader_context(bld_base); 3069 struct si_shader *shader = ctx->shader; 3070 unsigned tess_inner_index, tess_outer_index; 3071 LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer; 3072 LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4]; 3073 unsigned stride, outer_comps, inner_comps, i, offset; 3074 struct 
lp_build_if_state if_ctx, inner_if_ctx; 3075 3076 /* Add a barrier before loading tess factors from LDS. */ 3077 if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) 3078 si_llvm_emit_barrier(NULL, bld_base, NULL); 3079 3080 /* Do this only for invocation 0, because the tess levels are per-patch, 3081 * not per-vertex. 3082 * 3083 * This can't jump, because invocation 0 executes this. It should 3084 * at least mask out the loads and stores for other invocations. 3085 */ 3086 lp_build_if(&if_ctx, &ctx->gallivm, 3087 LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, 3088 invocation_id, ctx->i32_0, "")); 3089 3090 /* Determine the layout of one tess factor element in the buffer. */ 3091 switch (shader->key.part.tcs.epilog.prim_mode) { 3092 case PIPE_PRIM_LINES: 3093 stride = 2; /* 2 dwords, 1 vec2 store */ 3094 outer_comps = 2; 3095 inner_comps = 0; 3096 break; 3097 case PIPE_PRIM_TRIANGLES: 3098 stride = 4; /* 4 dwords, 1 vec4 store */ 3099 outer_comps = 3; 3100 inner_comps = 1; 3101 break; 3102 case PIPE_PRIM_QUADS: 3103 stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */ 3104 outer_comps = 4; 3105 inner_comps = 2; 3106 break; 3107 default: 3108 assert(0); 3109 return; 3110 } 3111 3112 for (i = 0; i < 4; i++) { 3113 inner[i] = LLVMGetUndef(ctx->i32); 3114 outer[i] = LLVMGetUndef(ctx->i32); 3115 } 3116 3117 if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) { 3118 /* Tess factors are in VGPRs. */ 3119 for (i = 0; i < outer_comps; i++) 3120 outer[i] = out[i] = invoc0_tf_outer[i]; 3121 for (i = 0; i < inner_comps; i++) 3122 inner[i] = out[outer_comps+i] = invoc0_tf_inner[i]; 3123 } else { 3124 /* Load tess_inner and tess_outer from LDS. 3125 * Any invocation can write them, so we can't get them from a temporary. 
3126 */ 3127 tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0); 3128 tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0); 3129 3130 lds_base = tcs_out_current_patch_data_offset; 3131 lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base, 3132 LLVMConstInt(ctx->i32, 3133 tess_inner_index * 4, 0), ""); 3134 lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base, 3135 LLVMConstInt(ctx->i32, 3136 tess_outer_index * 4, 0), ""); 3137 3138 for (i = 0; i < outer_comps; i++) { 3139 outer[i] = out[i] = 3140 lds_load(bld_base, ctx->ac.i32, i, lds_outer); 3141 } 3142 for (i = 0; i < inner_comps; i++) { 3143 inner[i] = out[outer_comps+i] = 3144 lds_load(bld_base, ctx->ac.i32, i, lds_inner); 3145 } 3146 } 3147 3148 if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) { 3149 /* For isolines, the hardware expects tess factors in the 3150 * reverse order from what GLSL / TGSI specify. 3151 */ 3152 LLVMValueRef tmp = out[0]; 3153 out[0] = out[1]; 3154 out[1] = tmp; 3155 } 3156 3157 /* Convert the outputs to vectors for stores. */ 3158 vec0 = ac_build_gather_values(&ctx->ac, out, MIN2(stride, 4)); 3159 vec1 = NULL; 3160 3161 if (stride > 4) 3162 vec1 = ac_build_gather_values(&ctx->ac, out+4, stride - 4); 3163 3164 /* Get the buffer. */ 3165 buffer = get_tess_ring_descriptor(ctx, TCS_FACTOR_RING); 3166 3167 /* Get the offset. */ 3168 tf_base = LLVMGetParam(ctx->main_fn, 3169 ctx->param_tcs_factor_offset); 3170 byteoffset = LLVMBuildMul(ctx->ac.builder, rel_patch_id, 3171 LLVMConstInt(ctx->i32, 4 * stride, 0), ""); 3172 3173 lp_build_if(&inner_if_ctx, &ctx->gallivm, 3174 LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, 3175 rel_patch_id, ctx->i32_0, "")); 3176 3177 /* Store the dynamic HS control word. 
*/ 3178 offset = 0; 3179 if (ctx->screen->info.chip_class <= VI) { 3180 ac_build_buffer_store_dword(&ctx->ac, buffer, 3181 LLVMConstInt(ctx->i32, 0x80000000, 0), 3182 1, ctx->i32_0, tf_base, 3183 offset, 1, 0, true, false); 3184 offset += 4; 3185 } 3186 3187 lp_build_endif(&inner_if_ctx); 3188 3189 /* Store the tessellation factors. */ 3190 ac_build_buffer_store_dword(&ctx->ac, buffer, vec0, 3191 MIN2(stride, 4), byteoffset, tf_base, 3192 offset, 1, 0, true, false); 3193 offset += 16; 3194 if (vec1) 3195 ac_build_buffer_store_dword(&ctx->ac, buffer, vec1, 3196 stride - 4, byteoffset, tf_base, 3197 offset, 1, 0, true, false); 3198 3199 /* Store the tess factors into the offchip buffer if TES reads them. */ 3200 if (shader->key.part.tcs.epilog.tes_reads_tess_factors) { 3201 LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset; 3202 LLVMValueRef tf_inner_offset; 3203 unsigned param_outer, param_inner; 3204 3205 buf = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS); 3206 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); 3207 3208 param_outer = si_shader_io_get_unique_index_patch( 3209 TGSI_SEMANTIC_TESSOUTER, 0); 3210 tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL, 3211 LLVMConstInt(ctx->i32, param_outer, 0)); 3212 3213 outer_vec = ac_build_gather_values(&ctx->ac, outer, 3214 util_next_power_of_two(outer_comps)); 3215 3216 ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec, 3217 outer_comps, tf_outer_offset, 3218 base, 0, 1, 0, true, false); 3219 if (inner_comps) { 3220 param_inner = si_shader_io_get_unique_index_patch( 3221 TGSI_SEMANTIC_TESSINNER, 0); 3222 tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL, 3223 LLVMConstInt(ctx->i32, param_inner, 0)); 3224 3225 inner_vec = inner_comps == 1 ? 
inner[0] : 3226 ac_build_gather_values(&ctx->ac, inner, inner_comps); 3227 ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec, 3228 inner_comps, tf_inner_offset, 3229 base, 0, 1, 0, true, false); 3230 } 3231 } 3232 3233 lp_build_endif(&if_ctx); 3234} 3235 3236static LLVMValueRef 3237si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret, 3238 unsigned param, unsigned return_index) 3239{ 3240 return LLVMBuildInsertValue(ctx->ac.builder, ret, 3241 LLVMGetParam(ctx->main_fn, param), 3242 return_index, ""); 3243} 3244 3245static LLVMValueRef 3246si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret, 3247 unsigned param, unsigned return_index) 3248{ 3249 LLVMBuilderRef builder = ctx->ac.builder; 3250 LLVMValueRef p = LLVMGetParam(ctx->main_fn, param); 3251 3252 return LLVMBuildInsertValue(builder, ret, 3253 ac_to_float(&ctx->ac, p), 3254 return_index, ""); 3255} 3256 3257static LLVMValueRef 3258si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret, 3259 unsigned param, unsigned return_index) 3260{ 3261 LLVMBuilderRef builder = ctx->ac.builder; 3262 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, param); 3263 ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i32, ""); 3264 return LLVMBuildInsertValue(builder, ret, ptr, return_index, ""); 3265} 3266 3267/* This only writes the tessellation factor levels. 
*/
static void si_llvm_emit_tcs_epilogue(struct ac_shader_abi *abi,
				      unsigned max_outputs,
				      LLVMValueRef *addrs)
{
	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
	LLVMBuilderRef builder = ctx->ac.builder;
	const bool is_gfx9_plus = ctx->screen->info.chip_class >= GFX9;

	si_copy_tcs_inputs(&ctx->bld_base);

	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
	LLVMValueRef invocation_id =
		unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 8, 5);
	LLVMValueRef tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);

	if (is_gfx9_plus) {
		/* Capture the current block before the wrapping IF is
		 * closed; the PHIs below merge it with the IF's entry block.
		 */
		LLVMBasicBlockRef blocks[2] = {
			LLVMGetInsertBlock(builder),
			ctx->merged_wrap_if_state.entry_block
		};

		lp_build_endif(&ctx->merged_wrap_if_state);

		/* Value each variable takes when coming from the entry
		 * block: undef for the first two, and 1 for invocation_id
		 * so that the epilog skips those threads.
		 */
		LLVMValueRef *merged[3] = {
			&rel_patch_id, &tf_lds_offset, &invocation_id
		};
		LLVMValueRef from_entry[3] = {
			LLVMGetUndef(ctx->i32),
			LLVMGetUndef(ctx->i32),
			ctx->i32_1
		};

		for (unsigned k = 0; k < 3; k++) {
			LLVMValueRef incoming[2] = { *merged[k], from_entry[k] };

			*merged[k] = ac_build_phi(&ctx->ac, ctx->i32, 2,
						  incoming, blocks);
		}
	}

	/* Return epilog parameters from this function. */
	LLVMValueRef ret = ctx->return_value;
	unsigned vgpr;

	if (is_gfx9_plus) {
		ret = si_insert_input_ret(ctx, ret,
					  ctx->param_tcs_offchip_layout,
					  8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
		ret = si_insert_input_ret(ctx, ret,
					  ctx->param_tcs_out_lds_layout,
					  8 + GFX9_SGPR_TCS_OUT_LAYOUT);
		/* Tess offchip and tess factor offsets are at the beginning. */
		ret = si_insert_input_ret(ctx, ret,
					  ctx->param_tcs_offchip_offset, 2);
		ret = si_insert_input_ret(ctx, ret,
					  ctx->param_tcs_factor_offset, 4);
		vgpr = 8 + GFX9_SGPR_TCS_OUT_LAYOUT + 1;
	} else {
		ret = si_insert_input_ret(ctx, ret,
					  ctx->param_tcs_offchip_layout,
					  GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
		ret = si_insert_input_ret(ctx, ret,
					  ctx->param_tcs_out_lds_layout,
					  GFX6_SGPR_TCS_OUT_LAYOUT);
		/* Tess offchip and tess factor offsets are after user SGPRs. */
		ret = si_insert_input_ret(ctx, ret,
					  ctx->param_tcs_offchip_offset,
					  GFX6_TCS_NUM_USER_SGPR);
		ret = si_insert_input_ret(ctx, ret,
					  ctx->param_tcs_factor_offset,
					  GFX6_TCS_NUM_USER_SGPR + 1);
		vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
	}

	/* VGPRs: the return slots take floats. */
	LLVMValueRef rel_patch_id_f = ac_to_float(&ctx->ac, rel_patch_id);
	LLVMValueRef invocation_id_f = ac_to_float(&ctx->ac, invocation_id);
	LLVMValueRef tf_lds_offset_f = ac_to_float(&ctx->ac, tf_lds_offset);

	/* Leave a hole corresponding to the two input VGPRs. This ensures that
	 * the invocation_id output does not alias the tcs_rel_ids input,
	 * which saves a V_MOV on gfx9.
	 */
	vgpr += 2;

	ret = LLVMBuildInsertValue(builder, ret, rel_patch_id_f, vgpr++, "");
	ret = LLVMBuildInsertValue(builder, ret, invocation_id_f, vgpr++, "");

	if (!ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) {
		ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset_f,
					   vgpr++, "");
	} else {
		vgpr++; /* skip the tess factor LDS offset */

		/* Return the six tess factor values directly. */
		for (unsigned i = 0; i < 6; i++) {
			LLVMValueRef factor =
				LLVMBuildLoad(builder,
					      ctx->invoc0_tess_factors[i], "");

			factor = ac_to_float(&ctx->ac, factor);
			ret = LLVMBuildInsertValue(builder, ret, factor,
						   vgpr++, "");
		}
	}

	ctx->return_value = ret;
}

/* Pass TCS inputs from LS to TCS on GFX9.
*/
static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
{
	/* SGPR forwarding table, applied in order: which main-function
	 * parameter goes into which return slot, and whether it is a
	 * pointer (converted with si_insert_input_ptr) or a plain value.
	 */
	const struct {
		unsigned param;
		unsigned slot;
		bool is_ptr;
	} sgprs[] = {
		{ 0, 0, true },
		{ 1, 1, true },
		{ ctx->param_tcs_offchip_offset, 2, false },
		{ ctx->param_merged_wave_info, 3, false },
		{ ctx->param_tcs_factor_offset, 4, false },
		{ ctx->param_merged_scratch_offset, 5, false },

		{ ctx->param_rw_buffers,
		  8 + SI_SGPR_RW_BUFFERS, true },
		{ ctx->param_bindless_samplers_and_images,
		  8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES, true },

		{ ctx->param_vs_state_bits,
		  8 + SI_SGPR_VS_STATE_BITS, false },

		{ ctx->param_tcs_offchip_layout,
		  8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT, false },
		{ ctx->param_tcs_out_lds_offsets,
		  8 + GFX9_SGPR_TCS_OUT_OFFSETS, false },
		{ ctx->param_tcs_out_lds_layout,
		  8 + GFX9_SGPR_TCS_OUT_LAYOUT, false },
	};
	LLVMBuilderRef builder = ctx->ac.builder;
	LLVMValueRef ret = ctx->return_value;

	for (unsigned s = 0; s < sizeof(sgprs) / sizeof(sgprs[0]); s++) {
		if (sgprs[s].is_ptr)
			ret = si_insert_input_ptr(ctx, ret, sgprs[s].param,
						  sgprs[s].slot);
		else
			ret = si_insert_input_ret(ctx, ret, sgprs[s].param,
						  sgprs[s].slot);
	}

	/* VGPRs follow the user SGPRs: patch ID first, then the rel IDs. */
	unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
	LLVMValueRef patch_id = ac_to_float(&ctx->ac, ctx->abi.tcs_patch_id);

	ret = LLVMBuildInsertValue(builder, ret, patch_id, vgpr++, "");

	LLVMValueRef rel_ids = ac_to_float(&ctx->ac, ctx->abi.tcs_rel_ids);

	ret = LLVMBuildInsertValue(builder, ret, rel_ids, vgpr++, "");

	ctx->return_value = ret;
}

/* Pass GS inputs from ES to GS on GFX9.
*/ 3398static void si_set_es_return_value_for_gs(struct si_shader_context *ctx) 3399{ 3400 LLVMValueRef ret = ctx->return_value; 3401 3402 ret = si_insert_input_ptr(ctx, ret, 0, 0); 3403 ret = si_insert_input_ptr(ctx, ret, 1, 1); 3404 ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2); 3405 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3); 3406 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5); 3407 3408 ret = si_insert_input_ptr(ctx, ret, ctx->param_rw_buffers, 3409 8 + SI_SGPR_RW_BUFFERS); 3410 ret = si_insert_input_ptr(ctx, ret, 3411 ctx->param_bindless_samplers_and_images, 3412 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); 3413 3414 unsigned vgpr; 3415 if (ctx->type == PIPE_SHADER_VERTEX) 3416 vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR; 3417 else 3418 vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR; 3419 3420 for (unsigned i = 0; i < 5; i++) { 3421 unsigned param = ctx->param_gs_vtx01_offset + i; 3422 ret = si_insert_input_ret_float(ctx, ret, param, vgpr++); 3423 } 3424 ctx->return_value = ret; 3425} 3426 3427static void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, 3428 unsigned max_outputs, 3429 LLVMValueRef *addrs) 3430{ 3431 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 3432 struct si_shader *shader = ctx->shader; 3433 struct tgsi_shader_info *info = &shader->selector->info; 3434 unsigned i, chan; 3435 LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn, 3436 ctx->param_rel_auto_id); 3437 LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx); 3438 LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id, 3439 vertex_dw_stride, ""); 3440 3441 /* Write outputs to LDS. The next shader (TCS aka HS) will read 3442 * its inputs from it. 
*/ 3443 for (i = 0; i < info->num_outputs; i++) { 3444 unsigned name = info->output_semantic_name[i]; 3445 unsigned index = info->output_semantic_index[i]; 3446 3447 /* The ARB_shader_viewport_layer_array spec contains the 3448 * following issue: 3449 * 3450 * 2) What happens if gl_ViewportIndex or gl_Layer is 3451 * written in the vertex shader and a geometry shader is 3452 * present? 3453 * 3454 * RESOLVED: The value written by the last vertex processing 3455 * stage is used. If the last vertex processing stage 3456 * (vertex, tessellation evaluation or geometry) does not 3457 * statically assign to gl_ViewportIndex or gl_Layer, index 3458 * or layer zero is assumed. 3459 * 3460 * So writes to those outputs in VS-as-LS are simply ignored. 3461 */ 3462 if (name == TGSI_SEMANTIC_LAYER || 3463 name == TGSI_SEMANTIC_VIEWPORT_INDEX) 3464 continue; 3465 3466 int param = si_shader_io_get_unique_index(name, index, false); 3467 LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr, 3468 LLVMConstInt(ctx->i32, param * 4, 0), ""); 3469 3470 for (chan = 0; chan < 4; chan++) { 3471 if (!(info->output_usagemask[i] & (1 << chan))) 3472 continue; 3473 3474 lds_store(ctx, chan, dw_addr, 3475 LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "")); 3476 } 3477 } 3478 3479 if (ctx->screen->info.chip_class >= GFX9) 3480 si_set_ls_return_value_for_tcs(ctx); 3481} 3482 3483static void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, 3484 unsigned max_outputs, 3485 LLVMValueRef *addrs) 3486{ 3487 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 3488 struct si_shader *es = ctx->shader; 3489 struct tgsi_shader_info *info = &es->selector->info; 3490 LLVMValueRef soffset = LLVMGetParam(ctx->main_fn, 3491 ctx->param_es2gs_offset); 3492 LLVMValueRef lds_base = NULL; 3493 unsigned chan; 3494 int i; 3495 3496 if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) { 3497 unsigned itemsize_dw = es->selector->esgs_itemsize / 4; 3498 LLVMValueRef vertex_idx 
= ac_get_thread_id(&ctx->ac); 3499 LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->param_merged_wave_info, 24, 4); 3500 vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx, 3501 LLVMBuildMul(ctx->ac.builder, wave_idx, 3502 LLVMConstInt(ctx->i32, 64, false), ""), ""); 3503 lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx, 3504 LLVMConstInt(ctx->i32, itemsize_dw, 0), ""); 3505 } 3506 3507 for (i = 0; i < info->num_outputs; i++) { 3508 int param; 3509 3510 if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX || 3511 info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER) 3512 continue; 3513 3514 param = si_shader_io_get_unique_index(info->output_semantic_name[i], 3515 info->output_semantic_index[i], false); 3516 3517 for (chan = 0; chan < 4; chan++) { 3518 if (!(info->output_usagemask[i] & (1 << chan))) 3519 continue; 3520 3521 LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); 3522 out_val = ac_to_integer(&ctx->ac, out_val); 3523 3524 /* GFX9 has the ESGS ring in LDS. 
*/ 3525 if (ctx->screen->info.chip_class >= GFX9) { 3526 lds_store(ctx, param * 4 + chan, lds_base, out_val); 3527 continue; 3528 } 3529 3530 ac_build_buffer_store_dword(&ctx->ac, 3531 ctx->esgs_ring, 3532 out_val, 1, NULL, soffset, 3533 (4 * param + chan) * 4, 3534 1, 1, true, true); 3535 } 3536 } 3537 3538 if (ctx->screen->info.chip_class >= GFX9) 3539 si_set_es_return_value_for_gs(ctx); 3540} 3541 3542static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx) 3543{ 3544 if (ctx->screen->info.chip_class >= GFX9) 3545 return si_unpack_param(ctx, ctx->param_merged_wave_info, 16, 8); 3546 else 3547 return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id); 3548} 3549 3550static void emit_gs_epilogue(struct si_shader_context *ctx) 3551{ 3552 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, 3553 si_get_gs_wave_id(ctx)); 3554 3555 if (ctx->screen->info.chip_class >= GFX9) 3556 lp_build_endif(&ctx->merged_wrap_if_state); 3557} 3558 3559static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi, 3560 unsigned max_outputs, 3561 LLVMValueRef *addrs) 3562{ 3563 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 3564 struct tgsi_shader_info UNUSED *info = &ctx->shader->selector->info; 3565 3566 assert(info->num_outputs <= max_outputs); 3567 3568 emit_gs_epilogue(ctx); 3569} 3570 3571static void si_tgsi_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base) 3572{ 3573 struct si_shader_context *ctx = si_shader_context(bld_base); 3574 emit_gs_epilogue(ctx); 3575} 3576 3577static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, 3578 unsigned max_outputs, 3579 LLVMValueRef *addrs) 3580{ 3581 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 3582 struct tgsi_shader_info *info = &ctx->shader->selector->info; 3583 struct si_shader_output_values *outputs = NULL; 3584 int i,j; 3585 3586 assert(!ctx->shader->is_gs_copy_shader); 3587 assert(info->num_outputs <= max_outputs); 3588 3589 outputs = 
MALLOC((info->num_outputs + 1) * sizeof(outputs[0])); 3590 3591 /* Vertex color clamping. 3592 * 3593 * This uses a state constant loaded in a user data SGPR and 3594 * an IF statement is added that clamps all colors if the constant 3595 * is true. 3596 */ 3597 struct lp_build_if_state if_ctx; 3598 LLVMValueRef cond = NULL; 3599 LLVMValueRef addr, val; 3600 3601 for (i = 0; i < info->num_outputs; i++) { 3602 if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR && 3603 info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR) 3604 continue; 3605 3606 /* We've found a color. */ 3607 if (!cond) { 3608 /* The state is in the first bit of the user SGPR. */ 3609 cond = LLVMGetParam(ctx->main_fn, 3610 ctx->param_vs_state_bits); 3611 cond = LLVMBuildTrunc(ctx->ac.builder, cond, 3612 ctx->i1, ""); 3613 lp_build_if(&if_ctx, &ctx->gallivm, cond); 3614 } 3615 3616 for (j = 0; j < 4; j++) { 3617 addr = addrs[4 * i + j]; 3618 val = LLVMBuildLoad(ctx->ac.builder, addr, ""); 3619 val = ac_build_clamp(&ctx->ac, val); 3620 LLVMBuildStore(ctx->ac.builder, val, addr); 3621 } 3622 } 3623 3624 if (cond) 3625 lp_build_endif(&if_ctx); 3626 3627 for (i = 0; i < info->num_outputs; i++) { 3628 outputs[i].semantic_name = info->output_semantic_name[i]; 3629 outputs[i].semantic_index = info->output_semantic_index[i]; 3630 3631 for (j = 0; j < 4; j++) { 3632 outputs[i].values[j] = 3633 LLVMBuildLoad(ctx->ac.builder, 3634 addrs[4 * i + j], 3635 ""); 3636 outputs[i].vertex_stream[j] = 3637 (info->output_streams[i] >> (2 * j)) & 3; 3638 } 3639 } 3640 3641 if (ctx->shader->selector->so.num_outputs) 3642 si_llvm_emit_streamout(ctx, outputs, i, 0); 3643 3644 /* Export PrimitiveID. 
*/ 3645 if (ctx->shader->key.mono.u.vs_export_prim_id) { 3646 outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID; 3647 outputs[i].semantic_index = 0; 3648 outputs[i].values[0] = ac_to_float(&ctx->ac, get_primitive_id(ctx, 0)); 3649 for (j = 1; j < 4; j++) 3650 outputs[i].values[j] = LLVMConstReal(ctx->f32, 0); 3651 3652 memset(outputs[i].vertex_stream, 0, 3653 sizeof(outputs[i].vertex_stream)); 3654 i++; 3655 } 3656 3657 si_llvm_export_vs(ctx, outputs, i); 3658 FREE(outputs); 3659} 3660 3661static void si_tgsi_emit_epilogue(struct lp_build_tgsi_context *bld_base) 3662{ 3663 struct si_shader_context *ctx = si_shader_context(bld_base); 3664 3665 ctx->abi.emit_outputs(&ctx->abi, RADEON_LLVM_MAX_OUTPUTS, 3666 &ctx->outputs[0][0]); 3667} 3668 3669struct si_ps_exports { 3670 unsigned num; 3671 struct ac_export_args args[10]; 3672}; 3673 3674static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base, 3675 LLVMValueRef depth, LLVMValueRef stencil, 3676 LLVMValueRef samplemask, struct si_ps_exports *exp) 3677{ 3678 struct si_shader_context *ctx = si_shader_context(bld_base); 3679 struct ac_export_args args; 3680 3681 ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args); 3682 3683 memcpy(&exp->args[exp->num++], &args, sizeof(args)); 3684} 3685 3686static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base, 3687 LLVMValueRef *color, unsigned index, 3688 unsigned samplemask_param, 3689 bool is_last, struct si_ps_exports *exp) 3690{ 3691 struct si_shader_context *ctx = si_shader_context(bld_base); 3692 int i; 3693 3694 /* Clamp color */ 3695 if (ctx->shader->key.part.ps.epilog.clamp_color) 3696 for (i = 0; i < 4; i++) 3697 color[i] = ac_build_clamp(&ctx->ac, color[i]); 3698 3699 /* Alpha to one */ 3700 if (ctx->shader->key.part.ps.epilog.alpha_to_one) 3701 color[3] = ctx->ac.f32_1; 3702 3703 /* Alpha test */ 3704 if (index == 0 && 3705 ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS) 3706 si_alpha_test(bld_base, color[3]); 3707 3708 /* 
Line & polygon smoothing */ 3709 if (ctx->shader->key.part.ps.epilog.poly_line_smoothing) 3710 color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3], 3711 samplemask_param); 3712 3713 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ 3714 if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) { 3715 struct ac_export_args args[8]; 3716 int c, last = -1; 3717 3718 /* Get the export arguments, also find out what the last one is. */ 3719 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) { 3720 si_llvm_init_export_args(ctx, color, 3721 V_008DFC_SQ_EXP_MRT + c, &args[c]); 3722 if (args[c].enabled_channels) 3723 last = c; 3724 } 3725 3726 /* Emit all exports. */ 3727 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) { 3728 if (is_last && last == c) { 3729 args[c].valid_mask = 1; /* whether the EXEC mask is valid */ 3730 args[c].done = 1; /* DONE bit */ 3731 } else if (!args[c].enabled_channels) 3732 continue; /* unnecessary NULL export */ 3733 3734 memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c])); 3735 } 3736 } else { 3737 struct ac_export_args args; 3738 3739 /* Export */ 3740 si_llvm_init_export_args(ctx, color, V_008DFC_SQ_EXP_MRT + index, 3741 &args); 3742 if (is_last) { 3743 args.valid_mask = 1; /* whether the EXEC mask is valid */ 3744 args.done = 1; /* DONE bit */ 3745 } else if (!args.enabled_channels) 3746 return; /* unnecessary NULL export */ 3747 3748 memcpy(&exp->args[exp->num++], &args, sizeof(args)); 3749 } 3750} 3751 3752static void si_emit_ps_exports(struct si_shader_context *ctx, 3753 struct si_ps_exports *exp) 3754{ 3755 for (unsigned i = 0; i < exp->num; i++) 3756 ac_build_export(&ctx->ac, &exp->args[i]); 3757} 3758 3759/** 3760 * Return PS outputs in this order: 3761 * 3762 * v[0:3] = color0.xyzw 3763 * v[4:7] = color1.xyzw 3764 * ... 
3765 * vN+0 = Depth 3766 * vN+1 = Stencil 3767 * vN+2 = SampleMask 3768 * vN+3 = SampleMaskIn (used for OpenGL smoothing) 3769 * 3770 * The alpha-ref SGPR is returned via its original location. 3771 */ 3772static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi, 3773 unsigned max_outputs, 3774 LLVMValueRef *addrs) 3775{ 3776 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 3777 struct si_shader *shader = ctx->shader; 3778 struct tgsi_shader_info *info = &shader->selector->info; 3779 LLVMBuilderRef builder = ctx->ac.builder; 3780 unsigned i, j, first_vgpr, vgpr; 3781 3782 LLVMValueRef color[8][4] = {}; 3783 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; 3784 LLVMValueRef ret; 3785 3786 if (ctx->postponed_kill) 3787 ac_build_kill_if_false(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, "")); 3788 3789 /* Read the output values. */ 3790 for (i = 0; i < info->num_outputs; i++) { 3791 unsigned semantic_name = info->output_semantic_name[i]; 3792 unsigned semantic_index = info->output_semantic_index[i]; 3793 3794 switch (semantic_name) { 3795 case TGSI_SEMANTIC_COLOR: 3796 assert(semantic_index < 8); 3797 for (j = 0; j < 4; j++) { 3798 LLVMValueRef ptr = addrs[4 * i + j]; 3799 LLVMValueRef result = LLVMBuildLoad(builder, ptr, ""); 3800 color[semantic_index][j] = result; 3801 } 3802 break; 3803 case TGSI_SEMANTIC_POSITION: 3804 depth = LLVMBuildLoad(builder, 3805 addrs[4 * i + 2], ""); 3806 break; 3807 case TGSI_SEMANTIC_STENCIL: 3808 stencil = LLVMBuildLoad(builder, 3809 addrs[4 * i + 1], ""); 3810 break; 3811 case TGSI_SEMANTIC_SAMPLEMASK: 3812 samplemask = LLVMBuildLoad(builder, 3813 addrs[4 * i + 0], ""); 3814 break; 3815 default: 3816 fprintf(stderr, "Warning: SI unhandled fs output type:%d\n", 3817 semantic_name); 3818 } 3819 } 3820 3821 /* Fill the return structure. */ 3822 ret = ctx->return_value; 3823 3824 /* Set SGPRs. 
*/ 3825 ret = LLVMBuildInsertValue(builder, ret, 3826 ac_to_integer(&ctx->ac, 3827 LLVMGetParam(ctx->main_fn, 3828 SI_PARAM_ALPHA_REF)), 3829 SI_SGPR_ALPHA_REF, ""); 3830 3831 /* Set VGPRs */ 3832 first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1; 3833 for (i = 0; i < ARRAY_SIZE(color); i++) { 3834 if (!color[i][0]) 3835 continue; 3836 3837 for (j = 0; j < 4; j++) 3838 ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, ""); 3839 } 3840 if (depth) 3841 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, ""); 3842 if (stencil) 3843 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, ""); 3844 if (samplemask) 3845 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, ""); 3846 3847 /* Add the input sample mask for smoothing at the end. */ 3848 if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC) 3849 vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC; 3850 ret = LLVMBuildInsertValue(builder, ret, 3851 LLVMGetParam(ctx->main_fn, 3852 SI_PARAM_SAMPLE_COVERAGE), vgpr++, ""); 3853 3854 ctx->return_value = ret; 3855} 3856 3857static void membar_emit( 3858 const struct lp_build_tgsi_action *action, 3859 struct lp_build_tgsi_context *bld_base, 3860 struct lp_build_emit_data *emit_data) 3861{ 3862 struct si_shader_context *ctx = si_shader_context(bld_base); 3863 LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0); 3864 unsigned flags = LLVMConstIntGetZExtValue(src0); 3865 unsigned waitcnt = NOOP_WAITCNT; 3866 3867 if (flags & TGSI_MEMBAR_THREAD_GROUP) 3868 waitcnt &= VM_CNT & LGKM_CNT; 3869 3870 if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER | 3871 TGSI_MEMBAR_SHADER_BUFFER | 3872 TGSI_MEMBAR_SHADER_IMAGE)) 3873 waitcnt &= VM_CNT; 3874 3875 if (flags & TGSI_MEMBAR_SHARED) 3876 waitcnt &= LGKM_CNT; 3877 3878 if (waitcnt != NOOP_WAITCNT) 3879 ac_build_waitcnt(&ctx->ac, waitcnt); 3880} 3881 3882static void clock_emit( 3883 const struct lp_build_tgsi_action *action, 3884 struct lp_build_tgsi_context *bld_base, 3885 struct lp_build_emit_data 
*emit_data) 3886{ 3887 struct si_shader_context *ctx = si_shader_context(bld_base); 3888 LLVMValueRef tmp = ac_build_shader_clock(&ctx->ac); 3889 3890 emit_data->output[0] = 3891 LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->i32_0, ""); 3892 emit_data->output[1] = 3893 LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->i32_1, ""); 3894} 3895 3896static void si_llvm_emit_ddxy( 3897 const struct lp_build_tgsi_action *action, 3898 struct lp_build_tgsi_context *bld_base, 3899 struct lp_build_emit_data *emit_data) 3900{ 3901 struct si_shader_context *ctx = si_shader_context(bld_base); 3902 unsigned opcode = emit_data->info->opcode; 3903 LLVMValueRef val; 3904 int idx; 3905 unsigned mask; 3906 3907 if (opcode == TGSI_OPCODE_DDX_FINE) 3908 mask = AC_TID_MASK_LEFT; 3909 else if (opcode == TGSI_OPCODE_DDY_FINE) 3910 mask = AC_TID_MASK_TOP; 3911 else 3912 mask = AC_TID_MASK_TOP_LEFT; 3913 3914 /* for DDX we want to next X pixel, DDY next Y pixel. */ 3915 idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 
1 : 2; 3916 3917 val = ac_to_integer(&ctx->ac, emit_data->args[0]); 3918 val = ac_build_ddxy(&ctx->ac, mask, idx, val); 3919 emit_data->output[emit_data->chan] = val; 3920} 3921 3922static void build_interp_intrinsic(const struct lp_build_tgsi_action *action, 3923 struct lp_build_tgsi_context *bld_base, 3924 struct lp_build_emit_data *emit_data) 3925{ 3926 struct si_shader_context *ctx = si_shader_context(bld_base); 3927 struct si_shader *shader = ctx->shader; 3928 const struct tgsi_shader_info *info = &shader->selector->info; 3929 LLVMValueRef interp_param; 3930 const struct tgsi_full_instruction *inst = emit_data->inst; 3931 const struct tgsi_full_src_register *input = &inst->Src[0]; 3932 int input_base, input_array_size; 3933 int chan; 3934 int i; 3935 LLVMValueRef prim_mask = ctx->abi.prim_mask; 3936 LLVMValueRef array_idx, offset_x = NULL, offset_y = NULL; 3937 int interp_param_idx; 3938 unsigned interp; 3939 unsigned location; 3940 3941 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) { 3942 /* offset is in second src, first two channels */ 3943 offset_x = lp_build_emit_fetch(bld_base, emit_data->inst, 1, 3944 TGSI_CHAN_X); 3945 offset_y = lp_build_emit_fetch(bld_base, emit_data->inst, 1, 3946 TGSI_CHAN_Y); 3947 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { 3948 LLVMValueRef sample_position; 3949 LLVMValueRef sample_id; 3950 LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f); 3951 3952 /* fetch sample ID, then fetch its sample position, 3953 * and place into first two channels. 3954 */ 3955 sample_id = lp_build_emit_fetch(bld_base, 3956 emit_data->inst, 1, TGSI_CHAN_X); 3957 sample_id = ac_to_integer(&ctx->ac, sample_id); 3958 3959 /* Section 8.13.2 (Interpolation Functions) of the OpenGL Shading 3960 * Language 4.50 spec says about interpolateAtSample: 3961 * 3962 * "Returns the value of the input interpolant variable at 3963 * the location of sample number sample. 
If multisample 3964 * buffers are not available, the input variable will be 3965 * evaluated at the center of the pixel. If sample sample 3966 * does not exist, the position used to interpolate the 3967 * input variable is undefined." 3968 * 3969 * This means that sample_id values outside of the valid are 3970 * in fact valid input, and the usual mechanism for loading the 3971 * sample position doesn't work. 3972 */ 3973 if (ctx->shader->key.mono.u.ps.interpolate_at_sample_force_center) { 3974 LLVMValueRef center[4] = { 3975 LLVMConstReal(ctx->f32, 0.5), 3976 LLVMConstReal(ctx->f32, 0.5), 3977 ctx->ac.f32_0, 3978 ctx->ac.f32_0, 3979 }; 3980 3981 sample_position = ac_build_gather_values(&ctx->ac, center, 4); 3982 } else { 3983 sample_position = load_sample_position(&ctx->abi, sample_id); 3984 } 3985 3986 offset_x = LLVMBuildExtractElement(ctx->ac.builder, sample_position, 3987 ctx->i32_0, ""); 3988 3989 offset_x = LLVMBuildFSub(ctx->ac.builder, offset_x, halfval, ""); 3990 offset_y = LLVMBuildExtractElement(ctx->ac.builder, sample_position, 3991 ctx->i32_1, ""); 3992 offset_y = LLVMBuildFSub(ctx->ac.builder, offset_y, halfval, ""); 3993 } 3994 3995 assert(input->Register.File == TGSI_FILE_INPUT); 3996 3997 if (input->Register.Indirect) { 3998 unsigned array_id = input->Indirect.ArrayID; 3999 4000 if (array_id) { 4001 input_base = info->input_array_first[array_id]; 4002 input_array_size = info->input_array_last[array_id] - input_base + 1; 4003 } else { 4004 input_base = inst->Src[0].Register.Index; 4005 input_array_size = info->num_inputs - input_base; 4006 } 4007 4008 array_idx = si_get_indirect_index(ctx, &input->Indirect, 4009 1, input->Register.Index - input_base); 4010 } else { 4011 input_base = inst->Src[0].Register.Index; 4012 input_array_size = 1; 4013 array_idx = ctx->i32_0; 4014 } 4015 4016 interp = shader->selector->info.input_interpolate[input_base]; 4017 4018 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || 4019 inst->Instruction.Opcode == 
TGSI_OPCODE_INTERP_SAMPLE) 4020 location = TGSI_INTERPOLATE_LOC_CENTER; 4021 else 4022 location = TGSI_INTERPOLATE_LOC_CENTROID; 4023 4024 interp_param_idx = lookup_interp_param_index(interp, location); 4025 if (interp_param_idx == -1) 4026 return; 4027 else if (interp_param_idx) 4028 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx); 4029 else 4030 interp_param = NULL; 4031 4032 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || 4033 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { 4034 LLVMValueRef ij_out[2]; 4035 LLVMValueRef ddxy_out = ac_build_ddxy_interp(&ctx->ac, interp_param); 4036 4037 /* 4038 * take the I then J parameters, and the DDX/Y for it, and 4039 * calculate the IJ inputs for the interpolator. 4040 * temp1 = ddx * offset/sample.x + I; 4041 * interp_param.I = ddy * offset/sample.y + temp1; 4042 * temp1 = ddx * offset/sample.x + J; 4043 * interp_param.J = ddy * offset/sample.y + temp1; 4044 */ 4045 for (i = 0; i < 2; i++) { 4046 LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, 0); 4047 LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, 0); 4048 LLVMValueRef ddx_el = LLVMBuildExtractElement(ctx->ac.builder, 4049 ddxy_out, ix_ll, ""); 4050 LLVMValueRef ddy_el = LLVMBuildExtractElement(ctx->ac.builder, 4051 ddxy_out, iy_ll, ""); 4052 LLVMValueRef interp_el = LLVMBuildExtractElement(ctx->ac.builder, 4053 interp_param, ix_ll, ""); 4054 LLVMValueRef temp; 4055 4056 interp_el = ac_to_float(&ctx->ac, interp_el); 4057 4058 temp = ac_build_fmad(&ctx->ac, ddx_el, offset_x, interp_el); 4059 ij_out[i] = ac_build_fmad(&ctx->ac, ddy_el, offset_y, temp); 4060 } 4061 interp_param = ac_build_gather_values(&ctx->ac, ij_out, 2); 4062 } 4063 4064 if (interp_param) 4065 interp_param = ac_to_float(&ctx->ac, interp_param); 4066 4067 for (chan = 0; chan < 4; chan++) { 4068 LLVMValueRef gather = LLVMGetUndef(LLVMVectorType(ctx->f32, input_array_size)); 4069 unsigned schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan); 4070 4071 for 
(unsigned idx = 0; idx < input_array_size; ++idx) { 4072 LLVMValueRef v, i = NULL, j = NULL; 4073 4074 if (interp_param) { 4075 i = LLVMBuildExtractElement( 4076 ctx->ac.builder, interp_param, ctx->i32_0, ""); 4077 j = LLVMBuildExtractElement( 4078 ctx->ac.builder, interp_param, ctx->i32_1, ""); 4079 } 4080 v = si_build_fs_interp(ctx, input_base + idx, schan, 4081 prim_mask, i, j); 4082 4083 gather = LLVMBuildInsertElement(ctx->ac.builder, 4084 gather, v, LLVMConstInt(ctx->i32, idx, false), ""); 4085 } 4086 4087 emit_data->output[chan] = LLVMBuildExtractElement( 4088 ctx->ac.builder, gather, array_idx, ""); 4089 } 4090} 4091 4092static void vote_all_emit( 4093 const struct lp_build_tgsi_action *action, 4094 struct lp_build_tgsi_context *bld_base, 4095 struct lp_build_emit_data *emit_data) 4096{ 4097 struct si_shader_context *ctx = si_shader_context(bld_base); 4098 4099 LLVMValueRef tmp = ac_build_vote_all(&ctx->ac, emit_data->args[0]); 4100 emit_data->output[emit_data->chan] = 4101 LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, ""); 4102} 4103 4104static void vote_any_emit( 4105 const struct lp_build_tgsi_action *action, 4106 struct lp_build_tgsi_context *bld_base, 4107 struct lp_build_emit_data *emit_data) 4108{ 4109 struct si_shader_context *ctx = si_shader_context(bld_base); 4110 4111 LLVMValueRef tmp = ac_build_vote_any(&ctx->ac, emit_data->args[0]); 4112 emit_data->output[emit_data->chan] = 4113 LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, ""); 4114} 4115 4116static void vote_eq_emit( 4117 const struct lp_build_tgsi_action *action, 4118 struct lp_build_tgsi_context *bld_base, 4119 struct lp_build_emit_data *emit_data) 4120{ 4121 struct si_shader_context *ctx = si_shader_context(bld_base); 4122 4123 LLVMValueRef tmp = ac_build_vote_eq(&ctx->ac, emit_data->args[0]); 4124 emit_data->output[emit_data->chan] = 4125 LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, ""); 4126} 4127 4128static void ballot_emit( 4129 const struct lp_build_tgsi_action *action, 4130 struct 
lp_build_tgsi_context *bld_base, 4131 struct lp_build_emit_data *emit_data) 4132{ 4133 struct si_shader_context *ctx = si_shader_context(bld_base); 4134 LLVMBuilderRef builder = ctx->ac.builder; 4135 LLVMValueRef tmp; 4136 4137 tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X); 4138 tmp = ac_build_ballot(&ctx->ac, tmp); 4139 tmp = LLVMBuildBitCast(builder, tmp, ctx->v2i32, ""); 4140 4141 emit_data->output[0] = LLVMBuildExtractElement(builder, tmp, ctx->i32_0, ""); 4142 emit_data->output[1] = LLVMBuildExtractElement(builder, tmp, ctx->i32_1, ""); 4143} 4144 4145static void read_lane_emit( 4146 const struct lp_build_tgsi_action *action, 4147 struct lp_build_tgsi_context *bld_base, 4148 struct lp_build_emit_data *emit_data) 4149{ 4150 struct si_shader_context *ctx = si_shader_context(bld_base); 4151 4152 if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_READ_INVOC) { 4153 emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst, 4154 0, emit_data->src_chan); 4155 4156 /* Always read the source invocation (= lane) from the X channel. */ 4157 emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst, 4158 1, TGSI_CHAN_X); 4159 emit_data->arg_count = 2; 4160 } 4161 4162 /* We currently have no other way to prevent LLVM from lifting the icmp 4163 * calls to a dominating basic block. 
4164 */ 4165 ac_build_optimization_barrier(&ctx->ac, &emit_data->args[0]); 4166 4167 for (unsigned i = 0; i < emit_data->arg_count; ++i) 4168 emit_data->args[i] = ac_to_integer(&ctx->ac, emit_data->args[i]); 4169 4170 emit_data->output[emit_data->chan] = 4171 ac_build_intrinsic(&ctx->ac, action->intr_name, 4172 ctx->i32, emit_data->args, emit_data->arg_count, 4173 AC_FUNC_ATTR_READNONE | 4174 AC_FUNC_ATTR_CONVERGENT); 4175} 4176 4177static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base, 4178 struct lp_build_emit_data *emit_data) 4179{ 4180 struct si_shader_context *ctx = si_shader_context(bld_base); 4181 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register; 4182 LLVMValueRef imm; 4183 unsigned stream; 4184 4185 assert(src0.File == TGSI_FILE_IMMEDIATE); 4186 4187 imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX]; 4188 stream = LLVMConstIntGetZExtValue(imm) & 0x3; 4189 return stream; 4190} 4191 4192/* Emit one vertex from the geometry shader */ 4193static void si_llvm_emit_vertex(struct ac_shader_abi *abi, 4194 unsigned stream, 4195 LLVMValueRef *addrs) 4196{ 4197 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 4198 struct tgsi_shader_info *info = &ctx->shader->selector->info; 4199 struct si_shader *shader = ctx->shader; 4200 struct lp_build_if_state if_state; 4201 LLVMValueRef soffset = LLVMGetParam(ctx->main_fn, 4202 ctx->param_gs2vs_offset); 4203 LLVMValueRef gs_next_vertex; 4204 LLVMValueRef can_emit; 4205 unsigned chan, offset; 4206 int i; 4207 4208 /* Write vertex attribute values to GSVS ring */ 4209 gs_next_vertex = LLVMBuildLoad(ctx->ac.builder, 4210 ctx->gs_next_vertex[stream], 4211 ""); 4212 4213 /* If this thread has already emitted the declared maximum number of 4214 * vertices, skip the write: excessive vertex emissions are not 4215 * supposed to have any effect. 4216 * 4217 * If the shader has no writes to memory, kill it instead. 
This skips 4218 * further memory loads and may allow LLVM to skip to the end 4219 * altogether. 4220 */ 4221 can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex, 4222 LLVMConstInt(ctx->i32, 4223 shader->selector->gs_max_out_vertices, 0), ""); 4224 4225 bool use_kill = !info->writes_memory; 4226 if (use_kill) { 4227 ac_build_kill_if_false(&ctx->ac, can_emit); 4228 } else { 4229 lp_build_if(&if_state, &ctx->gallivm, can_emit); 4230 } 4231 4232 offset = 0; 4233 for (i = 0; i < info->num_outputs; i++) { 4234 for (chan = 0; chan < 4; chan++) { 4235 if (!(info->output_usagemask[i] & (1 << chan)) || 4236 ((info->output_streams[i] >> (2 * chan)) & 3) != stream) 4237 continue; 4238 4239 LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); 4240 LLVMValueRef voffset = 4241 LLVMConstInt(ctx->i32, offset * 4242 shader->selector->gs_max_out_vertices, 0); 4243 offset++; 4244 4245 voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, ""); 4246 voffset = LLVMBuildMul(ctx->ac.builder, voffset, 4247 LLVMConstInt(ctx->i32, 4, 0), ""); 4248 4249 out_val = ac_to_integer(&ctx->ac, out_val); 4250 4251 ac_build_buffer_store_dword(&ctx->ac, 4252 ctx->gsvs_ring[stream], 4253 out_val, 1, 4254 voffset, soffset, 0, 4255 1, 1, true, true); 4256 } 4257 } 4258 4259 gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->i32_1, ""); 4260 LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]); 4261 4262 /* Signal vertex emission if vertex data was written. 
*/ 4263 if (offset) { 4264 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8), 4265 si_get_gs_wave_id(ctx)); 4266 } 4267 4268 if (!use_kill) 4269 lp_build_endif(&if_state); 4270} 4271 4272/* Emit one vertex from the geometry shader */ 4273static void si_tgsi_emit_vertex( 4274 const struct lp_build_tgsi_action *action, 4275 struct lp_build_tgsi_context *bld_base, 4276 struct lp_build_emit_data *emit_data) 4277{ 4278 struct si_shader_context *ctx = si_shader_context(bld_base); 4279 unsigned stream = si_llvm_get_stream(bld_base, emit_data); 4280 4281 si_llvm_emit_vertex(&ctx->abi, stream, ctx->outputs[0]); 4282} 4283 4284/* Cut one primitive from the geometry shader */ 4285static void si_llvm_emit_primitive(struct ac_shader_abi *abi, 4286 unsigned stream) 4287{ 4288 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 4289 4290 /* Signal primitive cut */ 4291 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8), 4292 si_get_gs_wave_id(ctx)); 4293} 4294 4295/* Cut one primitive from the geometry shader */ 4296static void si_tgsi_emit_primitive( 4297 const struct lp_build_tgsi_action *action, 4298 struct lp_build_tgsi_context *bld_base, 4299 struct lp_build_emit_data *emit_data) 4300{ 4301 struct si_shader_context *ctx = si_shader_context(bld_base); 4302 4303 si_llvm_emit_primitive(&ctx->abi, si_llvm_get_stream(bld_base, emit_data)); 4304} 4305 4306static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action, 4307 struct lp_build_tgsi_context *bld_base, 4308 struct lp_build_emit_data *emit_data) 4309{ 4310 struct si_shader_context *ctx = si_shader_context(bld_base); 4311 4312 /* SI only (thanks to a hw bug workaround): 4313 * The real barrier instruction isn’t needed, because an entire patch 4314 * always fits into a single wave. 
4315 */ 4316 if (ctx->screen->info.chip_class == SI && 4317 ctx->type == PIPE_SHADER_TESS_CTRL) { 4318 ac_build_waitcnt(&ctx->ac, LGKM_CNT & VM_CNT); 4319 return; 4320 } 4321 4322 ac_build_s_barrier(&ctx->ac); 4323} 4324 4325static void si_create_function(struct si_shader_context *ctx, 4326 const char *name, 4327 LLVMTypeRef *returns, unsigned num_returns, 4328 struct si_function_info *fninfo, 4329 unsigned max_workgroup_size) 4330{ 4331 int i; 4332 4333 si_llvm_create_func(ctx, name, returns, num_returns, 4334 fninfo->types, fninfo->num_params); 4335 ctx->return_value = LLVMGetUndef(ctx->return_type); 4336 4337 for (i = 0; i < fninfo->num_sgpr_params; ++i) { 4338 LLVMValueRef P = LLVMGetParam(ctx->main_fn, i); 4339 4340 /* The combination of: 4341 * - noalias 4342 * - dereferenceable 4343 * - invariant.load 4344 * allows the optimization passes to move loads and reduces 4345 * SGPR spilling significantly. 4346 */ 4347 ac_add_function_attr(ctx->ac.context, ctx->main_fn, i + 1, 4348 AC_FUNC_ATTR_INREG); 4349 4350 if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) { 4351 ac_add_function_attr(ctx->ac.context, ctx->main_fn, i + 1, 4352 AC_FUNC_ATTR_NOALIAS); 4353 ac_add_attr_dereferenceable(P, UINT64_MAX); 4354 } 4355 } 4356 4357 for (i = 0; i < fninfo->num_params; ++i) { 4358 if (fninfo->assign[i]) 4359 *fninfo->assign[i] = LLVMGetParam(ctx->main_fn, i); 4360 } 4361 4362 if (ctx->screen->info.address32_hi) { 4363 ac_llvm_add_target_dep_function_attr(ctx->main_fn, 4364 "amdgpu-32bit-address-high-bits", 4365 ctx->screen->info.address32_hi); 4366 } 4367 4368 if (max_workgroup_size) { 4369 ac_llvm_add_target_dep_function_attr(ctx->main_fn, 4370 "amdgpu-max-work-group-size", 4371 max_workgroup_size); 4372 } 4373 LLVMAddTargetDependentFunctionAttr(ctx->main_fn, 4374 "no-signed-zeros-fp-math", 4375 "true"); 4376 4377 if (ctx->screen->debug_flags & DBG(UNSAFE_MATH)) { 4378 /* These were copied from some LLVM test. 
*/ 4379 LLVMAddTargetDependentFunctionAttr(ctx->main_fn, 4380 "less-precise-fpmad", 4381 "true"); 4382 LLVMAddTargetDependentFunctionAttr(ctx->main_fn, 4383 "no-infs-fp-math", 4384 "true"); 4385 LLVMAddTargetDependentFunctionAttr(ctx->main_fn, 4386 "no-nans-fp-math", 4387 "true"); 4388 LLVMAddTargetDependentFunctionAttr(ctx->main_fn, 4389 "unsafe-fp-math", 4390 "true"); 4391 } 4392} 4393 4394static void declare_streamout_params(struct si_shader_context *ctx, 4395 struct pipe_stream_output_info *so, 4396 struct si_function_info *fninfo) 4397{ 4398 int i; 4399 4400 /* Streamout SGPRs. */ 4401 if (so->num_outputs) { 4402 if (ctx->type != PIPE_SHADER_TESS_EVAL) 4403 ctx->param_streamout_config = add_arg(fninfo, ARG_SGPR, ctx->ac.i32); 4404 else 4405 ctx->param_streamout_config = fninfo->num_params - 1; 4406 4407 ctx->param_streamout_write_index = add_arg(fninfo, ARG_SGPR, ctx->ac.i32); 4408 } 4409 /* A streamout buffer offset is loaded if the stride is non-zero. */ 4410 for (i = 0; i < 4; i++) { 4411 if (!so->stride[i]) 4412 continue; 4413 4414 ctx->param_streamout_offset[i] = add_arg(fninfo, ARG_SGPR, ctx->ac.i32); 4415 } 4416} 4417 4418static unsigned si_get_max_workgroup_size(const struct si_shader *shader) 4419{ 4420 switch (shader->selector->type) { 4421 case PIPE_SHADER_TESS_CTRL: 4422 /* Return this so that LLVM doesn't remove s_barrier 4423 * instructions on chips where we use s_barrier. */ 4424 return shader->selector->screen->info.chip_class >= CIK ? 128 : 64; 4425 4426 case PIPE_SHADER_GEOMETRY: 4427 return shader->selector->screen->info.chip_class >= GFX9 ? 
128 : 64; 4428 4429 case PIPE_SHADER_COMPUTE: 4430 break; /* see below */ 4431 4432 default: 4433 return 0; 4434 } 4435 4436 const unsigned *properties = shader->selector->info.properties; 4437 unsigned max_work_group_size = 4438 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] * 4439 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] * 4440 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]; 4441 4442 if (!max_work_group_size) { 4443 /* This is a variable group size compute shader, 4444 * compile it for the maximum possible group size. 4445 */ 4446 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK; 4447 } 4448 return max_work_group_size; 4449} 4450 4451static void declare_const_and_shader_buffers(struct si_shader_context *ctx, 4452 struct si_function_info *fninfo, 4453 bool assign_params) 4454{ 4455 LLVMTypeRef const_shader_buf_type; 4456 4457 if (ctx->shader->selector->info.const_buffers_declared == 1 && 4458 ctx->shader->selector->info.shader_buffers_declared == 0) 4459 const_shader_buf_type = ctx->f32; 4460 else 4461 const_shader_buf_type = ctx->v4i32; 4462 4463 unsigned const_and_shader_buffers = 4464 add_arg(fninfo, ARG_SGPR, 4465 ac_array_in_const32_addr_space(const_shader_buf_type)); 4466 4467 if (assign_params) 4468 ctx->param_const_and_shader_buffers = const_and_shader_buffers; 4469} 4470 4471static void declare_samplers_and_images(struct si_shader_context *ctx, 4472 struct si_function_info *fninfo, 4473 bool assign_params) 4474{ 4475 unsigned samplers_and_images = 4476 add_arg(fninfo, ARG_SGPR, 4477 ac_array_in_const32_addr_space(ctx->v8i32)); 4478 4479 if (assign_params) 4480 ctx->param_samplers_and_images = samplers_and_images; 4481} 4482 4483static void declare_per_stage_desc_pointers(struct si_shader_context *ctx, 4484 struct si_function_info *fninfo, 4485 bool assign_params) 4486{ 4487 declare_const_and_shader_buffers(ctx, fninfo, assign_params); 4488 declare_samplers_and_images(ctx, fninfo, assign_params); 4489} 4490 4491static void 
declare_global_desc_pointers(struct si_shader_context *ctx, 4492 struct si_function_info *fninfo) 4493{ 4494 ctx->param_rw_buffers = add_arg(fninfo, ARG_SGPR, 4495 ac_array_in_const32_addr_space(ctx->v4i32)); 4496 ctx->param_bindless_samplers_and_images = add_arg(fninfo, ARG_SGPR, 4497 ac_array_in_const32_addr_space(ctx->v8i32)); 4498} 4499 4500static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx, 4501 struct si_function_info *fninfo) 4502{ 4503 ctx->param_vs_state_bits = add_arg(fninfo, ARG_SGPR, ctx->i32); 4504 add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.base_vertex); 4505 add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.start_instance); 4506 add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.draw_id); 4507} 4508 4509static void declare_vs_input_vgprs(struct si_shader_context *ctx, 4510 struct si_function_info *fninfo, 4511 unsigned *num_prolog_vgprs) 4512{ 4513 struct si_shader *shader = ctx->shader; 4514 4515 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.vertex_id); 4516 if (shader->key.as_ls) { 4517 ctx->param_rel_auto_id = add_arg(fninfo, ARG_VGPR, ctx->i32); 4518 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id); 4519 } else { 4520 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id); 4521 ctx->param_vs_prim_id = add_arg(fninfo, ARG_VGPR, ctx->i32); 4522 } 4523 add_arg(fninfo, ARG_VGPR, ctx->i32); /* unused */ 4524 4525 if (!shader->is_gs_copy_shader) { 4526 /* Vertex load indices. 
*/ 4527 ctx->param_vertex_index0 = fninfo->num_params; 4528 for (unsigned i = 0; i < shader->selector->info.num_inputs; i++) 4529 add_arg(fninfo, ARG_VGPR, ctx->i32); 4530 *num_prolog_vgprs += shader->selector->info.num_inputs; 4531 } 4532} 4533 4534static void declare_vs_blit_inputs(struct si_shader_context *ctx, 4535 struct si_function_info *fninfo, 4536 unsigned vs_blit_property) 4537{ 4538 ctx->param_vs_blit_inputs = fninfo->num_params; 4539 add_arg(fninfo, ARG_SGPR, ctx->i32); /* i16 x1, y1 */ 4540 add_arg(fninfo, ARG_SGPR, ctx->i32); /* i16 x2, y2 */ 4541 add_arg(fninfo, ARG_SGPR, ctx->f32); /* depth */ 4542 4543 if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) { 4544 add_arg(fninfo, ARG_SGPR, ctx->f32); /* color0 */ 4545 add_arg(fninfo, ARG_SGPR, ctx->f32); /* color1 */ 4546 add_arg(fninfo, ARG_SGPR, ctx->f32); /* color2 */ 4547 add_arg(fninfo, ARG_SGPR, ctx->f32); /* color3 */ 4548 } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) { 4549 add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.x1 */ 4550 add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.y1 */ 4551 add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.x2 */ 4552 add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.y2 */ 4553 add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.z */ 4554 add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.w */ 4555 } 4556} 4557 4558static void declare_tes_input_vgprs(struct si_shader_context *ctx, 4559 struct si_function_info *fninfo) 4560{ 4561 ctx->param_tes_u = add_arg(fninfo, ARG_VGPR, ctx->f32); 4562 ctx->param_tes_v = add_arg(fninfo, ARG_VGPR, ctx->f32); 4563 ctx->param_tes_rel_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32); 4564 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tes_patch_id); 4565} 4566 4567enum { 4568 /* Convenient merged shader definitions. 
*/ 4569 SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES, 4570 SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY, 4571}; 4572 4573static void create_function(struct si_shader_context *ctx) 4574{ 4575 struct si_shader *shader = ctx->shader; 4576 struct si_function_info fninfo; 4577 LLVMTypeRef returns[16+32*4]; 4578 unsigned i, num_return_sgprs; 4579 unsigned num_returns = 0; 4580 unsigned num_prolog_vgprs = 0; 4581 unsigned type = ctx->type; 4582 unsigned vs_blit_property = 4583 shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS]; 4584 4585 si_init_function_info(&fninfo); 4586 4587 /* Set MERGED shaders. */ 4588 if (ctx->screen->info.chip_class >= GFX9) { 4589 if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL) 4590 type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */ 4591 else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY) 4592 type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY; 4593 } 4594 4595 LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3); 4596 4597 switch (type) { 4598 case PIPE_SHADER_VERTEX: 4599 declare_global_desc_pointers(ctx, &fninfo); 4600 4601 if (vs_blit_property) { 4602 declare_vs_blit_inputs(ctx, &fninfo, vs_blit_property); 4603 4604 /* VGPRs */ 4605 declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs); 4606 break; 4607 } 4608 4609 declare_per_stage_desc_pointers(ctx, &fninfo, true); 4610 declare_vs_specific_input_sgprs(ctx, &fninfo); 4611 ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR, 4612 ac_array_in_const32_addr_space(ctx->v4i32)); 4613 4614 if (shader->key.as_es) { 4615 ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4616 } else if (shader->key.as_ls) { 4617 /* no extra parameters */ 4618 } else { 4619 if (shader->is_gs_copy_shader) { 4620 fninfo.num_params = ctx->param_vs_state_bits + 1; 4621 fninfo.num_sgpr_params = fninfo.num_params; 4622 } 4623 4624 /* The locations of the other parameters are assigned dynamically. 
*/ 4625 declare_streamout_params(ctx, &shader->selector->so, 4626 &fninfo); 4627 } 4628 4629 /* VGPRs */ 4630 declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs); 4631 break; 4632 4633 case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */ 4634 declare_global_desc_pointers(ctx, &fninfo); 4635 declare_per_stage_desc_pointers(ctx, &fninfo, true); 4636 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4637 ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4638 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4639 ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4640 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4641 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4642 4643 /* VGPRs */ 4644 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_patch_id); 4645 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_rel_ids); 4646 4647 /* param_tcs_offchip_offset and param_tcs_factor_offset are 4648 * placed after the user SGPRs. 4649 */ 4650 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++) 4651 returns[num_returns++] = ctx->i32; /* SGPRs */ 4652 for (i = 0; i < 11; i++) 4653 returns[num_returns++] = ctx->f32; /* VGPRs */ 4654 break; 4655 4656 case SI_SHADER_MERGED_VERTEX_TESSCTRL: 4657 /* Merged stages have 8 system SGPRs at the beginning. 
*/ 4658 /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */ 4659 declare_per_stage_desc_pointers(ctx, &fninfo, 4660 ctx->type == PIPE_SHADER_TESS_CTRL); 4661 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4662 ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4663 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4664 ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4665 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */ 4666 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */ 4667 4668 declare_global_desc_pointers(ctx, &fninfo); 4669 declare_per_stage_desc_pointers(ctx, &fninfo, 4670 ctx->type == PIPE_SHADER_VERTEX); 4671 declare_vs_specific_input_sgprs(ctx, &fninfo); 4672 4673 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4674 ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4675 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4676 ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR, 4677 ac_array_in_const32_addr_space(ctx->v4i32)); 4678 4679 /* VGPRs (first TCS, then VS) */ 4680 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_patch_id); 4681 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_rel_ids); 4682 4683 if (ctx->type == PIPE_SHADER_VERTEX) { 4684 declare_vs_input_vgprs(ctx, &fninfo, 4685 &num_prolog_vgprs); 4686 4687 /* LS return values are inputs to the TCS main shader part. */ 4688 for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++) 4689 returns[num_returns++] = ctx->i32; /* SGPRs */ 4690 for (i = 0; i < 2; i++) 4691 returns[num_returns++] = ctx->f32; /* VGPRs */ 4692 } else { 4693 /* TCS return values are inputs to the TCS epilog. 4694 * 4695 * param_tcs_offchip_offset, param_tcs_factor_offset, 4696 * param_tcs_offchip_layout, and param_rw_buffers 4697 * should be passed to the epilog. 
4698 */ 4699 for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++) 4700 returns[num_returns++] = ctx->i32; /* SGPRs */ 4701 for (i = 0; i < 11; i++) 4702 returns[num_returns++] = ctx->f32; /* VGPRs */ 4703 } 4704 break; 4705 4706 case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY: 4707 /* Merged stages have 8 system SGPRs at the beginning. */ 4708 /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */ 4709 declare_per_stage_desc_pointers(ctx, &fninfo, 4710 ctx->type == PIPE_SHADER_GEOMETRY); 4711 ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4712 ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4713 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4714 ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4715 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */ 4716 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */ 4717 4718 declare_global_desc_pointers(ctx, &fninfo); 4719 declare_per_stage_desc_pointers(ctx, &fninfo, 4720 (ctx->type == PIPE_SHADER_VERTEX || 4721 ctx->type == PIPE_SHADER_TESS_EVAL)); 4722 if (ctx->type == PIPE_SHADER_VERTEX) { 4723 declare_vs_specific_input_sgprs(ctx, &fninfo); 4724 } else { 4725 ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4726 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4727 ctx->param_tes_offchip_addr = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4728 /* Declare as many input SGPRs as the VS has. 
*/ 4729 } 4730 4731 if (ctx->type == PIPE_SHADER_VERTEX) { 4732 ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR, 4733 ac_array_in_const32_addr_space(ctx->v4i32)); 4734 } 4735 4736 /* VGPRs (first GS, then VS/TES) */ 4737 ctx->param_gs_vtx01_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32); 4738 ctx->param_gs_vtx23_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32); 4739 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_prim_id); 4740 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_invocation_id); 4741 ctx->param_gs_vtx45_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32); 4742 4743 if (ctx->type == PIPE_SHADER_VERTEX) { 4744 declare_vs_input_vgprs(ctx, &fninfo, 4745 &num_prolog_vgprs); 4746 } else if (ctx->type == PIPE_SHADER_TESS_EVAL) { 4747 declare_tes_input_vgprs(ctx, &fninfo); 4748 } 4749 4750 if (ctx->type == PIPE_SHADER_VERTEX || 4751 ctx->type == PIPE_SHADER_TESS_EVAL) { 4752 unsigned num_user_sgprs; 4753 4754 if (ctx->type == PIPE_SHADER_VERTEX) 4755 num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR; 4756 else 4757 num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR; 4758 4759 /* ES return values are inputs to GS. 
*/ 4760 for (i = 0; i < 8 + num_user_sgprs; i++) 4761 returns[num_returns++] = ctx->i32; /* SGPRs */ 4762 for (i = 0; i < 5; i++) 4763 returns[num_returns++] = ctx->f32; /* VGPRs */ 4764 } 4765 break; 4766 4767 case PIPE_SHADER_TESS_EVAL: 4768 declare_global_desc_pointers(ctx, &fninfo); 4769 declare_per_stage_desc_pointers(ctx, &fninfo, true); 4770 ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4771 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4772 ctx->param_tes_offchip_addr = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4773 4774 if (shader->key.as_es) { 4775 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4776 add_arg(&fninfo, ARG_SGPR, ctx->i32); 4777 ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4778 } else { 4779 add_arg(&fninfo, ARG_SGPR, ctx->i32); 4780 declare_streamout_params(ctx, &shader->selector->so, 4781 &fninfo); 4782 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4783 } 4784 4785 /* VGPRs */ 4786 declare_tes_input_vgprs(ctx, &fninfo); 4787 break; 4788 4789 case PIPE_SHADER_GEOMETRY: 4790 declare_global_desc_pointers(ctx, &fninfo); 4791 declare_per_stage_desc_pointers(ctx, &fninfo, true); 4792 ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4793 ctx->param_gs_wave_id = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4794 4795 /* VGPRs */ 4796 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[0]); 4797 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[1]); 4798 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_prim_id); 4799 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[2]); 4800 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[3]); 4801 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[4]); 4802 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[5]); 4803 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_invocation_id); 4804 break; 4805 4806 case 
PIPE_SHADER_FRAGMENT: 4807 declare_global_desc_pointers(ctx, &fninfo); 4808 declare_per_stage_desc_pointers(ctx, &fninfo, true); 4809 add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF); 4810 add_arg_assign_checked(&fninfo, ARG_SGPR, ctx->i32, 4811 &ctx->abi.prim_mask, SI_PARAM_PRIM_MASK); 4812 4813 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_SAMPLE); 4814 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTER); 4815 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTROID); 4816 add_arg_checked(&fninfo, ARG_VGPR, v3i32, SI_PARAM_PERSP_PULL_MODEL); 4817 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_SAMPLE); 4818 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTER); 4819 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTROID); 4820 add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_LINE_STIPPLE_TEX); 4821 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32, 4822 &ctx->abi.frag_pos[0], SI_PARAM_POS_X_FLOAT); 4823 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32, 4824 &ctx->abi.frag_pos[1], SI_PARAM_POS_Y_FLOAT); 4825 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32, 4826 &ctx->abi.frag_pos[2], SI_PARAM_POS_Z_FLOAT); 4827 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32, 4828 &ctx->abi.frag_pos[3], SI_PARAM_POS_W_FLOAT); 4829 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32, 4830 &ctx->abi.front_face, SI_PARAM_FRONT_FACE); 4831 shader->info.face_vgpr_index = 20; 4832 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32, 4833 &ctx->abi.ancillary, SI_PARAM_ANCILLARY); 4834 shader->info.ancillary_vgpr_index = 21; 4835 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32, 4836 &ctx->abi.sample_coverage, SI_PARAM_SAMPLE_COVERAGE); 4837 add_arg_checked(&fninfo, ARG_VGPR, ctx->i32, SI_PARAM_POS_FIXED_PT); 4838 4839 /* Color inputs from the prolog. 
*/ 4840 if (shader->selector->info.colors_read) { 4841 unsigned num_color_elements = 4842 util_bitcount(shader->selector->info.colors_read); 4843 4844 assert(fninfo.num_params + num_color_elements <= ARRAY_SIZE(fninfo.types)); 4845 for (i = 0; i < num_color_elements; i++) 4846 add_arg(&fninfo, ARG_VGPR, ctx->f32); 4847 4848 num_prolog_vgprs += num_color_elements; 4849 } 4850 4851 /* Outputs for the epilog. */ 4852 num_return_sgprs = SI_SGPR_ALPHA_REF + 1; 4853 num_returns = 4854 num_return_sgprs + 4855 util_bitcount(shader->selector->info.colors_written) * 4 + 4856 shader->selector->info.writes_z + 4857 shader->selector->info.writes_stencil + 4858 shader->selector->info.writes_samplemask + 4859 1 /* SampleMaskIn */; 4860 4861 num_returns = MAX2(num_returns, 4862 num_return_sgprs + 4863 PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); 4864 4865 for (i = 0; i < num_return_sgprs; i++) 4866 returns[i] = ctx->i32; 4867 for (; i < num_returns; i++) 4868 returns[i] = ctx->f32; 4869 break; 4870 4871 case PIPE_SHADER_COMPUTE: 4872 declare_global_desc_pointers(ctx, &fninfo); 4873 declare_per_stage_desc_pointers(ctx, &fninfo, true); 4874 if (shader->selector->info.uses_grid_size) 4875 add_arg_assign(&fninfo, ARG_SGPR, v3i32, &ctx->abi.num_work_groups); 4876 if (shader->selector->info.uses_block_size && 4877 shader->selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0) 4878 ctx->param_block_size = add_arg(&fninfo, ARG_SGPR, v3i32); 4879 4880 unsigned cs_user_data_dwords = 4881 shader->selector->info.properties[TGSI_PROPERTY_CS_USER_DATA_DWORDS]; 4882 if (cs_user_data_dwords) { 4883 ctx->param_cs_user_data = add_arg(&fninfo, ARG_SGPR, 4884 LLVMVectorType(ctx->i32, cs_user_data_dwords)); 4885 } 4886 4887 for (i = 0; i < 3; i++) { 4888 ctx->abi.workgroup_ids[i] = NULL; 4889 if (shader->selector->info.uses_block_id[i]) 4890 add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &ctx->abi.workgroup_ids[i]); 4891 } 4892 4893 add_arg_assign(&fninfo, ARG_VGPR, v3i32, 
&ctx->abi.local_invocation_ids); 4894 break; 4895 default: 4896 assert(0 && "unimplemented shader"); 4897 return; 4898 } 4899 4900 si_create_function(ctx, "main", returns, num_returns, &fninfo, 4901 si_get_max_workgroup_size(shader)); 4902 4903 /* Reserve register locations for VGPR inputs the PS prolog may need. */ 4904 if (ctx->type == PIPE_SHADER_FRAGMENT && !ctx->shader->is_monolithic) { 4905 ac_llvm_add_target_dep_function_attr(ctx->main_fn, 4906 "InitialPSInputAddr", 4907 S_0286D0_PERSP_SAMPLE_ENA(1) | 4908 S_0286D0_PERSP_CENTER_ENA(1) | 4909 S_0286D0_PERSP_CENTROID_ENA(1) | 4910 S_0286D0_LINEAR_SAMPLE_ENA(1) | 4911 S_0286D0_LINEAR_CENTER_ENA(1) | 4912 S_0286D0_LINEAR_CENTROID_ENA(1) | 4913 S_0286D0_FRONT_FACE_ENA(1) | 4914 S_0286D0_ANCILLARY_ENA(1) | 4915 S_0286D0_POS_FIXED_PT_ENA(1)); 4916 } 4917 4918 shader->info.num_input_sgprs = 0; 4919 shader->info.num_input_vgprs = 0; 4920 4921 for (i = 0; i < fninfo.num_sgpr_params; ++i) 4922 shader->info.num_input_sgprs += ac_get_type_size(fninfo.types[i]) / 4; 4923 4924 for (; i < fninfo.num_params; ++i) 4925 shader->info.num_input_vgprs += ac_get_type_size(fninfo.types[i]) / 4; 4926 4927 assert(shader->info.num_input_vgprs >= num_prolog_vgprs); 4928 shader->info.num_input_vgprs -= num_prolog_vgprs; 4929 4930 if (shader->key.as_ls || 4931 ctx->type == PIPE_SHADER_TESS_CTRL || 4932 /* GFX9 has the ESGS ring buffer in LDS. */ 4933 type == SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY) 4934 ac_declare_lds_as_pointer(&ctx->ac); 4935} 4936 4937/** 4938 * Load ESGS and GSVS ring buffer resource descriptors and save the variables 4939 * for later use. 
4940 */ 4941static void preload_ring_buffers(struct si_shader_context *ctx) 4942{ 4943 LLVMBuilderRef builder = ctx->ac.builder; 4944 4945 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn, 4946 ctx->param_rw_buffers); 4947 4948 if (ctx->screen->info.chip_class <= VI && 4949 (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)) { 4950 unsigned ring = 4951 ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS 4952 : SI_ES_RING_ESGS; 4953 LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0); 4954 4955 ctx->esgs_ring = 4956 ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); 4957 } 4958 4959 if (ctx->shader->is_gs_copy_shader) { 4960 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0); 4961 4962 ctx->gsvs_ring[0] = 4963 ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); 4964 } else if (ctx->type == PIPE_SHADER_GEOMETRY) { 4965 const struct si_shader_selector *sel = ctx->shader->selector; 4966 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0); 4967 LLVMValueRef base_ring; 4968 4969 base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); 4970 4971 /* The conceptual layout of the GSVS ring is 4972 * v0c0 .. vLv0 v0c1 .. vLc1 .. 4973 * but the real memory layout is swizzled across 4974 * threads: 4975 * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL 4976 * t16v0c0 .. 4977 * Override the buffer descriptor accordingly. 4978 */ 4979 LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2); 4980 uint64_t stream_offset = 0; 4981 4982 for (unsigned stream = 0; stream < 4; ++stream) { 4983 unsigned num_components; 4984 unsigned stride; 4985 unsigned num_records; 4986 LLVMValueRef ring, tmp; 4987 4988 num_components = sel->info.num_stream_output_components[stream]; 4989 if (!num_components) 4990 continue; 4991 4992 stride = 4 * num_components * sel->gs_max_out_vertices; 4993 4994 /* Limit on the stride field for <= CIK. 
*/ 4995 assert(stride < (1 << 14)); 4996 4997 num_records = 64; 4998 4999 ring = LLVMBuildBitCast(builder, base_ring, v2i64, ""); 5000 tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, ""); 5001 tmp = LLVMBuildAdd(builder, tmp, 5002 LLVMConstInt(ctx->i64, 5003 stream_offset, 0), ""); 5004 stream_offset += stride * 64; 5005 5006 ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, ""); 5007 ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, ""); 5008 tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, ""); 5009 tmp = LLVMBuildOr(builder, tmp, 5010 LLVMConstInt(ctx->i32, 5011 S_008F04_STRIDE(stride) | 5012 S_008F04_SWIZZLE_ENABLE(1), 0), ""); 5013 ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, ""); 5014 ring = LLVMBuildInsertElement(builder, ring, 5015 LLVMConstInt(ctx->i32, num_records, 0), 5016 LLVMConstInt(ctx->i32, 2, 0), ""); 5017 ring = LLVMBuildInsertElement(builder, ring, 5018 LLVMConstInt(ctx->i32, 5019 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | 5020 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 5021 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | 5022 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | 5023 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | 5024 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | 5025 S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */ 5026 S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */ 5027 S_008F0C_ADD_TID_ENABLE(1), 5028 0), 5029 LLVMConstInt(ctx->i32, 3, 0), ""); 5030 5031 ctx->gsvs_ring[stream] = ring; 5032 } 5033 } else if (ctx->type == PIPE_SHADER_TESS_EVAL) { 5034 ctx->tess_offchip_ring = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TES); 5035 } 5036} 5037 5038static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx, 5039 LLVMValueRef param_rw_buffers, 5040 unsigned param_pos_fixed_pt) 5041{ 5042 LLVMBuilderRef builder = ctx->ac.builder; 5043 LLVMValueRef slot, desc, offset, row, bit, address[2]; 5044 5045 /* Use the fixed-point gl_FragCoord input. 
5046 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits 5047 * per coordinate to get the repeating effect. 5048 */ 5049 address[0] = si_unpack_param(ctx, param_pos_fixed_pt, 0, 5); 5050 address[1] = si_unpack_param(ctx, param_pos_fixed_pt, 16, 5); 5051 5052 /* Load the buffer descriptor. */ 5053 slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0); 5054 desc = ac_build_load_to_sgpr(&ctx->ac, param_rw_buffers, slot); 5055 5056 /* The stipple pattern is 32x32, each row has 32 bits. */ 5057 offset = LLVMBuildMul(builder, address[1], 5058 LLVMConstInt(ctx->i32, 4, 0), ""); 5059 row = buffer_load_const(ctx, desc, offset); 5060 row = ac_to_integer(&ctx->ac, row); 5061 bit = LLVMBuildLShr(builder, row, address[0], ""); 5062 bit = LLVMBuildTrunc(builder, bit, ctx->i1, ""); 5063 ac_build_kill_if_false(&ctx->ac, bit); 5064} 5065 5066void si_shader_binary_read_config(struct ac_shader_binary *binary, 5067 struct si_shader_config *conf, 5068 unsigned symbol_offset) 5069{ 5070 unsigned i; 5071 const unsigned char *config = 5072 ac_shader_binary_config_start(binary, symbol_offset); 5073 bool really_needs_scratch = false; 5074 5075 /* LLVM adds SGPR spills to the scratch size. 5076 * Find out if we really need the scratch buffer. 5077 */ 5078 for (i = 0; i < binary->reloc_count; i++) { 5079 const struct ac_shader_reloc *reloc = &binary->relocs[i]; 5080 5081 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) || 5082 !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) { 5083 really_needs_scratch = true; 5084 break; 5085 } 5086 } 5087 5088 /* XXX: We may be able to emit some of these values directly rather than 5089 * extracting fields to be emitted later. 
5090 */ 5091 5092 for (i = 0; i < binary->config_size_per_symbol; i+= 8) { 5093 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i)); 5094 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4)); 5095 switch (reg) { 5096 case R_00B028_SPI_SHADER_PGM_RSRC1_PS: 5097 case R_00B128_SPI_SHADER_PGM_RSRC1_VS: 5098 case R_00B228_SPI_SHADER_PGM_RSRC1_GS: 5099 case R_00B428_SPI_SHADER_PGM_RSRC1_HS: 5100 case R_00B848_COMPUTE_PGM_RSRC1: 5101 conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8); 5102 conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4); 5103 conf->float_mode = G_00B028_FLOAT_MODE(value); 5104 conf->rsrc1 = value; 5105 break; 5106 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS: 5107 conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value)); 5108 break; 5109 case R_00B84C_COMPUTE_PGM_RSRC2: 5110 conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value)); 5111 conf->rsrc2 = value; 5112 break; 5113 case R_0286CC_SPI_PS_INPUT_ENA: 5114 conf->spi_ps_input_ena = value; 5115 break; 5116 case R_0286D0_SPI_PS_INPUT_ADDR: 5117 conf->spi_ps_input_addr = value; 5118 break; 5119 case R_0286E8_SPI_TMPRING_SIZE: 5120 case R_00B860_COMPUTE_TMPRING_SIZE: 5121 /* WAVESIZE is in units of 256 dwords. 
*/ 5122 if (really_needs_scratch) 5123 conf->scratch_bytes_per_wave = 5124 G_00B860_WAVESIZE(value) * 256 * 4; 5125 break; 5126 case 0x4: /* SPILLED_SGPRS */ 5127 conf->spilled_sgprs = value; 5128 break; 5129 case 0x8: /* SPILLED_VGPRS */ 5130 conf->spilled_vgprs = value; 5131 break; 5132 default: 5133 { 5134 static bool printed; 5135 5136 if (!printed) { 5137 fprintf(stderr, "Warning: LLVM emitted unknown " 5138 "config register: 0x%x\n", reg); 5139 printed = true; 5140 } 5141 } 5142 break; 5143 } 5144 } 5145 5146 if (!conf->spi_ps_input_addr) 5147 conf->spi_ps_input_addr = conf->spi_ps_input_ena; 5148} 5149 5150void si_shader_apply_scratch_relocs(struct si_shader *shader, 5151 uint64_t scratch_va) 5152{ 5153 unsigned i; 5154 uint32_t scratch_rsrc_dword0 = scratch_va; 5155 uint32_t scratch_rsrc_dword1 = 5156 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32); 5157 5158 /* Enable scratch coalescing. */ 5159 scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1); 5160 5161 for (i = 0 ; i < shader->binary.reloc_count; i++) { 5162 const struct ac_shader_reloc *reloc = 5163 &shader->binary.relocs[i]; 5164 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) { 5165 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset, 5166 &scratch_rsrc_dword0, 4); 5167 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) { 5168 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset, 5169 &scratch_rsrc_dword1, 4); 5170 } 5171 } 5172} 5173 5174/* For the UMR disassembler. 
*/ 5175#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */ 5176#define DEBUGGER_NUM_MARKERS 5 5177 5178static unsigned si_get_shader_binary_size(const struct si_shader *shader) 5179{ 5180 unsigned size = shader->binary.code_size; 5181 5182 if (shader->prolog) 5183 size += shader->prolog->binary.code_size; 5184 if (shader->previous_stage) 5185 size += shader->previous_stage->binary.code_size; 5186 if (shader->prolog2) 5187 size += shader->prolog2->binary.code_size; 5188 if (shader->epilog) 5189 size += shader->epilog->binary.code_size; 5190 return size + DEBUGGER_NUM_MARKERS * 4; 5191} 5192 5193int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader) 5194{ 5195 const struct ac_shader_binary *prolog = 5196 shader->prolog ? &shader->prolog->binary : NULL; 5197 const struct ac_shader_binary *previous_stage = 5198 shader->previous_stage ? &shader->previous_stage->binary : NULL; 5199 const struct ac_shader_binary *prolog2 = 5200 shader->prolog2 ? &shader->prolog2->binary : NULL; 5201 const struct ac_shader_binary *epilog = 5202 shader->epilog ? &shader->epilog->binary : NULL; 5203 const struct ac_shader_binary *mainb = &shader->binary; 5204 unsigned bo_size = si_get_shader_binary_size(shader) + 5205 (!epilog ? mainb->rodata_size : 0); 5206 unsigned char *ptr; 5207 5208 assert(!prolog || !prolog->rodata_size); 5209 assert(!previous_stage || !previous_stage->rodata_size); 5210 assert(!prolog2 || !prolog2->rodata_size); 5211 assert((!prolog && !previous_stage && !prolog2 && !epilog) || 5212 !mainb->rodata_size); 5213 assert(!epilog || !epilog->rodata_size); 5214 5215 si_resource_reference(&shader->bo, NULL); 5216 shader->bo = si_aligned_buffer_create(&sscreen->b, 5217 sscreen->cpdma_prefetch_writes_memory ? 5218 0 : SI_RESOURCE_FLAG_READ_ONLY, 5219 PIPE_USAGE_IMMUTABLE, 5220 align(bo_size, SI_CPDMA_ALIGNMENT), 5221 256); 5222 if (!shader->bo) 5223 return -ENOMEM; 5224 5225 /* Upload. 
*/ 5226 ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL, 5227 PIPE_TRANSFER_READ_WRITE | 5228 PIPE_TRANSFER_UNSYNCHRONIZED | 5229 RADEON_TRANSFER_TEMPORARY); 5230 5231 /* Don't use util_memcpy_cpu_to_le32. LLVM binaries are 5232 * endian-independent. */ 5233 if (prolog) { 5234 memcpy(ptr, prolog->code, prolog->code_size); 5235 ptr += prolog->code_size; 5236 } 5237 if (previous_stage) { 5238 memcpy(ptr, previous_stage->code, previous_stage->code_size); 5239 ptr += previous_stage->code_size; 5240 } 5241 if (prolog2) { 5242 memcpy(ptr, prolog2->code, prolog2->code_size); 5243 ptr += prolog2->code_size; 5244 } 5245 5246 memcpy(ptr, mainb->code, mainb->code_size); 5247 ptr += mainb->code_size; 5248 5249 if (epilog) { 5250 memcpy(ptr, epilog->code, epilog->code_size); 5251 ptr += epilog->code_size; 5252 } else if (mainb->rodata_size > 0) { 5253 memcpy(ptr, mainb->rodata, mainb->rodata_size); 5254 ptr += mainb->rodata_size; 5255 } 5256 5257 /* Add end-of-code markers for the UMR disassembler. */ 5258 uint32_t *ptr32 = (uint32_t*)ptr; 5259 for (unsigned i = 0; i < DEBUGGER_NUM_MARKERS; i++) 5260 ptr32[i] = DEBUGGER_END_OF_CODE_MARKER; 5261 5262 sscreen->ws->buffer_unmap(shader->bo->buf); 5263 return 0; 5264} 5265 5266static void si_shader_dump_disassembly(const struct ac_shader_binary *binary, 5267 struct pipe_debug_callback *debug, 5268 const char *name, FILE *file) 5269{ 5270 char *line, *p; 5271 unsigned i, count; 5272 5273 if (binary->disasm_string) { 5274 fprintf(file, "Shader %s disassembly:\n", name); 5275 fprintf(file, "%s", binary->disasm_string); 5276 5277 if (debug && debug->debug_message) { 5278 /* Very long debug messages are cut off, so send the 5279 * disassembly one line at a time. This causes more 5280 * overhead, but on the plus side it simplifies 5281 * parsing of resulting logs. 
5282 */ 5283 pipe_debug_message(debug, SHADER_INFO, 5284 "Shader Disassembly Begin"); 5285 5286 line = binary->disasm_string; 5287 while (*line) { 5288 p = util_strchrnul(line, '\n'); 5289 count = p - line; 5290 5291 if (count) { 5292 pipe_debug_message(debug, SHADER_INFO, 5293 "%.*s", count, line); 5294 } 5295 5296 if (!*p) 5297 break; 5298 line = p + 1; 5299 } 5300 5301 pipe_debug_message(debug, SHADER_INFO, 5302 "Shader Disassembly End"); 5303 } 5304 } else { 5305 fprintf(file, "Shader %s binary:\n", name); 5306 for (i = 0; i < binary->code_size; i += 4) { 5307 fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i, 5308 binary->code[i + 3], binary->code[i + 2], 5309 binary->code[i + 1], binary->code[i]); 5310 } 5311 } 5312} 5313 5314static void si_calculate_max_simd_waves(struct si_shader *shader) 5315{ 5316 struct si_screen *sscreen = shader->selector->screen; 5317 struct si_shader_config *conf = &shader->config; 5318 unsigned num_inputs = shader->selector->info.num_inputs; 5319 unsigned lds_increment = sscreen->info.chip_class >= CIK ? 512 : 256; 5320 unsigned lds_per_wave = 0; 5321 unsigned max_simd_waves; 5322 5323 max_simd_waves = ac_get_max_simd_waves(sscreen->info.family); 5324 5325 /* Compute LDS usage for PS. */ 5326 switch (shader->selector->type) { 5327 case PIPE_SHADER_FRAGMENT: 5328 /* The minimum usage per wave is (num_inputs * 48). The maximum 5329 * usage is (num_inputs * 48 * 16). 5330 * We can get anything in between and it varies between waves. 5331 * 5332 * The 48 bytes per input for a single primitive is equal to 5333 * 4 bytes/component * 4 components/input * 3 points. 5334 * 5335 * Other stages don't know the size at compile time or don't 5336 * allocate LDS per wave, but instead they do it per thread group. 
5337 */ 5338 lds_per_wave = conf->lds_size * lds_increment + 5339 align(num_inputs * 48, lds_increment); 5340 break; 5341 case PIPE_SHADER_COMPUTE: 5342 if (shader->selector) { 5343 unsigned max_workgroup_size = 5344 si_get_max_workgroup_size(shader); 5345 lds_per_wave = (conf->lds_size * lds_increment) / 5346 DIV_ROUND_UP(max_workgroup_size, 64); 5347 } 5348 break; 5349 } 5350 5351 /* Compute the per-SIMD wave counts. */ 5352 if (conf->num_sgprs) { 5353 max_simd_waves = 5354 MIN2(max_simd_waves, 5355 ac_get_num_physical_sgprs(sscreen->info.chip_class) / conf->num_sgprs); 5356 } 5357 5358 if (conf->num_vgprs) 5359 max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs); 5360 5361 /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above 5362 * 16KB makes some SIMDs unoccupied). */ 5363 if (lds_per_wave) 5364 max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave); 5365 5366 conf->max_simd_waves = max_simd_waves; 5367} 5368 5369void si_shader_dump_stats_for_shader_db(const struct si_shader *shader, 5370 struct pipe_debug_callback *debug) 5371{ 5372 const struct si_shader_config *conf = &shader->config; 5373 5374 pipe_debug_message(debug, SHADER_INFO, 5375 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d " 5376 "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d " 5377 "Spilled VGPRs: %d PrivMem VGPRs: %d", 5378 conf->num_sgprs, conf->num_vgprs, 5379 si_get_shader_binary_size(shader), 5380 conf->lds_size, conf->scratch_bytes_per_wave, 5381 conf->max_simd_waves, conf->spilled_sgprs, 5382 conf->spilled_vgprs, conf->private_mem_vgprs); 5383} 5384 5385static void si_shader_dump_stats(struct si_screen *sscreen, 5386 const struct si_shader *shader, 5387 unsigned processor, 5388 FILE *file, 5389 bool check_debug_option) 5390{ 5391 const struct si_shader_config *conf = &shader->config; 5392 5393 if (!check_debug_option || 5394 si_can_dump_shader(sscreen, processor)) { 5395 if (processor == PIPE_SHADER_FRAGMENT) { 5396 fprintf(file, "*** SHADER CONFIG 
***\n" 5397 "SPI_PS_INPUT_ADDR = 0x%04x\n" 5398 "SPI_PS_INPUT_ENA = 0x%04x\n", 5399 conf->spi_ps_input_addr, conf->spi_ps_input_ena); 5400 } 5401 5402 fprintf(file, "*** SHADER STATS ***\n" 5403 "SGPRS: %d\n" 5404 "VGPRS: %d\n" 5405 "Spilled SGPRs: %d\n" 5406 "Spilled VGPRs: %d\n" 5407 "Private memory VGPRs: %d\n" 5408 "Code Size: %d bytes\n" 5409 "LDS: %d blocks\n" 5410 "Scratch: %d bytes per wave\n" 5411 "Max Waves: %d\n" 5412 "********************\n\n\n", 5413 conf->num_sgprs, conf->num_vgprs, 5414 conf->spilled_sgprs, conf->spilled_vgprs, 5415 conf->private_mem_vgprs, 5416 si_get_shader_binary_size(shader), 5417 conf->lds_size, conf->scratch_bytes_per_wave, 5418 conf->max_simd_waves); 5419 } 5420} 5421 5422const char *si_get_shader_name(const struct si_shader *shader, unsigned processor) 5423{ 5424 switch (processor) { 5425 case PIPE_SHADER_VERTEX: 5426 if (shader->key.as_es) 5427 return "Vertex Shader as ES"; 5428 else if (shader->key.as_ls) 5429 return "Vertex Shader as LS"; 5430 else 5431 return "Vertex Shader as VS"; 5432 case PIPE_SHADER_TESS_CTRL: 5433 return "Tessellation Control Shader"; 5434 case PIPE_SHADER_TESS_EVAL: 5435 if (shader->key.as_es) 5436 return "Tessellation Evaluation Shader as ES"; 5437 else 5438 return "Tessellation Evaluation Shader as VS"; 5439 case PIPE_SHADER_GEOMETRY: 5440 if (shader->is_gs_copy_shader) 5441 return "GS Copy Shader as VS"; 5442 else 5443 return "Geometry Shader"; 5444 case PIPE_SHADER_FRAGMENT: 5445 return "Pixel Shader"; 5446 case PIPE_SHADER_COMPUTE: 5447 return "Compute Shader"; 5448 default: 5449 return "Unknown Shader"; 5450 } 5451} 5452 5453void si_shader_dump(struct si_screen *sscreen, const struct si_shader *shader, 5454 struct pipe_debug_callback *debug, unsigned processor, 5455 FILE *file, bool check_debug_option) 5456{ 5457 if (!check_debug_option || 5458 si_can_dump_shader(sscreen, processor)) 5459 si_dump_shader_key(processor, shader, file); 5460 5461 if (!check_debug_option && 
shader->binary.llvm_ir_string) { 5462 if (shader->previous_stage && 5463 shader->previous_stage->binary.llvm_ir_string) { 5464 fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", 5465 si_get_shader_name(shader, processor)); 5466 fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string); 5467 } 5468 5469 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", 5470 si_get_shader_name(shader, processor)); 5471 fprintf(file, "%s\n", shader->binary.llvm_ir_string); 5472 } 5473 5474 if (!check_debug_option || 5475 (si_can_dump_shader(sscreen, processor) && 5476 !(sscreen->debug_flags & DBG(NO_ASM)))) { 5477 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor)); 5478 5479 if (shader->prolog) 5480 si_shader_dump_disassembly(&shader->prolog->binary, 5481 debug, "prolog", file); 5482 if (shader->previous_stage) 5483 si_shader_dump_disassembly(&shader->previous_stage->binary, 5484 debug, "previous stage", file); 5485 if (shader->prolog2) 5486 si_shader_dump_disassembly(&shader->prolog2->binary, 5487 debug, "prolog2", file); 5488 5489 si_shader_dump_disassembly(&shader->binary, debug, "main", file); 5490 5491 if (shader->epilog) 5492 si_shader_dump_disassembly(&shader->epilog->binary, 5493 debug, "epilog", file); 5494 fprintf(file, "\n"); 5495 } 5496 5497 si_shader_dump_stats(sscreen, shader, processor, file, 5498 check_debug_option); 5499} 5500 5501static int si_compile_llvm(struct si_screen *sscreen, 5502 struct ac_shader_binary *binary, 5503 struct si_shader_config *conf, 5504 struct ac_llvm_compiler *compiler, 5505 LLVMModuleRef mod, 5506 struct pipe_debug_callback *debug, 5507 unsigned processor, 5508 const char *name, 5509 bool less_optimized) 5510{ 5511 int r = 0; 5512 unsigned count = p_atomic_inc_return(&sscreen->num_compilations); 5513 5514 if (si_can_dump_shader(sscreen, processor)) { 5515 fprintf(stderr, "radeonsi: Compiling shader %d\n", count); 5516 5517 if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) { 5518 fprintf(stderr, "%s 
LLVM IR:\n\n", name); 5519 ac_dump_module(mod); 5520 fprintf(stderr, "\n"); 5521 } 5522 } 5523 5524 if (sscreen->record_llvm_ir) { 5525 char *ir = LLVMPrintModuleToString(mod); 5526 binary->llvm_ir_string = strdup(ir); 5527 LLVMDisposeMessage(ir); 5528 } 5529 5530 if (!si_replace_shader(count, binary)) { 5531 r = si_llvm_compile(mod, binary, compiler, debug, 5532 less_optimized); 5533 if (r) 5534 return r; 5535 } 5536 5537 si_shader_binary_read_config(binary, conf, 0); 5538 5539 /* Enable 64-bit and 16-bit denormals, because there is no performance 5540 * cost. 5541 * 5542 * If denormals are enabled, all floating-point output modifiers are 5543 * ignored. 5544 * 5545 * Don't enable denormals for 32-bit floats, because: 5546 * - Floating-point output modifiers would be ignored by the hw. 5547 * - Some opcodes don't support denormals, such as v_mad_f32. We would 5548 * have to stop using those. 5549 * - SI & CI would be very slow. 5550 */ 5551 conf->float_mode |= V_00B028_FP_64_DENORMS; 5552 5553 FREE(binary->config); 5554 FREE(binary->global_symbol_offsets); 5555 binary->config = NULL; 5556 binary->global_symbol_offsets = NULL; 5557 5558 /* Some shaders can't have rodata because their binaries can be 5559 * concatenated. 
5560 */ 5561 if (binary->rodata_size && 5562 (processor == PIPE_SHADER_VERTEX || 5563 processor == PIPE_SHADER_TESS_CTRL || 5564 processor == PIPE_SHADER_TESS_EVAL || 5565 processor == PIPE_SHADER_FRAGMENT)) { 5566 fprintf(stderr, "radeonsi: The shader can't have rodata."); 5567 return -EINVAL; 5568 } 5569 5570 return r; 5571} 5572 5573static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret) 5574{ 5575 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) 5576 LLVMBuildRetVoid(ctx->ac.builder); 5577 else 5578 LLVMBuildRet(ctx->ac.builder, ret); 5579} 5580 5581/* Generate code for the hardware VS shader stage to go with a geometry shader */ 5582struct si_shader * 5583si_generate_gs_copy_shader(struct si_screen *sscreen, 5584 struct ac_llvm_compiler *compiler, 5585 struct si_shader_selector *gs_selector, 5586 struct pipe_debug_callback *debug) 5587{ 5588 struct si_shader_context ctx; 5589 struct si_shader *shader; 5590 LLVMBuilderRef builder; 5591 struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS]; 5592 struct tgsi_shader_info *gsinfo = &gs_selector->info; 5593 int i, r; 5594 5595 5596 shader = CALLOC_STRUCT(si_shader); 5597 if (!shader) 5598 return NULL; 5599 5600 /* We can leave the fence as permanently signaled because the GS copy 5601 * shader only becomes visible globally after it has been compiled. 
*/ 5602 util_queue_fence_init(&shader->ready); 5603 5604 shader->selector = gs_selector; 5605 shader->is_gs_copy_shader = true; 5606 5607 si_init_shader_ctx(&ctx, sscreen, compiler); 5608 ctx.shader = shader; 5609 ctx.type = PIPE_SHADER_VERTEX; 5610 5611 builder = ctx.ac.builder; 5612 5613 create_function(&ctx); 5614 preload_ring_buffers(&ctx); 5615 5616 LLVMValueRef voffset = 5617 LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id, 5618 LLVMConstInt(ctx.i32, 4, 0), ""); 5619 5620 /* Fetch the vertex stream ID.*/ 5621 LLVMValueRef stream_id; 5622 5623 if (gs_selector->so.num_outputs) 5624 stream_id = si_unpack_param(&ctx, ctx.param_streamout_config, 24, 2); 5625 else 5626 stream_id = ctx.i32_0; 5627 5628 /* Fill in output information. */ 5629 for (i = 0; i < gsinfo->num_outputs; ++i) { 5630 outputs[i].semantic_name = gsinfo->output_semantic_name[i]; 5631 outputs[i].semantic_index = gsinfo->output_semantic_index[i]; 5632 5633 for (int chan = 0; chan < 4; chan++) { 5634 outputs[i].vertex_stream[chan] = 5635 (gsinfo->output_streams[i] >> (2 * chan)) & 3; 5636 } 5637 } 5638 5639 LLVMBasicBlockRef end_bb; 5640 LLVMValueRef switch_inst; 5641 5642 end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end"); 5643 switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4); 5644 5645 for (int stream = 0; stream < 4; stream++) { 5646 LLVMBasicBlockRef bb; 5647 unsigned offset; 5648 5649 if (!gsinfo->num_stream_output_components[stream]) 5650 continue; 5651 5652 if (stream > 0 && !gs_selector->so.num_outputs) 5653 continue; 5654 5655 bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out"); 5656 LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb); 5657 LLVMPositionBuilderAtEnd(builder, bb); 5658 5659 /* Fetch vertex data from GSVS ring */ 5660 offset = 0; 5661 for (i = 0; i < gsinfo->num_outputs; ++i) { 5662 for (unsigned chan = 0; chan < 4; chan++) { 5663 if (!(gsinfo->output_usagemask[i] & (1 << chan)) || 5664 outputs[i].vertex_stream[chan] 
!= stream) { 5665 outputs[i].values[chan] = LLVMGetUndef(ctx.f32); 5666 continue; 5667 } 5668 5669 LLVMValueRef soffset = LLVMConstInt(ctx.i32, 5670 offset * gs_selector->gs_max_out_vertices * 16 * 4, 0); 5671 offset++; 5672 5673 outputs[i].values[chan] = 5674 ac_build_buffer_load(&ctx.ac, 5675 ctx.gsvs_ring[0], 1, 5676 ctx.i32_0, voffset, 5677 soffset, 0, 1, 1, 5678 true, false); 5679 } 5680 } 5681 5682 /* Streamout and exports. */ 5683 if (gs_selector->so.num_outputs) { 5684 si_llvm_emit_streamout(&ctx, outputs, 5685 gsinfo->num_outputs, 5686 stream); 5687 } 5688 5689 if (stream == 0) { 5690 /* Vertex color clamping. 5691 * 5692 * This uses a state constant loaded in a user data SGPR and 5693 * an IF statement is added that clamps all colors if the constant 5694 * is true. 5695 */ 5696 struct lp_build_if_state if_ctx; 5697 LLVMValueRef v[2], cond = NULL; 5698 LLVMBasicBlockRef blocks[2]; 5699 5700 for (unsigned i = 0; i < gsinfo->num_outputs; i++) { 5701 if (gsinfo->output_semantic_name[i] != TGSI_SEMANTIC_COLOR && 5702 gsinfo->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR) 5703 continue; 5704 5705 /* We've found a color. */ 5706 if (!cond) { 5707 /* The state is in the first bit of the user SGPR. */ 5708 cond = LLVMGetParam(ctx.main_fn, 5709 ctx.param_vs_state_bits); 5710 cond = LLVMBuildTrunc(ctx.ac.builder, cond, 5711 ctx.i1, ""); 5712 lp_build_if(&if_ctx, &ctx.gallivm, cond); 5713 /* Remember blocks for Phi. */ 5714 blocks[0] = if_ctx.true_block; 5715 blocks[1] = if_ctx.entry_block; 5716 } 5717 5718 for (unsigned j = 0; j < 4; j++) { 5719 /* Insert clamp into the true block. */ 5720 v[0] = ac_build_clamp(&ctx.ac, outputs[i].values[j]); 5721 v[1] = outputs[i].values[j]; 5722 5723 /* Insert Phi into the endif block. 
*/ 5724 LLVMPositionBuilderAtEnd(ctx.ac.builder, if_ctx.merge_block); 5725 outputs[i].values[j] = ac_build_phi(&ctx.ac, ctx.f32, 2, v, blocks); 5726 LLVMPositionBuilderAtEnd(ctx.ac.builder, if_ctx.true_block); 5727 } 5728 } 5729 if (cond) 5730 lp_build_endif(&if_ctx); 5731 5732 si_llvm_export_vs(&ctx, outputs, gsinfo->num_outputs); 5733 } 5734 5735 LLVMBuildBr(builder, end_bb); 5736 } 5737 5738 LLVMPositionBuilderAtEnd(builder, end_bb); 5739 5740 LLVMBuildRetVoid(ctx.ac.builder); 5741 5742 ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */ 5743 si_llvm_optimize_module(&ctx); 5744 5745 r = si_compile_llvm(sscreen, &ctx.shader->binary, 5746 &ctx.shader->config, ctx.compiler, 5747 ctx.ac.module, 5748 debug, PIPE_SHADER_GEOMETRY, 5749 "GS Copy Shader", false); 5750 if (!r) { 5751 if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY)) 5752 fprintf(stderr, "GS Copy Shader:\n"); 5753 si_shader_dump(sscreen, ctx.shader, debug, 5754 PIPE_SHADER_GEOMETRY, stderr, true); 5755 r = si_shader_binary_upload(sscreen, ctx.shader); 5756 } 5757 5758 si_llvm_dispose(&ctx); 5759 5760 if (r != 0) { 5761 FREE(shader); 5762 shader = NULL; 5763 } else { 5764 si_fix_resource_usage(sscreen, shader); 5765 } 5766 return shader; 5767} 5768 5769static void si_dump_shader_key_vs(const struct si_shader_key *key, 5770 const struct si_vs_prolog_bits *prolog, 5771 const char *prefix, FILE *f) 5772{ 5773 fprintf(f, " %s.instance_divisor_is_one = %u\n", 5774 prefix, prolog->instance_divisor_is_one); 5775 fprintf(f, " %s.instance_divisor_is_fetched = %u\n", 5776 prefix, prolog->instance_divisor_is_fetched); 5777 fprintf(f, " %s.ls_vgpr_fix = %u\n", 5778 prefix, prolog->ls_vgpr_fix); 5779 5780 fprintf(f, " mono.vs.fix_fetch = {"); 5781 for (int i = 0; i < SI_MAX_ATTRIBS; i++) 5782 fprintf(f, !i ? 
"%u" : ", %u", key->mono.vs_fix_fetch[i]); 5783 fprintf(f, "}\n"); 5784} 5785 5786static void si_dump_shader_key(unsigned processor, const struct si_shader *shader, 5787 FILE *f) 5788{ 5789 const struct si_shader_key *key = &shader->key; 5790 5791 fprintf(f, "SHADER KEY\n"); 5792 5793 switch (processor) { 5794 case PIPE_SHADER_VERTEX: 5795 si_dump_shader_key_vs(key, &key->part.vs.prolog, 5796 "part.vs.prolog", f); 5797 fprintf(f, " as_es = %u\n", key->as_es); 5798 fprintf(f, " as_ls = %u\n", key->as_ls); 5799 fprintf(f, " mono.u.vs_export_prim_id = %u\n", 5800 key->mono.u.vs_export_prim_id); 5801 break; 5802 5803 case PIPE_SHADER_TESS_CTRL: 5804 if (shader->selector->screen->info.chip_class >= GFX9) { 5805 si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog, 5806 "part.tcs.ls_prolog", f); 5807 } 5808 fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode); 5809 fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy); 5810 break; 5811 5812 case PIPE_SHADER_TESS_EVAL: 5813 fprintf(f, " as_es = %u\n", key->as_es); 5814 fprintf(f, " mono.u.vs_export_prim_id = %u\n", 5815 key->mono.u.vs_export_prim_id); 5816 break; 5817 5818 case PIPE_SHADER_GEOMETRY: 5819 if (shader->is_gs_copy_shader) 5820 break; 5821 5822 if (shader->selector->screen->info.chip_class >= GFX9 && 5823 key->part.gs.es->type == PIPE_SHADER_VERTEX) { 5824 si_dump_shader_key_vs(key, &key->part.gs.vs_prolog, 5825 "part.gs.vs_prolog", f); 5826 } 5827 fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix); 5828 break; 5829 5830 case PIPE_SHADER_COMPUTE: 5831 break; 5832 5833 case PIPE_SHADER_FRAGMENT: 5834 fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side); 5835 fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors); 5836 fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple); 5837 fprintf(f, " 
part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp); 5838 fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp); 5839 fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp); 5840 fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp); 5841 fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp); 5842 fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear); 5843 fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format); 5844 fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8); 5845 fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10); 5846 fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf); 5847 fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func); 5848 fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one); 5849 fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing); 5850 fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color); 5851 break; 5852 5853 default: 5854 assert(0); 5855 } 5856 5857 if ((processor == PIPE_SHADER_GEOMETRY || 5858 processor == PIPE_SHADER_TESS_EVAL || 5859 processor == PIPE_SHADER_VERTEX) && 5860 !key->as_es && !key->as_ls) { 5861 fprintf(f, " opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs); 5862 fprintf(f, " opt.clip_disable = %u\n", key->opt.clip_disable); 5863 } 5864} 5865 5866static void si_init_shader_ctx(struct si_shader_context *ctx, 5867 struct si_screen *sscreen, 5868 struct ac_llvm_compiler *compiler) 5869{ 5870 struct lp_build_tgsi_context *bld_base; 
5871 5872 si_llvm_context_init(ctx, sscreen, compiler); 5873 5874 bld_base = &ctx->bld_base; 5875 bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant; 5876 5877 bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID].emit = build_interp_intrinsic; 5878 bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE].emit = build_interp_intrinsic; 5879 bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET].emit = build_interp_intrinsic; 5880 5881 bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit; 5882 5883 bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit; 5884 5885 bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy; 5886 bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy; 5887 bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy; 5888 bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy; 5889 5890 bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit; 5891 bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit; 5892 bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit; 5893 bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit; 5894 bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane"; 5895 bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit; 5896 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane"; 5897 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit; 5898 5899 bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_tgsi_emit_vertex; 5900 bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_tgsi_emit_primitive; 5901 bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier; 5902} 5903 5904static void si_optimize_vs_outputs(struct si_shader_context *ctx) 5905{ 5906 struct si_shader *shader = ctx->shader; 5907 struct tgsi_shader_info *info = &shader->selector->info; 5908 5909 if ((ctx->type != PIPE_SHADER_VERTEX && 5910 ctx->type != PIPE_SHADER_TESS_EVAL) || 5911 shader->key.as_ls || 
5912 shader->key.as_es) 5913 return; 5914 5915 ac_optimize_vs_outputs(&ctx->ac, 5916 ctx->main_fn, 5917 shader->info.vs_output_param_offset, 5918 info->num_outputs, 5919 &shader->info.nr_param_exports); 5920} 5921 5922static void si_init_exec_from_input(struct si_shader_context *ctx, 5923 unsigned param, unsigned bitoffset) 5924{ 5925 LLVMValueRef args[] = { 5926 LLVMGetParam(ctx->main_fn, param), 5927 LLVMConstInt(ctx->i32, bitoffset, 0), 5928 }; 5929 ac_build_intrinsic(&ctx->ac, 5930 "llvm.amdgcn.init.exec.from.input", 5931 ctx->voidt, args, 2, AC_FUNC_ATTR_CONVERGENT); 5932} 5933 5934static bool si_vs_needs_prolog(const struct si_shader_selector *sel, 5935 const struct si_vs_prolog_bits *key) 5936{ 5937 /* VGPR initialization fixup for Vega10 and Raven is always done in the 5938 * VS prolog. */ 5939 return sel->vs_needs_prolog || key->ls_vgpr_fix; 5940} 5941 5942static bool si_compile_tgsi_main(struct si_shader_context *ctx) 5943{ 5944 struct si_shader *shader = ctx->shader; 5945 struct si_shader_selector *sel = shader->selector; 5946 struct lp_build_tgsi_context *bld_base = &ctx->bld_base; 5947 5948 // TODO clean all this up! 
5949 switch (ctx->type) { 5950 case PIPE_SHADER_VERTEX: 5951 ctx->load_input = declare_input_vs; 5952 if (shader->key.as_ls) 5953 ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue; 5954 else if (shader->key.as_es) 5955 ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; 5956 else 5957 ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; 5958 bld_base->emit_epilogue = si_tgsi_emit_epilogue; 5959 ctx->abi.load_base_vertex = get_base_vertex; 5960 break; 5961 case PIPE_SHADER_TESS_CTRL: 5962 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs; 5963 ctx->abi.load_tess_varyings = si_nir_load_tcs_varyings; 5964 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs; 5965 bld_base->emit_store = store_output_tcs; 5966 ctx->abi.store_tcs_outputs = si_nir_store_output_tcs; 5967 ctx->abi.emit_outputs = si_llvm_emit_tcs_epilogue; 5968 ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in; 5969 bld_base->emit_epilogue = si_tgsi_emit_epilogue; 5970 break; 5971 case PIPE_SHADER_TESS_EVAL: 5972 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes; 5973 ctx->abi.load_tess_varyings = si_nir_load_input_tes; 5974 ctx->abi.load_tess_coord = si_load_tess_coord; 5975 ctx->abi.load_tess_level = si_load_tess_level; 5976 ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in; 5977 if (shader->key.as_es) 5978 ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; 5979 else 5980 ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; 5981 bld_base->emit_epilogue = si_tgsi_emit_epilogue; 5982 break; 5983 case PIPE_SHADER_GEOMETRY: 5984 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs; 5985 ctx->abi.load_inputs = si_nir_load_input_gs; 5986 ctx->abi.emit_vertex = si_llvm_emit_vertex; 5987 ctx->abi.emit_primitive = si_llvm_emit_primitive; 5988 ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue; 5989 bld_base->emit_epilogue = si_tgsi_emit_gs_epilogue; 5990 break; 5991 case PIPE_SHADER_FRAGMENT: 5992 ctx->load_input = declare_input_fs; 5993 ctx->abi.emit_outputs = 
si_llvm_return_fs_outputs; 5994 bld_base->emit_epilogue = si_tgsi_emit_epilogue; 5995 ctx->abi.lookup_interp_param = si_nir_lookup_interp_param; 5996 ctx->abi.load_sample_position = load_sample_position; 5997 ctx->abi.load_sample_mask_in = load_sample_mask_in; 5998 ctx->abi.emit_kill = si_llvm_emit_kill; 5999 break; 6000 case PIPE_SHADER_COMPUTE: 6001 ctx->abi.load_local_group_size = get_block_size; 6002 break; 6003 default: 6004 assert(!"Unsupported shader type"); 6005 return false; 6006 } 6007 6008 ctx->abi.load_ubo = load_ubo; 6009 ctx->abi.load_ssbo = load_ssbo; 6010 6011 create_function(ctx); 6012 preload_ring_buffers(ctx); 6013 6014 /* For GFX9 merged shaders: 6015 * - Set EXEC for the first shader. If the prolog is present, set 6016 * EXEC there instead. 6017 * - Add a barrier before the second shader. 6018 * - In the second shader, reset EXEC to ~0 and wrap the main part in 6019 * an if-statement. This is required for correctness in geometry 6020 * shaders, to ensure that empty GS waves do not send GS_EMIT and 6021 * GS_CUT messages. 6022 * 6023 * For monolithic merged shaders, the first shader is wrapped in an 6024 * if-block together with its prolog in si_build_wrapper_function. 
6025 */ 6026 if (ctx->screen->info.chip_class >= GFX9) { 6027 if (!shader->is_monolithic && 6028 sel->info.num_instructions > 1 && /* not empty shader */ 6029 (shader->key.as_es || shader->key.as_ls) && 6030 (ctx->type == PIPE_SHADER_TESS_EVAL || 6031 (ctx->type == PIPE_SHADER_VERTEX && 6032 !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) { 6033 si_init_exec_from_input(ctx, 6034 ctx->param_merged_wave_info, 0); 6035 } else if (ctx->type == PIPE_SHADER_TESS_CTRL || 6036 ctx->type == PIPE_SHADER_GEOMETRY) { 6037 if (!shader->is_monolithic) 6038 ac_init_exec_full_mask(&ctx->ac); 6039 6040 LLVMValueRef num_threads = si_unpack_param(ctx, ctx->param_merged_wave_info, 8, 8); 6041 LLVMValueRef ena = 6042 LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, 6043 ac_get_thread_id(&ctx->ac), num_threads, ""); 6044 lp_build_if(&ctx->merged_wrap_if_state, &ctx->gallivm, ena); 6045 6046 /* The barrier must execute for all shaders in a 6047 * threadgroup. 6048 * 6049 * Execute the barrier inside the conditional block, 6050 * so that empty waves can jump directly to s_endpgm, 6051 * which will also signal the barrier. 6052 * 6053 * If the shader is TCS and the TCS epilog is present 6054 * and contains a barrier, it will wait there and then 6055 * reach s_endpgm. 6056 */ 6057 si_llvm_emit_barrier(NULL, bld_base, NULL); 6058 } 6059 } 6060 6061 if (ctx->type == PIPE_SHADER_TESS_CTRL && 6062 sel->tcs_info.tessfactors_are_def_in_all_invocs) { 6063 for (unsigned i = 0; i < 6; i++) { 6064 ctx->invoc0_tess_factors[i] = 6065 ac_build_alloca_undef(&ctx->ac, ctx->i32, ""); 6066 } 6067 } 6068 6069 if (ctx->type == PIPE_SHADER_GEOMETRY) { 6070 int i; 6071 for (i = 0; i < 4; i++) { 6072 ctx->gs_next_vertex[i] = 6073 ac_build_alloca(&ctx->ac, ctx->i32, ""); 6074 } 6075 } 6076 6077 if (sel->force_correct_derivs_after_kill) { 6078 ctx->postponed_kill = ac_build_alloca_undef(&ctx->ac, ctx->i1, ""); 6079 /* true = don't kill. 
*/ 6080 LLVMBuildStore(ctx->ac.builder, ctx->i1true, 6081 ctx->postponed_kill); 6082 } 6083 6084 if (sel->tokens) { 6085 if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) { 6086 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n"); 6087 return false; 6088 } 6089 } else { 6090 if (!si_nir_build_llvm(ctx, sel->nir)) { 6091 fprintf(stderr, "Failed to translate shader from NIR to LLVM\n"); 6092 return false; 6093 } 6094 } 6095 6096 si_llvm_build_ret(ctx, ctx->return_value); 6097 return true; 6098} 6099 6100/** 6101 * Compute the VS prolog key, which contains all the information needed to 6102 * build the VS prolog function, and set shader->info bits where needed. 6103 * 6104 * \param info Shader info of the vertex shader. 6105 * \param num_input_sgprs Number of input SGPRs for the vertex shader. 6106 * \param prolog_key Key of the VS prolog 6107 * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS. 6108 * \param key Output shader part key. 6109 */ 6110static void si_get_vs_prolog_key(const struct tgsi_shader_info *info, 6111 unsigned num_input_sgprs, 6112 const struct si_vs_prolog_bits *prolog_key, 6113 struct si_shader *shader_out, 6114 union si_shader_part_key *key) 6115{ 6116 memset(key, 0, sizeof(*key)); 6117 key->vs_prolog.states = *prolog_key; 6118 key->vs_prolog.num_input_sgprs = num_input_sgprs; 6119 key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1; 6120 key->vs_prolog.as_ls = shader_out->key.as_ls; 6121 key->vs_prolog.as_es = shader_out->key.as_es; 6122 6123 if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) { 6124 key->vs_prolog.as_ls = 1; 6125 key->vs_prolog.num_merged_next_stage_vgprs = 2; 6126 } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) { 6127 key->vs_prolog.as_es = 1; 6128 key->vs_prolog.num_merged_next_stage_vgprs = 5; 6129 } 6130 6131 /* Enable loading the InstanceID VGPR. 
*/ 6132 uint16_t input_mask = u_bit_consecutive(0, info->num_inputs); 6133 6134 if ((key->vs_prolog.states.instance_divisor_is_one | 6135 key->vs_prolog.states.instance_divisor_is_fetched) & input_mask) 6136 shader_out->info.uses_instanceid = true; 6137} 6138 6139/** 6140 * Compute the PS prolog key, which contains all the information needed to 6141 * build the PS prolog function, and set related bits in shader->config. 6142 */ 6143static void si_get_ps_prolog_key(struct si_shader *shader, 6144 union si_shader_part_key *key, 6145 bool separate_prolog) 6146{ 6147 struct tgsi_shader_info *info = &shader->selector->info; 6148 6149 memset(key, 0, sizeof(*key)); 6150 key->ps_prolog.states = shader->key.part.ps.prolog; 6151 key->ps_prolog.colors_read = info->colors_read; 6152 key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs; 6153 key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs; 6154 key->ps_prolog.wqm = info->uses_derivatives && 6155 (key->ps_prolog.colors_read || 6156 key->ps_prolog.states.force_persp_sample_interp || 6157 key->ps_prolog.states.force_linear_sample_interp || 6158 key->ps_prolog.states.force_persp_center_interp || 6159 key->ps_prolog.states.force_linear_center_interp || 6160 key->ps_prolog.states.bc_optimize_for_persp || 6161 key->ps_prolog.states.bc_optimize_for_linear); 6162 key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index; 6163 6164 if (info->colors_read) { 6165 unsigned *color = shader->selector->color_attr_index; 6166 6167 if (shader->key.part.ps.prolog.color_two_side) { 6168 /* BCOLORs are stored after the last input. 
*/ 6169 key->ps_prolog.num_interp_inputs = info->num_inputs; 6170 key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index; 6171 shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1); 6172 } 6173 6174 for (unsigned i = 0; i < 2; i++) { 6175 unsigned interp = info->input_interpolate[color[i]]; 6176 unsigned location = info->input_interpolate_loc[color[i]]; 6177 6178 if (!(info->colors_read & (0xf << i*4))) 6179 continue; 6180 6181 key->ps_prolog.color_attr_index[i] = color[i]; 6182 6183 if (shader->key.part.ps.prolog.flatshade_colors && 6184 interp == TGSI_INTERPOLATE_COLOR) 6185 interp = TGSI_INTERPOLATE_CONSTANT; 6186 6187 switch (interp) { 6188 case TGSI_INTERPOLATE_CONSTANT: 6189 key->ps_prolog.color_interp_vgpr_index[i] = -1; 6190 break; 6191 case TGSI_INTERPOLATE_PERSPECTIVE: 6192 case TGSI_INTERPOLATE_COLOR: 6193 /* Force the interpolation location for colors here. */ 6194 if (shader->key.part.ps.prolog.force_persp_sample_interp) 6195 location = TGSI_INTERPOLATE_LOC_SAMPLE; 6196 if (shader->key.part.ps.prolog.force_persp_center_interp) 6197 location = TGSI_INTERPOLATE_LOC_CENTER; 6198 6199 switch (location) { 6200 case TGSI_INTERPOLATE_LOC_SAMPLE: 6201 key->ps_prolog.color_interp_vgpr_index[i] = 0; 6202 shader->config.spi_ps_input_ena |= 6203 S_0286CC_PERSP_SAMPLE_ENA(1); 6204 break; 6205 case TGSI_INTERPOLATE_LOC_CENTER: 6206 key->ps_prolog.color_interp_vgpr_index[i] = 2; 6207 shader->config.spi_ps_input_ena |= 6208 S_0286CC_PERSP_CENTER_ENA(1); 6209 break; 6210 case TGSI_INTERPOLATE_LOC_CENTROID: 6211 key->ps_prolog.color_interp_vgpr_index[i] = 4; 6212 shader->config.spi_ps_input_ena |= 6213 S_0286CC_PERSP_CENTROID_ENA(1); 6214 break; 6215 default: 6216 assert(0); 6217 } 6218 break; 6219 case TGSI_INTERPOLATE_LINEAR: 6220 /* Force the interpolation location for colors here. 
*/ 6221 if (shader->key.part.ps.prolog.force_linear_sample_interp) 6222 location = TGSI_INTERPOLATE_LOC_SAMPLE; 6223 if (shader->key.part.ps.prolog.force_linear_center_interp) 6224 location = TGSI_INTERPOLATE_LOC_CENTER; 6225 6226 /* The VGPR assignment for non-monolithic shaders 6227 * works because InitialPSInputAddr is set on the 6228 * main shader and PERSP_PULL_MODEL is never used. 6229 */ 6230 switch (location) { 6231 case TGSI_INTERPOLATE_LOC_SAMPLE: 6232 key->ps_prolog.color_interp_vgpr_index[i] = 6233 separate_prolog ? 6 : 9; 6234 shader->config.spi_ps_input_ena |= 6235 S_0286CC_LINEAR_SAMPLE_ENA(1); 6236 break; 6237 case TGSI_INTERPOLATE_LOC_CENTER: 6238 key->ps_prolog.color_interp_vgpr_index[i] = 6239 separate_prolog ? 8 : 11; 6240 shader->config.spi_ps_input_ena |= 6241 S_0286CC_LINEAR_CENTER_ENA(1); 6242 break; 6243 case TGSI_INTERPOLATE_LOC_CENTROID: 6244 key->ps_prolog.color_interp_vgpr_index[i] = 6245 separate_prolog ? 10 : 13; 6246 shader->config.spi_ps_input_ena |= 6247 S_0286CC_LINEAR_CENTROID_ENA(1); 6248 break; 6249 default: 6250 assert(0); 6251 } 6252 break; 6253 default: 6254 assert(0); 6255 } 6256 } 6257 } 6258} 6259 6260/** 6261 * Check whether a PS prolog is required based on the key. 6262 */ 6263static bool si_need_ps_prolog(const union si_shader_part_key *key) 6264{ 6265 return key->ps_prolog.colors_read || 6266 key->ps_prolog.states.force_persp_sample_interp || 6267 key->ps_prolog.states.force_linear_sample_interp || 6268 key->ps_prolog.states.force_persp_center_interp || 6269 key->ps_prolog.states.force_linear_center_interp || 6270 key->ps_prolog.states.bc_optimize_for_persp || 6271 key->ps_prolog.states.bc_optimize_for_linear || 6272 key->ps_prolog.states.poly_stipple || 6273 key->ps_prolog.states.samplemask_log_ps_iter; 6274} 6275 6276/** 6277 * Compute the PS epilog key, which contains all the information needed to 6278 * build the PS epilog function. 
6279 */ 6280static void si_get_ps_epilog_key(struct si_shader *shader, 6281 union si_shader_part_key *key) 6282{ 6283 struct tgsi_shader_info *info = &shader->selector->info; 6284 memset(key, 0, sizeof(*key)); 6285 key->ps_epilog.colors_written = info->colors_written; 6286 key->ps_epilog.writes_z = info->writes_z; 6287 key->ps_epilog.writes_stencil = info->writes_stencil; 6288 key->ps_epilog.writes_samplemask = info->writes_samplemask; 6289 key->ps_epilog.states = shader->key.part.ps.epilog; 6290} 6291 6292/** 6293 * Build the GS prolog function. Rotate the input vertices for triangle strips 6294 * with adjacency. 6295 */ 6296static void si_build_gs_prolog_function(struct si_shader_context *ctx, 6297 union si_shader_part_key *key) 6298{ 6299 unsigned num_sgprs, num_vgprs; 6300 struct si_function_info fninfo; 6301 LLVMBuilderRef builder = ctx->ac.builder; 6302 LLVMTypeRef returns[48]; 6303 LLVMValueRef func, ret; 6304 6305 si_init_function_info(&fninfo); 6306 6307 if (ctx->screen->info.chip_class >= GFX9) { 6308 if (key->gs_prolog.states.gfx9_prev_is_vs) 6309 num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR; 6310 else 6311 num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR; 6312 num_vgprs = 5; /* ES inputs are not needed by GS */ 6313 } else { 6314 num_sgprs = GFX6_GS_NUM_USER_SGPR + 2; 6315 num_vgprs = 8; 6316 } 6317 6318 for (unsigned i = 0; i < num_sgprs; ++i) { 6319 add_arg(&fninfo, ARG_SGPR, ctx->i32); 6320 returns[i] = ctx->i32; 6321 } 6322 6323 for (unsigned i = 0; i < num_vgprs; ++i) { 6324 add_arg(&fninfo, ARG_VGPR, ctx->i32); 6325 returns[num_sgprs + i] = ctx->f32; 6326 } 6327 6328 /* Create the function. */ 6329 si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 6330 &fninfo, 0); 6331 func = ctx->main_fn; 6332 6333 /* Set the full EXEC mask for the prolog, because we are only fiddling 6334 * with registers here. The main shader part will set the correct EXEC 6335 * mask. 
6336 */ 6337 if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic) 6338 ac_init_exec_full_mask(&ctx->ac); 6339 6340 /* Copy inputs to outputs. This should be no-op, as the registers match, 6341 * but it will prevent the compiler from overwriting them unintentionally. 6342 */ 6343 ret = ctx->return_value; 6344 for (unsigned i = 0; i < num_sgprs; i++) { 6345 LLVMValueRef p = LLVMGetParam(func, i); 6346 ret = LLVMBuildInsertValue(builder, ret, p, i, ""); 6347 } 6348 for (unsigned i = 0; i < num_vgprs; i++) { 6349 LLVMValueRef p = LLVMGetParam(func, num_sgprs + i); 6350 p = ac_to_float(&ctx->ac, p); 6351 ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, ""); 6352 } 6353 6354 if (key->gs_prolog.states.tri_strip_adj_fix) { 6355 /* Remap the input vertices for every other primitive. */ 6356 const unsigned gfx6_vtx_params[6] = { 6357 num_sgprs, 6358 num_sgprs + 1, 6359 num_sgprs + 3, 6360 num_sgprs + 4, 6361 num_sgprs + 5, 6362 num_sgprs + 6 6363 }; 6364 const unsigned gfx9_vtx_params[3] = { 6365 num_sgprs, 6366 num_sgprs + 1, 6367 num_sgprs + 4, 6368 }; 6369 LLVMValueRef vtx_in[6], vtx_out[6]; 6370 LLVMValueRef prim_id, rotate; 6371 6372 if (ctx->screen->info.chip_class >= GFX9) { 6373 for (unsigned i = 0; i < 3; i++) { 6374 vtx_in[i*2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16); 6375 vtx_in[i*2+1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16); 6376 } 6377 } else { 6378 for (unsigned i = 0; i < 6; i++) 6379 vtx_in[i] = LLVMGetParam(func, gfx6_vtx_params[i]); 6380 } 6381 6382 prim_id = LLVMGetParam(func, num_sgprs + 2); 6383 rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, ""); 6384 6385 for (unsigned i = 0; i < 6; ++i) { 6386 LLVMValueRef base, rotated; 6387 base = vtx_in[i]; 6388 rotated = vtx_in[(i + 4) % 6]; 6389 vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, ""); 6390 } 6391 6392 if (ctx->screen->info.chip_class >= GFX9) { 6393 for (unsigned i = 0; i < 3; i++) { 6394 LLVMValueRef hi, out; 6395 6396 hi = 
LLVMBuildShl(builder, vtx_out[i*2+1], 6397 LLVMConstInt(ctx->i32, 16, 0), ""); 6398 out = LLVMBuildOr(builder, vtx_out[i*2], hi, ""); 6399 out = ac_to_float(&ctx->ac, out); 6400 ret = LLVMBuildInsertValue(builder, ret, out, 6401 gfx9_vtx_params[i], ""); 6402 } 6403 } else { 6404 for (unsigned i = 0; i < 6; i++) { 6405 LLVMValueRef out; 6406 6407 out = ac_to_float(&ctx->ac, vtx_out[i]); 6408 ret = LLVMBuildInsertValue(builder, ret, out, 6409 gfx6_vtx_params[i], ""); 6410 } 6411 } 6412 } 6413 6414 LLVMBuildRet(builder, ret); 6415} 6416 6417/** 6418 * Given a list of shader part functions, build a wrapper function that 6419 * runs them in sequence to form a monolithic shader. 6420 */ 6421static void si_build_wrapper_function(struct si_shader_context *ctx, 6422 LLVMValueRef *parts, 6423 unsigned num_parts, 6424 unsigned main_part, 6425 unsigned next_shader_first_part) 6426{ 6427 LLVMBuilderRef builder = ctx->ac.builder; 6428 /* PS epilog has one arg per color component; gfx9 merged shader 6429 * prologs need to forward 32 user SGPRs. 6430 */ 6431 struct si_function_info fninfo; 6432 LLVMValueRef initial[64], out[64]; 6433 LLVMTypeRef function_type; 6434 unsigned num_first_params; 6435 unsigned num_out, initial_num_out; 6436 MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */ 6437 MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */ 6438 unsigned num_sgprs, num_vgprs; 6439 unsigned gprs; 6440 struct lp_build_if_state if_state; 6441 6442 si_init_function_info(&fninfo); 6443 6444 for (unsigned i = 0; i < num_parts; ++i) { 6445 ac_add_function_attr(ctx->ac.context, parts[i], -1, 6446 AC_FUNC_ATTR_ALWAYSINLINE); 6447 LLVMSetLinkage(parts[i], LLVMPrivateLinkage); 6448 } 6449 6450 /* The parameters of the wrapper function correspond to those of the 6451 * first part in terms of SGPRs and VGPRs, but we use the types of the 6452 * main part to get the right types. 
This is relevant for the 6453 * dereferenceable attribute on descriptor table pointers. 6454 */ 6455 num_sgprs = 0; 6456 num_vgprs = 0; 6457 6458 function_type = LLVMGetElementType(LLVMTypeOf(parts[0])); 6459 num_first_params = LLVMCountParamTypes(function_type); 6460 6461 for (unsigned i = 0; i < num_first_params; ++i) { 6462 LLVMValueRef param = LLVMGetParam(parts[0], i); 6463 6464 if (ac_is_sgpr_param(param)) { 6465 assert(num_vgprs == 0); 6466 num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; 6467 } else { 6468 num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; 6469 } 6470 } 6471 6472 gprs = 0; 6473 while (gprs < num_sgprs + num_vgprs) { 6474 LLVMValueRef param = LLVMGetParam(parts[main_part], fninfo.num_params); 6475 LLVMTypeRef type = LLVMTypeOf(param); 6476 unsigned size = ac_get_type_size(type) / 4; 6477 6478 add_arg(&fninfo, gprs < num_sgprs ? ARG_SGPR : ARG_VGPR, type); 6479 6480 assert(ac_is_sgpr_param(param) == (gprs < num_sgprs)); 6481 assert(gprs + size <= num_sgprs + num_vgprs && 6482 (gprs >= num_sgprs || gprs + size <= num_sgprs)); 6483 6484 gprs += size; 6485 } 6486 6487 si_create_function(ctx, "wrapper", NULL, 0, &fninfo, 6488 si_get_max_workgroup_size(ctx->shader)); 6489 6490 if (is_merged_shader(ctx)) 6491 ac_init_exec_full_mask(&ctx->ac); 6492 6493 /* Record the arguments of the function as if they were an output of 6494 * a previous part. 6495 */ 6496 num_out = 0; 6497 num_out_sgpr = 0; 6498 6499 for (unsigned i = 0; i < fninfo.num_params; ++i) { 6500 LLVMValueRef param = LLVMGetParam(ctx->main_fn, i); 6501 LLVMTypeRef param_type = LLVMTypeOf(param); 6502 LLVMTypeRef out_type = i < fninfo.num_sgpr_params ? 
ctx->i32 : ctx->f32; 6503 unsigned size = ac_get_type_size(param_type) / 4; 6504 6505 if (size == 1) { 6506 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { 6507 param = LLVMBuildPtrToInt(builder, param, ctx->i32, ""); 6508 param_type = ctx->i32; 6509 } 6510 6511 if (param_type != out_type) 6512 param = LLVMBuildBitCast(builder, param, out_type, ""); 6513 out[num_out++] = param; 6514 } else { 6515 LLVMTypeRef vector_type = LLVMVectorType(out_type, size); 6516 6517 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { 6518 param = LLVMBuildPtrToInt(builder, param, ctx->i64, ""); 6519 param_type = ctx->i64; 6520 } 6521 6522 if (param_type != vector_type) 6523 param = LLVMBuildBitCast(builder, param, vector_type, ""); 6524 6525 for (unsigned j = 0; j < size; ++j) 6526 out[num_out++] = LLVMBuildExtractElement( 6527 builder, param, LLVMConstInt(ctx->i32, j, 0), ""); 6528 } 6529 6530 if (i < fninfo.num_sgpr_params) 6531 num_out_sgpr = num_out; 6532 } 6533 6534 memcpy(initial, out, sizeof(out)); 6535 initial_num_out = num_out; 6536 initial_num_out_sgpr = num_out_sgpr; 6537 6538 /* Now chain the parts. */ 6539 for (unsigned part = 0; part < num_parts; ++part) { 6540 LLVMValueRef in[48]; 6541 LLVMValueRef ret; 6542 LLVMTypeRef ret_type; 6543 unsigned out_idx = 0; 6544 unsigned num_params = LLVMCountParams(parts[part]); 6545 6546 /* Merged shaders are executed conditionally depending 6547 * on the number of enabled threads passed in the input SGPRs. */ 6548 if (is_merged_shader(ctx) && part == 0) { 6549 LLVMValueRef ena, count = initial[3]; 6550 6551 count = LLVMBuildAnd(builder, count, 6552 LLVMConstInt(ctx->i32, 0x7f, 0), ""); 6553 ena = LLVMBuildICmp(builder, LLVMIntULT, 6554 ac_get_thread_id(&ctx->ac), count, ""); 6555 lp_build_if(&if_state, &ctx->gallivm, ena); 6556 } 6557 6558 /* Derive arguments for the next part from outputs of the 6559 * previous one. 
6560 */ 6561 for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) { 6562 LLVMValueRef param; 6563 LLVMTypeRef param_type; 6564 bool is_sgpr; 6565 unsigned param_size; 6566 LLVMValueRef arg = NULL; 6567 6568 param = LLVMGetParam(parts[part], param_idx); 6569 param_type = LLVMTypeOf(param); 6570 param_size = ac_get_type_size(param_type) / 4; 6571 is_sgpr = ac_is_sgpr_param(param); 6572 6573 if (is_sgpr) { 6574 ac_add_function_attr(ctx->ac.context, parts[part], 6575 param_idx + 1, AC_FUNC_ATTR_INREG); 6576 } else if (out_idx < num_out_sgpr) { 6577 /* Skip returned SGPRs the current part doesn't 6578 * declare on the input. */ 6579 out_idx = num_out_sgpr; 6580 } 6581 6582 assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out)); 6583 6584 if (param_size == 1) 6585 arg = out[out_idx]; 6586 else 6587 arg = ac_build_gather_values(&ctx->ac, &out[out_idx], param_size); 6588 6589 if (LLVMTypeOf(arg) != param_type) { 6590 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { 6591 if (LLVMGetPointerAddressSpace(param_type) == 6592 AC_ADDR_SPACE_CONST_32BIT) { 6593 arg = LLVMBuildBitCast(builder, arg, ctx->i32, ""); 6594 arg = LLVMBuildIntToPtr(builder, arg, param_type, ""); 6595 } else { 6596 arg = LLVMBuildBitCast(builder, arg, ctx->i64, ""); 6597 arg = LLVMBuildIntToPtr(builder, arg, param_type, ""); 6598 } 6599 } else { 6600 arg = LLVMBuildBitCast(builder, arg, param_type, ""); 6601 } 6602 } 6603 6604 in[param_idx] = arg; 6605 out_idx += param_size; 6606 } 6607 6608 ret = LLVMBuildCall(builder, parts[part], in, num_params, ""); 6609 6610 if (is_merged_shader(ctx) && 6611 part + 1 == next_shader_first_part) { 6612 lp_build_endif(&if_state); 6613 6614 /* The second half of the merged shader should use 6615 * the inputs from the toplevel (wrapper) function, 6616 * not the return value from the last call. 6617 * 6618 * That's because the last call was executed condi- 6619 * tionally, so we can't consume it in the main 6620 * block. 
6621 */ 6622 memcpy(out, initial, sizeof(initial)); 6623 num_out = initial_num_out; 6624 num_out_sgpr = initial_num_out_sgpr; 6625 continue; 6626 } 6627 6628 /* Extract the returned GPRs. */ 6629 ret_type = LLVMTypeOf(ret); 6630 num_out = 0; 6631 num_out_sgpr = 0; 6632 6633 if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) { 6634 assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind); 6635 6636 unsigned ret_size = LLVMCountStructElementTypes(ret_type); 6637 6638 for (unsigned i = 0; i < ret_size; ++i) { 6639 LLVMValueRef val = 6640 LLVMBuildExtractValue(builder, ret, i, ""); 6641 6642 assert(num_out < ARRAY_SIZE(out)); 6643 out[num_out++] = val; 6644 6645 if (LLVMTypeOf(val) == ctx->i32) { 6646 assert(num_out_sgpr + 1 == num_out); 6647 num_out_sgpr = num_out; 6648 } 6649 } 6650 } 6651 } 6652 6653 LLVMBuildRetVoid(builder); 6654} 6655 6656static bool si_should_optimize_less(struct ac_llvm_compiler *compiler, 6657 struct si_shader_selector *sel) 6658{ 6659 if (!compiler->low_opt_passes) 6660 return false; 6661 6662 /* Assume a slow CPU. */ 6663 assert(!sel->screen->info.has_dedicated_vram && 6664 sel->screen->info.chip_class <= VI); 6665 6666 /* For a crazy dEQP test containing 2597 memory opcodes, mostly 6667 * buffer stores. */ 6668 return sel->type == PIPE_SHADER_COMPUTE && 6669 sel->info.num_memory_instructions > 1000; 6670} 6671 6672int si_compile_tgsi_shader(struct si_screen *sscreen, 6673 struct ac_llvm_compiler *compiler, 6674 struct si_shader *shader, 6675 struct pipe_debug_callback *debug) 6676{ 6677 struct si_shader_selector *sel = shader->selector; 6678 struct si_shader_context ctx; 6679 int r = -1; 6680 6681 /* Dump TGSI code before doing TGSI->LLVM conversion in case the 6682 * conversion fails. 
*/ 6683 if (si_can_dump_shader(sscreen, sel->info.processor) && 6684 !(sscreen->debug_flags & DBG(NO_TGSI))) { 6685 if (sel->tokens) 6686 tgsi_dump(sel->tokens, 0); 6687 else 6688 nir_print_shader(sel->nir, stderr); 6689 si_dump_streamout(&sel->so); 6690 } 6691 6692 si_init_shader_ctx(&ctx, sscreen, compiler); 6693 si_llvm_context_set_tgsi(&ctx, shader); 6694 6695 memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, 6696 sizeof(shader->info.vs_output_param_offset)); 6697 6698 shader->info.uses_instanceid = sel->info.uses_instanceid; 6699 6700 if (!si_compile_tgsi_main(&ctx)) { 6701 si_llvm_dispose(&ctx); 6702 return -1; 6703 } 6704 6705 if (shader->is_monolithic && ctx.type == PIPE_SHADER_VERTEX) { 6706 LLVMValueRef parts[2]; 6707 bool need_prolog = sel->vs_needs_prolog; 6708 6709 parts[1] = ctx.main_fn; 6710 6711 if (need_prolog) { 6712 union si_shader_part_key prolog_key; 6713 si_get_vs_prolog_key(&sel->info, 6714 shader->info.num_input_sgprs, 6715 &shader->key.part.vs.prolog, 6716 shader, &prolog_key); 6717 si_build_vs_prolog_function(&ctx, &prolog_key); 6718 parts[0] = ctx.main_fn; 6719 } 6720 6721 si_build_wrapper_function(&ctx, parts + !need_prolog, 6722 1 + need_prolog, need_prolog, 0); 6723 } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) { 6724 if (sscreen->info.chip_class >= GFX9) { 6725 struct si_shader_selector *ls = shader->key.part.tcs.ls; 6726 LLVMValueRef parts[4]; 6727 bool vs_needs_prolog = 6728 si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog); 6729 6730 /* TCS main part */ 6731 parts[2] = ctx.main_fn; 6732 6733 /* TCS epilog */ 6734 union si_shader_part_key tcs_epilog_key; 6735 memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key)); 6736 tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; 6737 si_build_tcs_epilog_function(&ctx, &tcs_epilog_key); 6738 parts[3] = ctx.main_fn; 6739 6740 /* VS as LS main part */ 6741 struct si_shader shader_ls = {}; 6742 shader_ls.selector = ls; 6743 
shader_ls.key.as_ls = 1; 6744 shader_ls.key.mono = shader->key.mono; 6745 shader_ls.key.opt = shader->key.opt; 6746 shader_ls.is_monolithic = true; 6747 si_llvm_context_set_tgsi(&ctx, &shader_ls); 6748 6749 if (!si_compile_tgsi_main(&ctx)) { 6750 si_llvm_dispose(&ctx); 6751 return -1; 6752 } 6753 shader->info.uses_instanceid |= ls->info.uses_instanceid; 6754 parts[1] = ctx.main_fn; 6755 6756 /* LS prolog */ 6757 if (vs_needs_prolog) { 6758 union si_shader_part_key vs_prolog_key; 6759 si_get_vs_prolog_key(&ls->info, 6760 shader_ls.info.num_input_sgprs, 6761 &shader->key.part.tcs.ls_prolog, 6762 shader, &vs_prolog_key); 6763 vs_prolog_key.vs_prolog.is_monolithic = true; 6764 si_build_vs_prolog_function(&ctx, &vs_prolog_key); 6765 parts[0] = ctx.main_fn; 6766 } 6767 6768 /* Reset the shader context. */ 6769 ctx.shader = shader; 6770 ctx.type = PIPE_SHADER_TESS_CTRL; 6771 6772 si_build_wrapper_function(&ctx, 6773 parts + !vs_needs_prolog, 6774 4 - !vs_needs_prolog, vs_needs_prolog, 6775 vs_needs_prolog ? 
2 : 1); 6776 } else { 6777 LLVMValueRef parts[2]; 6778 union si_shader_part_key epilog_key; 6779 6780 parts[0] = ctx.main_fn; 6781 6782 memset(&epilog_key, 0, sizeof(epilog_key)); 6783 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; 6784 si_build_tcs_epilog_function(&ctx, &epilog_key); 6785 parts[1] = ctx.main_fn; 6786 6787 si_build_wrapper_function(&ctx, parts, 2, 0, 0); 6788 } 6789 } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) { 6790 if (ctx.screen->info.chip_class >= GFX9) { 6791 struct si_shader_selector *es = shader->key.part.gs.es; 6792 LLVMValueRef es_prolog = NULL; 6793 LLVMValueRef es_main = NULL; 6794 LLVMValueRef gs_prolog = NULL; 6795 LLVMValueRef gs_main = ctx.main_fn; 6796 6797 /* GS prolog */ 6798 union si_shader_part_key gs_prolog_key; 6799 memset(&gs_prolog_key, 0, sizeof(gs_prolog_key)); 6800 gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog; 6801 gs_prolog_key.gs_prolog.is_monolithic = true; 6802 si_build_gs_prolog_function(&ctx, &gs_prolog_key); 6803 gs_prolog = ctx.main_fn; 6804 6805 /* ES main part */ 6806 struct si_shader shader_es = {}; 6807 shader_es.selector = es; 6808 shader_es.key.as_es = 1; 6809 shader_es.key.mono = shader->key.mono; 6810 shader_es.key.opt = shader->key.opt; 6811 shader_es.is_monolithic = true; 6812 si_llvm_context_set_tgsi(&ctx, &shader_es); 6813 6814 if (!si_compile_tgsi_main(&ctx)) { 6815 si_llvm_dispose(&ctx); 6816 return -1; 6817 } 6818 shader->info.uses_instanceid |= es->info.uses_instanceid; 6819 es_main = ctx.main_fn; 6820 6821 /* ES prolog */ 6822 if (es->vs_needs_prolog) { 6823 union si_shader_part_key vs_prolog_key; 6824 si_get_vs_prolog_key(&es->info, 6825 shader_es.info.num_input_sgprs, 6826 &shader->key.part.gs.vs_prolog, 6827 shader, &vs_prolog_key); 6828 vs_prolog_key.vs_prolog.is_monolithic = true; 6829 si_build_vs_prolog_function(&ctx, &vs_prolog_key); 6830 es_prolog = ctx.main_fn; 6831 } 6832 6833 /* Reset the shader context. 
*/ 6834 ctx.shader = shader; 6835 ctx.type = PIPE_SHADER_GEOMETRY; 6836 6837 /* Prepare the array of shader parts. */ 6838 LLVMValueRef parts[4]; 6839 unsigned num_parts = 0, main_part, next_first_part; 6840 6841 if (es_prolog) 6842 parts[num_parts++] = es_prolog; 6843 6844 parts[main_part = num_parts++] = es_main; 6845 parts[next_first_part = num_parts++] = gs_prolog; 6846 parts[num_parts++] = gs_main; 6847 6848 si_build_wrapper_function(&ctx, parts, num_parts, 6849 main_part, next_first_part); 6850 } else { 6851 LLVMValueRef parts[2]; 6852 union si_shader_part_key prolog_key; 6853 6854 parts[1] = ctx.main_fn; 6855 6856 memset(&prolog_key, 0, sizeof(prolog_key)); 6857 prolog_key.gs_prolog.states = shader->key.part.gs.prolog; 6858 si_build_gs_prolog_function(&ctx, &prolog_key); 6859 parts[0] = ctx.main_fn; 6860 6861 si_build_wrapper_function(&ctx, parts, 2, 1, 0); 6862 } 6863 } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) { 6864 LLVMValueRef parts[3]; 6865 union si_shader_part_key prolog_key; 6866 union si_shader_part_key epilog_key; 6867 bool need_prolog; 6868 6869 si_get_ps_prolog_key(shader, &prolog_key, false); 6870 need_prolog = si_need_ps_prolog(&prolog_key); 6871 6872 parts[need_prolog ? 1 : 0] = ctx.main_fn; 6873 6874 if (need_prolog) { 6875 si_build_ps_prolog_function(&ctx, &prolog_key); 6876 parts[0] = ctx.main_fn; 6877 } 6878 6879 si_get_ps_epilog_key(shader, &epilog_key); 6880 si_build_ps_epilog_function(&ctx, &epilog_key); 6881 parts[need_prolog ? 2 : 1] = ctx.main_fn; 6882 6883 si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2, 6884 need_prolog ? 1 : 0, 0); 6885 } 6886 6887 si_llvm_optimize_module(&ctx); 6888 6889 /* Post-optimization transformations and analysis. 
*/ 6890 si_optimize_vs_outputs(&ctx); 6891 6892 if ((debug && debug->debug_message) || 6893 si_can_dump_shader(sscreen, ctx.type)) { 6894 ctx.shader->config.private_mem_vgprs = 6895 ac_count_scratch_private_memory(ctx.main_fn); 6896 } 6897 6898 /* Make sure the input is a pointer and not integer followed by inttoptr. */ 6899 assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == 6900 LLVMPointerTypeKind); 6901 6902 /* Compile to bytecode. */ 6903 r = si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, 6904 ctx.ac.module, debug, ctx.type, 6905 si_get_shader_name(shader, ctx.type), 6906 si_should_optimize_less(compiler, shader->selector)); 6907 si_llvm_dispose(&ctx); 6908 if (r) { 6909 fprintf(stderr, "LLVM failed to compile shader\n"); 6910 return r; 6911 } 6912 6913 /* Validate SGPR and VGPR usage for compute to detect compiler bugs. 6914 * LLVM 3.9svn has this bug. 6915 */ 6916 if (sel->type == PIPE_SHADER_COMPUTE) { 6917 unsigned wave_size = 64; 6918 unsigned max_vgprs = 256; 6919 unsigned max_sgprs = sscreen->info.chip_class >= VI ? 800 : 512; 6920 unsigned max_sgprs_per_wave = 128; 6921 unsigned max_block_threads = si_get_max_workgroup_size(shader); 6922 unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size); 6923 unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4); 6924 6925 max_vgprs = max_vgprs / min_waves_per_simd; 6926 max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave); 6927 6928 if (shader->config.num_sgprs > max_sgprs || 6929 shader->config.num_vgprs > max_vgprs) { 6930 fprintf(stderr, "LLVM failed to compile a shader correctly: " 6931 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n", 6932 shader->config.num_sgprs, shader->config.num_vgprs, 6933 max_sgprs, max_vgprs); 6934 6935 /* Just terminate the process, because dependent 6936 * shaders can hang due to bad input data, but use 6937 * the env var to allow shader-db to work. 
6938 */ 6939 if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false)) 6940 abort(); 6941 } 6942 } 6943 6944 /* Add the scratch offset to input SGPRs. */ 6945 if (shader->config.scratch_bytes_per_wave && !is_merged_shader(&ctx)) 6946 shader->info.num_input_sgprs += 1; /* scratch byte offset */ 6947 6948 /* Calculate the number of fragment input VGPRs. */ 6949 if (ctx.type == PIPE_SHADER_FRAGMENT) { 6950 shader->info.num_input_vgprs = 0; 6951 shader->info.face_vgpr_index = -1; 6952 shader->info.ancillary_vgpr_index = -1; 6953 6954 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr)) 6955 shader->info.num_input_vgprs += 2; 6956 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr)) 6957 shader->info.num_input_vgprs += 2; 6958 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr)) 6959 shader->info.num_input_vgprs += 2; 6960 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr)) 6961 shader->info.num_input_vgprs += 3; 6962 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr)) 6963 shader->info.num_input_vgprs += 2; 6964 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr)) 6965 shader->info.num_input_vgprs += 2; 6966 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr)) 6967 shader->info.num_input_vgprs += 2; 6968 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr)) 6969 shader->info.num_input_vgprs += 1; 6970 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr)) 6971 shader->info.num_input_vgprs += 1; 6972 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr)) 6973 shader->info.num_input_vgprs += 1; 6974 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr)) 6975 shader->info.num_input_vgprs += 1; 6976 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr)) 6977 shader->info.num_input_vgprs += 1; 6978 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) { 6979 shader->info.face_vgpr_index = shader->info.num_input_vgprs; 
6980 shader->info.num_input_vgprs += 1; 6981 } 6982 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr)) { 6983 shader->info.ancillary_vgpr_index = shader->info.num_input_vgprs; 6984 shader->info.num_input_vgprs += 1; 6985 } 6986 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr)) 6987 shader->info.num_input_vgprs += 1; 6988 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr)) 6989 shader->info.num_input_vgprs += 1; 6990 } 6991 6992 si_calculate_max_simd_waves(shader); 6993 si_shader_dump_stats_for_shader_db(shader, debug); 6994 return 0; 6995} 6996 6997/** 6998 * Create, compile and return a shader part (prolog or epilog). 6999 * 7000 * \param sscreen screen 7001 * \param list list of shader parts of the same category 7002 * \param type shader type 7003 * \param key shader part key 7004 * \param prolog whether the part being requested is a prolog 7005 * \param tm LLVM target machine 7006 * \param debug debug callback 7007 * \param build the callback responsible for building the main function 7008 * \return non-NULL on success 7009 */ 7010static struct si_shader_part * 7011si_get_shader_part(struct si_screen *sscreen, 7012 struct si_shader_part **list, 7013 enum pipe_shader_type type, 7014 bool prolog, 7015 union si_shader_part_key *key, 7016 struct ac_llvm_compiler *compiler, 7017 struct pipe_debug_callback *debug, 7018 void (*build)(struct si_shader_context *, 7019 union si_shader_part_key *), 7020 const char *name) 7021{ 7022 struct si_shader_part *result; 7023 7024 mtx_lock(&sscreen->shader_parts_mutex); 7025 7026 /* Find existing. */ 7027 for (result = *list; result; result = result->next) { 7028 if (memcmp(&result->key, key, sizeof(*key)) == 0) { 7029 mtx_unlock(&sscreen->shader_parts_mutex); 7030 return result; 7031 } 7032 } 7033 7034 /* Compile a new one. 
*/ 7035 result = CALLOC_STRUCT(si_shader_part); 7036 result->key = *key; 7037 7038 struct si_shader shader = {}; 7039 struct si_shader_context ctx; 7040 7041 si_init_shader_ctx(&ctx, sscreen, compiler); 7042 ctx.shader = &shader; 7043 ctx.type = type; 7044 7045 switch (type) { 7046 case PIPE_SHADER_VERTEX: 7047 shader.key.as_ls = key->vs_prolog.as_ls; 7048 shader.key.as_es = key->vs_prolog.as_es; 7049 break; 7050 case PIPE_SHADER_TESS_CTRL: 7051 assert(!prolog); 7052 shader.key.part.tcs.epilog = key->tcs_epilog.states; 7053 break; 7054 case PIPE_SHADER_GEOMETRY: 7055 assert(prolog); 7056 break; 7057 case PIPE_SHADER_FRAGMENT: 7058 if (prolog) 7059 shader.key.part.ps.prolog = key->ps_prolog.states; 7060 else 7061 shader.key.part.ps.epilog = key->ps_epilog.states; 7062 break; 7063 default: 7064 unreachable("bad shader part"); 7065 } 7066 7067 build(&ctx, key); 7068 7069 /* Compile. */ 7070 si_llvm_optimize_module(&ctx); 7071 7072 if (si_compile_llvm(sscreen, &result->binary, &result->config, compiler, 7073 ctx.ac.module, debug, ctx.type, name, false)) { 7074 FREE(result); 7075 result = NULL; 7076 goto out; 7077 } 7078 7079 result->next = *list; 7080 *list = result; 7081 7082out: 7083 si_llvm_dispose(&ctx); 7084 mtx_unlock(&sscreen->shader_parts_mutex); 7085 return result; 7086} 7087 7088static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx) 7089{ 7090 LLVMValueRef ptr[2], list; 7091 bool merged_shader = is_merged_shader(ctx); 7092 7093 ptr[0] = LLVMGetParam(ctx->main_fn, (merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS); 7094 list = LLVMBuildIntToPtr(ctx->ac.builder, ptr[0], 7095 ac_array_in_const32_addr_space(ctx->v4i32), ""); 7096 return list; 7097} 7098 7099/** 7100 * Build the vertex shader prolog function. 7101 * 7102 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values). 7103 * All inputs are returned unmodified. The vertex load indices are 7104 * stored after them, which will be used by the API VS for fetching inputs. 
7105 * 7106 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are: 7107 * input_v0, 7108 * input_v1, 7109 * input_v2, 7110 * input_v3, 7111 * (VertexID + BaseVertex), 7112 * (InstanceID + StartInstance), 7113 * (InstanceID / 2 + StartInstance) 7114 */ 7115static void si_build_vs_prolog_function(struct si_shader_context *ctx, 7116 union si_shader_part_key *key) 7117{ 7118 struct si_function_info fninfo; 7119 LLVMTypeRef *returns; 7120 LLVMValueRef ret, func; 7121 int num_returns, i; 7122 unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs; 7123 unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4; 7124 LLVMValueRef input_vgprs[9]; 7125 unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs + 7126 num_input_vgprs; 7127 unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0; 7128 7129 si_init_function_info(&fninfo); 7130 7131 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */ 7132 returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) * 7133 sizeof(LLVMTypeRef)); 7134 num_returns = 0; 7135 7136 /* Declare input and output SGPRs. */ 7137 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { 7138 add_arg(&fninfo, ARG_SGPR, ctx->i32); 7139 returns[num_returns++] = ctx->i32; 7140 } 7141 7142 /* Preloaded VGPRs (outputs must be floats) */ 7143 for (i = 0; i < num_input_vgprs; i++) { 7144 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &input_vgprs[i]); 7145 returns[num_returns++] = ctx->f32; 7146 } 7147 7148 /* Vertex load indices. */ 7149 for (i = 0; i <= key->vs_prolog.last_input; i++) 7150 returns[num_returns++] = ctx->f32; 7151 7152 /* Create the function. 
*/ 7153 si_create_function(ctx, "vs_prolog", returns, num_returns, &fninfo, 0); 7154 func = ctx->main_fn; 7155 7156 if (key->vs_prolog.num_merged_next_stage_vgprs) { 7157 if (!key->vs_prolog.is_monolithic) 7158 si_init_exec_from_input(ctx, 3, 0); 7159 7160 if (key->vs_prolog.as_ls && 7161 ctx->screen->has_ls_vgpr_init_bug) { 7162 /* If there are no HS threads, SPI loads the LS VGPRs 7163 * starting at VGPR 0. Shift them back to where they 7164 * belong. 7165 */ 7166 LLVMValueRef has_hs_threads = 7167 LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, 7168 si_unpack_param(ctx, 3, 8, 8), 7169 ctx->i32_0, ""); 7170 7171 for (i = 4; i > 0; --i) { 7172 input_vgprs[i + 1] = 7173 LLVMBuildSelect(ctx->ac.builder, has_hs_threads, 7174 input_vgprs[i + 1], 7175 input_vgprs[i - 1], ""); 7176 } 7177 } 7178 } 7179 7180 ctx->abi.vertex_id = input_vgprs[first_vs_vgpr]; 7181 ctx->abi.instance_id = input_vgprs[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)]; 7182 7183 /* Copy inputs to outputs. This should be no-op, as the registers match, 7184 * but it will prevent the compiler from overwriting them unintentionally. 7185 */ 7186 ret = ctx->return_value; 7187 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { 7188 LLVMValueRef p = LLVMGetParam(func, i); 7189 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, ""); 7190 } 7191 for (i = 0; i < num_input_vgprs; i++) { 7192 LLVMValueRef p = input_vgprs[i]; 7193 p = ac_to_float(&ctx->ac, p); 7194 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, 7195 key->vs_prolog.num_input_sgprs + i, ""); 7196 } 7197 7198 /* Compute vertex load indices from instance divisors. 
*/ 7199 LLVMValueRef instance_divisor_constbuf = NULL; 7200 7201 if (key->vs_prolog.states.instance_divisor_is_fetched) { 7202 LLVMValueRef list = si_prolog_get_rw_buffers(ctx); 7203 LLVMValueRef buf_index = 7204 LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0); 7205 instance_divisor_constbuf = 7206 ac_build_load_to_sgpr(&ctx->ac, list, buf_index); 7207 } 7208 7209 for (i = 0; i <= key->vs_prolog.last_input; i++) { 7210 bool divisor_is_one = 7211 key->vs_prolog.states.instance_divisor_is_one & (1u << i); 7212 bool divisor_is_fetched = 7213 key->vs_prolog.states.instance_divisor_is_fetched & (1u << i); 7214 LLVMValueRef index = NULL; 7215 7216 if (divisor_is_one) { 7217 index = ctx->abi.instance_id; 7218 } else if (divisor_is_fetched) { 7219 LLVMValueRef udiv_factors[4]; 7220 7221 for (unsigned j = 0; j < 4; j++) { 7222 udiv_factors[j] = 7223 buffer_load_const(ctx, instance_divisor_constbuf, 7224 LLVMConstInt(ctx->i32, i*16 + j*4, 0)); 7225 udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]); 7226 } 7227 /* The faster NUW version doesn't work when InstanceID == UINT_MAX. 7228 * Such InstanceID might not be achievable in a reasonable time though. 7229 */ 7230 index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id, 7231 udiv_factors[0], udiv_factors[1], 7232 udiv_factors[2], udiv_factors[3]); 7233 } 7234 7235 if (divisor_is_one || divisor_is_fetched) { 7236 /* Add StartInstance. 
*/ 7237 index = LLVMBuildAdd(ctx->ac.builder, index, 7238 LLVMGetParam(ctx->main_fn, user_sgpr_base + 7239 SI_SGPR_START_INSTANCE), ""); 7240 } else { 7241 /* VertexID + BaseVertex */ 7242 index = LLVMBuildAdd(ctx->ac.builder, 7243 ctx->abi.vertex_id, 7244 LLVMGetParam(func, user_sgpr_base + 7245 SI_SGPR_BASE_VERTEX), ""); 7246 } 7247 7248 index = ac_to_float(&ctx->ac, index); 7249 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index, 7250 fninfo.num_params + i, ""); 7251 } 7252 7253 si_llvm_build_ret(ctx, ret); 7254} 7255 7256static bool si_get_vs_prolog(struct si_screen *sscreen, 7257 struct ac_llvm_compiler *compiler, 7258 struct si_shader *shader, 7259 struct pipe_debug_callback *debug, 7260 struct si_shader *main_part, 7261 const struct si_vs_prolog_bits *key) 7262{ 7263 struct si_shader_selector *vs = main_part->selector; 7264 7265 if (!si_vs_needs_prolog(vs, key)) 7266 return true; 7267 7268 /* Get the prolog. */ 7269 union si_shader_part_key prolog_key; 7270 si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, 7271 key, shader, &prolog_key); 7272 7273 shader->prolog = 7274 si_get_shader_part(sscreen, &sscreen->vs_prologs, 7275 PIPE_SHADER_VERTEX, true, &prolog_key, compiler, 7276 debug, si_build_vs_prolog_function, 7277 "Vertex Shader Prolog"); 7278 return shader->prolog != NULL; 7279} 7280 7281/** 7282 * Select and compile (or reuse) vertex shader parts (prolog & epilog). 7283 */ 7284static bool si_shader_select_vs_parts(struct si_screen *sscreen, 7285 struct ac_llvm_compiler *compiler, 7286 struct si_shader *shader, 7287 struct pipe_debug_callback *debug) 7288{ 7289 return si_get_vs_prolog(sscreen, compiler, shader, debug, shader, 7290 &shader->key.part.vs.prolog); 7291} 7292 7293/** 7294 * Compile the TCS epilog function. This writes tesselation factors to memory 7295 * based on the output primitive type of the tesselator (determined by TES). 
7296 */ 7297static void si_build_tcs_epilog_function(struct si_shader_context *ctx, 7298 union si_shader_part_key *key) 7299{ 7300 struct lp_build_tgsi_context *bld_base = &ctx->bld_base; 7301 struct si_function_info fninfo; 7302 LLVMValueRef func; 7303 7304 si_init_function_info(&fninfo); 7305 7306 if (ctx->screen->info.chip_class >= GFX9) { 7307 add_arg(&fninfo, ARG_SGPR, ctx->i32); 7308 add_arg(&fninfo, ARG_SGPR, ctx->i32); 7309 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 7310 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* wave info */ 7311 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 7312 add_arg(&fninfo, ARG_SGPR, ctx->i32); 7313 add_arg(&fninfo, ARG_SGPR, ctx->i32); 7314 add_arg(&fninfo, ARG_SGPR, ctx->i32); 7315 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); 7316 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); 7317 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); 7318 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); 7319 add_arg(&fninfo, ARG_SGPR, ctx->i32); 7320 add_arg(&fninfo, ARG_SGPR, ctx->i32); 7321 add_arg(&fninfo, ARG_SGPR, ctx->i32); 7322 add_arg(&fninfo, ARG_SGPR, ctx->i32); 7323 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); 7324 add_arg(&fninfo, ARG_SGPR, ctx->i32); 7325 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); 7326 } else { 7327 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); 7328 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); 7329 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); 7330 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); 7331 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); 7332 add_arg(&fninfo, ARG_SGPR, ctx->i32); 7333 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); 7334 add_arg(&fninfo, ARG_SGPR, ctx->i32); 7335 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 7336 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 7337 } 7338 7339 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */ 7340 
add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */ 7341 unsigned tess_factors_idx = 7342 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* patch index within the wave (REL_PATCH_ID) */ 7343 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* invocation ID within the patch */ 7344 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* LDS offset where tess factors should be loaded from */ 7345 7346 for (unsigned i = 0; i < 6; i++) 7347 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* tess factors */ 7348 7349 /* Create the function. */ 7350 si_create_function(ctx, "tcs_epilog", NULL, 0, &fninfo, 7351 ctx->screen->info.chip_class >= CIK ? 128 : 64); 7352 ac_declare_lds_as_pointer(&ctx->ac); 7353 func = ctx->main_fn; 7354 7355 LLVMValueRef invoc0_tess_factors[6]; 7356 for (unsigned i = 0; i < 6; i++) 7357 invoc0_tess_factors[i] = LLVMGetParam(func, tess_factors_idx + 3 + i); 7358 7359 si_write_tess_factors(bld_base, 7360 LLVMGetParam(func, tess_factors_idx), 7361 LLVMGetParam(func, tess_factors_idx + 1), 7362 LLVMGetParam(func, tess_factors_idx + 2), 7363 invoc0_tess_factors, invoc0_tess_factors + 4); 7364 7365 LLVMBuildRetVoid(ctx->ac.builder); 7366} 7367 7368/** 7369 * Select and compile (or reuse) TCS parts (epilog). 7370 */ 7371static bool si_shader_select_tcs_parts(struct si_screen *sscreen, 7372 struct ac_llvm_compiler *compiler, 7373 struct si_shader *shader, 7374 struct pipe_debug_callback *debug) 7375{ 7376 if (sscreen->info.chip_class >= GFX9) { 7377 struct si_shader *ls_main_part = 7378 shader->key.part.tcs.ls->main_shader_part_ls; 7379 7380 if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part, 7381 &shader->key.part.tcs.ls_prolog)) 7382 return false; 7383 7384 shader->previous_stage = ls_main_part; 7385 } 7386 7387 /* Get the epilog. 
*/ 7388 union si_shader_part_key epilog_key; 7389 memset(&epilog_key, 0, sizeof(epilog_key)); 7390 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; 7391 7392 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, 7393 PIPE_SHADER_TESS_CTRL, false, 7394 &epilog_key, compiler, debug, 7395 si_build_tcs_epilog_function, 7396 "Tessellation Control Shader Epilog"); 7397 return shader->epilog != NULL; 7398} 7399 7400/** 7401 * Select and compile (or reuse) GS parts (prolog). 7402 */ 7403static bool si_shader_select_gs_parts(struct si_screen *sscreen, 7404 struct ac_llvm_compiler *compiler, 7405 struct si_shader *shader, 7406 struct pipe_debug_callback *debug) 7407{ 7408 if (sscreen->info.chip_class >= GFX9) { 7409 struct si_shader *es_main_part = 7410 shader->key.part.gs.es->main_shader_part_es; 7411 7412 if (shader->key.part.gs.es->type == PIPE_SHADER_VERTEX && 7413 !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part, 7414 &shader->key.part.gs.vs_prolog)) 7415 return false; 7416 7417 shader->previous_stage = es_main_part; 7418 } 7419 7420 if (!shader->key.part.gs.prolog.tri_strip_adj_fix) 7421 return true; 7422 7423 union si_shader_part_key prolog_key; 7424 memset(&prolog_key, 0, sizeof(prolog_key)); 7425 prolog_key.gs_prolog.states = shader->key.part.gs.prolog; 7426 7427 shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs, 7428 PIPE_SHADER_GEOMETRY, true, 7429 &prolog_key, compiler, debug, 7430 si_build_gs_prolog_function, 7431 "Geometry Shader Prolog"); 7432 return shader->prolog2 != NULL; 7433} 7434 7435/** 7436 * Build the pixel shader prolog function. This handles: 7437 * - two-side color selection and interpolation 7438 * - overriding interpolation parameters for the API PS 7439 * - polygon stippling 7440 * 7441 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are 7442 * overriden by other states. (e.g. 
per-sample interpolation) 7443 * Interpolated colors are stored after the preloaded VGPRs. 7444 */ 7445static void si_build_ps_prolog_function(struct si_shader_context *ctx, 7446 union si_shader_part_key *key) 7447{ 7448 struct si_function_info fninfo; 7449 LLVMValueRef ret, func; 7450 int num_returns, i, num_color_channels; 7451 7452 assert(si_need_ps_prolog(key)); 7453 7454 si_init_function_info(&fninfo); 7455 7456 /* Declare inputs. */ 7457 for (i = 0; i < key->ps_prolog.num_input_sgprs; i++) 7458 add_arg(&fninfo, ARG_SGPR, ctx->i32); 7459 7460 for (i = 0; i < key->ps_prolog.num_input_vgprs; i++) 7461 add_arg(&fninfo, ARG_VGPR, ctx->f32); 7462 7463 /* Declare outputs (same as inputs + add colors if needed) */ 7464 num_returns = fninfo.num_params; 7465 num_color_channels = util_bitcount(key->ps_prolog.colors_read); 7466 for (i = 0; i < num_color_channels; i++) 7467 fninfo.types[num_returns++] = ctx->f32; 7468 7469 /* Create the function. */ 7470 si_create_function(ctx, "ps_prolog", fninfo.types, num_returns, 7471 &fninfo, 0); 7472 func = ctx->main_fn; 7473 7474 /* Copy inputs to outputs. This should be no-op, as the registers match, 7475 * but it will prevent the compiler from overwriting them unintentionally. 7476 */ 7477 ret = ctx->return_value; 7478 for (i = 0; i < fninfo.num_params; i++) { 7479 LLVMValueRef p = LLVMGetParam(func, i); 7480 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, ""); 7481 } 7482 7483 /* Polygon stippling. */ 7484 if (key->ps_prolog.states.poly_stipple) { 7485 /* POS_FIXED_PT is always last. 
*/ 7486 unsigned pos = key->ps_prolog.num_input_sgprs + 7487 key->ps_prolog.num_input_vgprs - 1; 7488 LLVMValueRef list = si_prolog_get_rw_buffers(ctx); 7489 7490 si_llvm_emit_polygon_stipple(ctx, list, pos); 7491 } 7492 7493 if (key->ps_prolog.states.bc_optimize_for_persp || 7494 key->ps_prolog.states.bc_optimize_for_linear) { 7495 unsigned i, base = key->ps_prolog.num_input_sgprs; 7496 LLVMValueRef center[2], centroid[2], tmp, bc_optimize; 7497 7498 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER; 7499 * The hw doesn't compute CENTROID if the whole wave only 7500 * contains fully-covered quads. 7501 * 7502 * PRIM_MASK is after user SGPRs. 7503 */ 7504 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR); 7505 bc_optimize = LLVMBuildLShr(ctx->ac.builder, bc_optimize, 7506 LLVMConstInt(ctx->i32, 31, 0), ""); 7507 bc_optimize = LLVMBuildTrunc(ctx->ac.builder, bc_optimize, 7508 ctx->i1, ""); 7509 7510 if (key->ps_prolog.states.bc_optimize_for_persp) { 7511 /* Read PERSP_CENTER. */ 7512 for (i = 0; i < 2; i++) 7513 center[i] = LLVMGetParam(func, base + 2 + i); 7514 /* Read PERSP_CENTROID. */ 7515 for (i = 0; i < 2; i++) 7516 centroid[i] = LLVMGetParam(func, base + 4 + i); 7517 /* Select PERSP_CENTROID. */ 7518 for (i = 0; i < 2; i++) { 7519 tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, 7520 center[i], centroid[i], ""); 7521 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7522 tmp, base + 4 + i, ""); 7523 } 7524 } 7525 if (key->ps_prolog.states.bc_optimize_for_linear) { 7526 /* Read LINEAR_CENTER. */ 7527 for (i = 0; i < 2; i++) 7528 center[i] = LLVMGetParam(func, base + 8 + i); 7529 /* Read LINEAR_CENTROID. */ 7530 for (i = 0; i < 2; i++) 7531 centroid[i] = LLVMGetParam(func, base + 10 + i); 7532 /* Select LINEAR_CENTROID. 
*/ 7533 for (i = 0; i < 2; i++) { 7534 tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, 7535 center[i], centroid[i], ""); 7536 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7537 tmp, base + 10 + i, ""); 7538 } 7539 } 7540 } 7541 7542 /* Force per-sample interpolation. */ 7543 if (key->ps_prolog.states.force_persp_sample_interp) { 7544 unsigned i, base = key->ps_prolog.num_input_sgprs; 7545 LLVMValueRef persp_sample[2]; 7546 7547 /* Read PERSP_SAMPLE. */ 7548 for (i = 0; i < 2; i++) 7549 persp_sample[i] = LLVMGetParam(func, base + i); 7550 /* Overwrite PERSP_CENTER. */ 7551 for (i = 0; i < 2; i++) 7552 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7553 persp_sample[i], base + 2 + i, ""); 7554 /* Overwrite PERSP_CENTROID. */ 7555 for (i = 0; i < 2; i++) 7556 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7557 persp_sample[i], base + 4 + i, ""); 7558 } 7559 if (key->ps_prolog.states.force_linear_sample_interp) { 7560 unsigned i, base = key->ps_prolog.num_input_sgprs; 7561 LLVMValueRef linear_sample[2]; 7562 7563 /* Read LINEAR_SAMPLE. */ 7564 for (i = 0; i < 2; i++) 7565 linear_sample[i] = LLVMGetParam(func, base + 6 + i); 7566 /* Overwrite LINEAR_CENTER. */ 7567 for (i = 0; i < 2; i++) 7568 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7569 linear_sample[i], base + 8 + i, ""); 7570 /* Overwrite LINEAR_CENTROID. */ 7571 for (i = 0; i < 2; i++) 7572 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7573 linear_sample[i], base + 10 + i, ""); 7574 } 7575 7576 /* Force center interpolation. */ 7577 if (key->ps_prolog.states.force_persp_center_interp) { 7578 unsigned i, base = key->ps_prolog.num_input_sgprs; 7579 LLVMValueRef persp_center[2]; 7580 7581 /* Read PERSP_CENTER. */ 7582 for (i = 0; i < 2; i++) 7583 persp_center[i] = LLVMGetParam(func, base + 2 + i); 7584 /* Overwrite PERSP_SAMPLE. */ 7585 for (i = 0; i < 2; i++) 7586 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7587 persp_center[i], base + i, ""); 7588 /* Overwrite PERSP_CENTROID. 
*/ 7589 for (i = 0; i < 2; i++) 7590 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7591 persp_center[i], base + 4 + i, ""); 7592 } 7593 if (key->ps_prolog.states.force_linear_center_interp) { 7594 unsigned i, base = key->ps_prolog.num_input_sgprs; 7595 LLVMValueRef linear_center[2]; 7596 7597 /* Read LINEAR_CENTER. */ 7598 for (i = 0; i < 2; i++) 7599 linear_center[i] = LLVMGetParam(func, base + 8 + i); 7600 /* Overwrite LINEAR_SAMPLE. */ 7601 for (i = 0; i < 2; i++) 7602 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7603 linear_center[i], base + 6 + i, ""); 7604 /* Overwrite LINEAR_CENTROID. */ 7605 for (i = 0; i < 2; i++) 7606 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7607 linear_center[i], base + 10 + i, ""); 7608 } 7609 7610 /* Interpolate colors. */ 7611 unsigned color_out_idx = 0; 7612 for (i = 0; i < 2; i++) { 7613 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf; 7614 unsigned face_vgpr = key->ps_prolog.num_input_sgprs + 7615 key->ps_prolog.face_vgpr_index; 7616 LLVMValueRef interp[2], color[4]; 7617 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL; 7618 7619 if (!writemask) 7620 continue; 7621 7622 /* If the interpolation qualifier is not CONSTANT (-1). */ 7623 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) { 7624 unsigned interp_vgpr = key->ps_prolog.num_input_sgprs + 7625 key->ps_prolog.color_interp_vgpr_index[i]; 7626 7627 /* Get the (i,j) updated by bc_optimize handling. */ 7628 interp[0] = LLVMBuildExtractValue(ctx->ac.builder, ret, 7629 interp_vgpr, ""); 7630 interp[1] = LLVMBuildExtractValue(ctx->ac.builder, ret, 7631 interp_vgpr + 1, ""); 7632 interp_ij = ac_build_gather_values(&ctx->ac, interp, 2); 7633 } 7634 7635 /* Use the absolute location of the input. 
*/ 7636 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR); 7637 7638 if (key->ps_prolog.states.color_two_side) { 7639 face = LLVMGetParam(func, face_vgpr); 7640 face = ac_to_integer(&ctx->ac, face); 7641 } 7642 7643 interp_fs_input(ctx, 7644 key->ps_prolog.color_attr_index[i], 7645 TGSI_SEMANTIC_COLOR, i, 7646 key->ps_prolog.num_interp_inputs, 7647 key->ps_prolog.colors_read, interp_ij, 7648 prim_mask, face, color); 7649 7650 while (writemask) { 7651 unsigned chan = u_bit_scan(&writemask); 7652 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, color[chan], 7653 fninfo.num_params + color_out_idx++, ""); 7654 } 7655 } 7656 7657 /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec 7658 * says: 7659 * 7660 * "When per-sample shading is active due to the use of a fragment 7661 * input qualified by sample or due to the use of the gl_SampleID 7662 * or gl_SamplePosition variables, only the bit for the current 7663 * sample is set in gl_SampleMaskIn. When state specifies multiple 7664 * fragment shader invocations for a given fragment, the sample 7665 * mask for any single fragment shader invocation may specify a 7666 * subset of the covered samples for the fragment. In this case, 7667 * the bit corresponding to each covered sample will be set in 7668 * exactly one fragment shader invocation." 7669 * 7670 * The samplemask loaded by hardware is always the coverage of the 7671 * entire pixel/fragment, so mask bits out based on the sample ID. 7672 */ 7673 if (key->ps_prolog.states.samplemask_log_ps_iter) { 7674 /* The bit pattern matches that used by fixed function fragment 7675 * processing. 
*/ 7676 static const uint16_t ps_iter_masks[] = { 7677 0xffff, /* not used */ 7678 0x5555, 7679 0x1111, 7680 0x0101, 7681 0x0001, 7682 }; 7683 assert(key->ps_prolog.states.samplemask_log_ps_iter < ARRAY_SIZE(ps_iter_masks)); 7684 7685 uint32_t ps_iter_mask = ps_iter_masks[key->ps_prolog.states.samplemask_log_ps_iter]; 7686 unsigned ancillary_vgpr = key->ps_prolog.num_input_sgprs + 7687 key->ps_prolog.ancillary_vgpr_index; 7688 LLVMValueRef sampleid = si_unpack_param(ctx, ancillary_vgpr, 8, 4); 7689 LLVMValueRef samplemask = LLVMGetParam(func, ancillary_vgpr + 1); 7690 7691 samplemask = ac_to_integer(&ctx->ac, samplemask); 7692 samplemask = LLVMBuildAnd( 7693 ctx->ac.builder, 7694 samplemask, 7695 LLVMBuildShl(ctx->ac.builder, 7696 LLVMConstInt(ctx->i32, ps_iter_mask, false), 7697 sampleid, ""), 7698 ""); 7699 samplemask = ac_to_float(&ctx->ac, samplemask); 7700 7701 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, samplemask, 7702 ancillary_vgpr + 1, ""); 7703 } 7704 7705 /* Tell LLVM to insert WQM instruction sequence when needed. */ 7706 if (key->ps_prolog.wqm) { 7707 LLVMAddTargetDependentFunctionAttr(func, 7708 "amdgpu-ps-wqm-outputs", ""); 7709 } 7710 7711 si_llvm_build_ret(ctx, ret); 7712} 7713 7714/** 7715 * Build the pixel shader epilog function. This handles everything that must be 7716 * emulated for pixel shader exports. (alpha-test, format conversions, etc) 7717 */ 7718static void si_build_ps_epilog_function(struct si_shader_context *ctx, 7719 union si_shader_part_key *key) 7720{ 7721 struct lp_build_tgsi_context *bld_base = &ctx->bld_base; 7722 struct si_function_info fninfo; 7723 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; 7724 int i; 7725 struct si_ps_exports exp = {}; 7726 7727 si_init_function_info(&fninfo); 7728 7729 /* Declare input SGPRs. 
*/ 7730 ctx->param_rw_buffers = add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); 7731 ctx->param_bindless_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); 7732 ctx->param_const_and_shader_buffers = add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); 7733 ctx->param_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); 7734 add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF); 7735 7736 /* Declare input VGPRs. */ 7737 unsigned required_num_params = 7738 fninfo.num_sgpr_params + 7739 util_bitcount(key->ps_epilog.colors_written) * 4 + 7740 key->ps_epilog.writes_z + 7741 key->ps_epilog.writes_stencil + 7742 key->ps_epilog.writes_samplemask; 7743 7744 required_num_params = MAX2(required_num_params, 7745 fninfo.num_sgpr_params + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); 7746 7747 while (fninfo.num_params < required_num_params) 7748 add_arg(&fninfo, ARG_VGPR, ctx->f32); 7749 7750 /* Create the function. */ 7751 si_create_function(ctx, "ps_epilog", NULL, 0, &fninfo, 0); 7752 /* Disable elimination of unused inputs. */ 7753 ac_llvm_add_target_dep_function_attr(ctx->main_fn, 7754 "InitialPSInputAddr", 0xffffff); 7755 7756 /* Process colors. */ 7757 unsigned vgpr = fninfo.num_sgpr_params; 7758 unsigned colors_written = key->ps_epilog.colors_written; 7759 int last_color_export = -1; 7760 7761 /* Find the last color export. */ 7762 if (!key->ps_epilog.writes_z && 7763 !key->ps_epilog.writes_stencil && 7764 !key->ps_epilog.writes_samplemask) { 7765 unsigned spi_format = key->ps_epilog.states.spi_shader_col_format; 7766 7767 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ 7768 if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) { 7769 /* Just set this if any of the colorbuffers are enabled. 
*/ 7770 if (spi_format & 7771 ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1)) 7772 last_color_export = 0; 7773 } else { 7774 for (i = 0; i < 8; i++) 7775 if (colors_written & (1 << i) && 7776 (spi_format >> (i * 4)) & 0xf) 7777 last_color_export = i; 7778 } 7779 } 7780 7781 while (colors_written) { 7782 LLVMValueRef color[4]; 7783 int mrt = u_bit_scan(&colors_written); 7784 7785 for (i = 0; i < 4; i++) 7786 color[i] = LLVMGetParam(ctx->main_fn, vgpr++); 7787 7788 si_export_mrt_color(bld_base, color, mrt, 7789 fninfo.num_params - 1, 7790 mrt == last_color_export, &exp); 7791 } 7792 7793 /* Process depth, stencil, samplemask. */ 7794 if (key->ps_epilog.writes_z) 7795 depth = LLVMGetParam(ctx->main_fn, vgpr++); 7796 if (key->ps_epilog.writes_stencil) 7797 stencil = LLVMGetParam(ctx->main_fn, vgpr++); 7798 if (key->ps_epilog.writes_samplemask) 7799 samplemask = LLVMGetParam(ctx->main_fn, vgpr++); 7800 7801 if (depth || stencil || samplemask) 7802 si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp); 7803 else if (last_color_export == -1) 7804 ac_build_export_null(&ctx->ac); 7805 7806 if (exp.num) 7807 si_emit_ps_exports(ctx, &exp); 7808 7809 /* Compile. */ 7810 LLVMBuildRetVoid(ctx->ac.builder); 7811} 7812 7813/** 7814 * Select and compile (or reuse) pixel shader parts (prolog & epilog). 7815 */ 7816static bool si_shader_select_ps_parts(struct si_screen *sscreen, 7817 struct ac_llvm_compiler *compiler, 7818 struct si_shader *shader, 7819 struct pipe_debug_callback *debug) 7820{ 7821 union si_shader_part_key prolog_key; 7822 union si_shader_part_key epilog_key; 7823 7824 /* Get the prolog. */ 7825 si_get_ps_prolog_key(shader, &prolog_key, true); 7826 7827 /* The prolog is a no-op if these aren't set. 
*/ 7828 if (si_need_ps_prolog(&prolog_key)) { 7829 shader->prolog = 7830 si_get_shader_part(sscreen, &sscreen->ps_prologs, 7831 PIPE_SHADER_FRAGMENT, true, 7832 &prolog_key, compiler, debug, 7833 si_build_ps_prolog_function, 7834 "Fragment Shader Prolog"); 7835 if (!shader->prolog) 7836 return false; 7837 } 7838 7839 /* Get the epilog. */ 7840 si_get_ps_epilog_key(shader, &epilog_key); 7841 7842 shader->epilog = 7843 si_get_shader_part(sscreen, &sscreen->ps_epilogs, 7844 PIPE_SHADER_FRAGMENT, false, 7845 &epilog_key, compiler, debug, 7846 si_build_ps_epilog_function, 7847 "Fragment Shader Epilog"); 7848 if (!shader->epilog) 7849 return false; 7850 7851 /* Enable POS_FIXED_PT if polygon stippling is enabled. */ 7852 if (shader->key.part.ps.prolog.poly_stipple) { 7853 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1); 7854 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr)); 7855 } 7856 7857 /* Set up the enable bits for per-sample shading if needed. */ 7858 if (shader->key.part.ps.prolog.force_persp_sample_interp && 7859 (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) || 7860 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) { 7861 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA; 7862 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; 7863 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1); 7864 } 7865 if (shader->key.part.ps.prolog.force_linear_sample_interp && 7866 (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) || 7867 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) { 7868 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA; 7869 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; 7870 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1); 7871 } 7872 if (shader->key.part.ps.prolog.force_persp_center_interp && 7873 (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) || 7874 
G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) { 7875 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA; 7876 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; 7877 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); 7878 } 7879 if (shader->key.part.ps.prolog.force_linear_center_interp && 7880 (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) || 7881 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) { 7882 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA; 7883 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; 7884 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); 7885 } 7886 7887 /* POW_W_FLOAT requires that one of the perspective weights is enabled. */ 7888 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) && 7889 !(shader->config.spi_ps_input_ena & 0xf)) { 7890 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); 7891 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr)); 7892 } 7893 7894 /* At least one pair of interpolation weights must be enabled. */ 7895 if (!(shader->config.spi_ps_input_ena & 0x7f)) { 7896 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); 7897 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr)); 7898 } 7899 7900 /* Samplemask fixup requires the sample ID. */ 7901 if (shader->key.part.ps.prolog.samplemask_log_ps_iter) { 7902 shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1); 7903 assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr)); 7904 } 7905 7906 /* The sample mask input is always enabled, because the API shader always 7907 * passes it through to the epilog. Disable it here if it's unused. 
7908 */ 7909 if (!shader->key.part.ps.epilog.poly_line_smoothing && 7910 !shader->selector->info.reads_samplemask) 7911 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA; 7912 7913 return true; 7914} 7915 7916void si_multiwave_lds_size_workaround(struct si_screen *sscreen, 7917 unsigned *lds_size) 7918{ 7919 /* If tessellation is all offchip and on-chip GS isn't used, this 7920 * workaround is not needed. 7921 */ 7922 return; 7923 7924 /* SPI barrier management bug: 7925 * Make sure we have at least 4k of LDS in use to avoid the bug. 7926 * It applies to workgroup sizes of more than one wavefront. 7927 */ 7928 if (sscreen->info.family == CHIP_BONAIRE || 7929 sscreen->info.family == CHIP_KABINI || 7930 sscreen->info.family == CHIP_MULLINS) 7931 *lds_size = MAX2(*lds_size, 8); 7932} 7933 7934static void si_fix_resource_usage(struct si_screen *sscreen, 7935 struct si_shader *shader) 7936{ 7937 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */ 7938 7939 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs); 7940 7941 if (shader->selector->type == PIPE_SHADER_COMPUTE && 7942 si_get_max_workgroup_size(shader) > 64) { 7943 si_multiwave_lds_size_workaround(sscreen, 7944 &shader->config.lds_size); 7945 } 7946} 7947 7948int si_shader_create(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, 7949 struct si_shader *shader, 7950 struct pipe_debug_callback *debug) 7951{ 7952 struct si_shader_selector *sel = shader->selector; 7953 struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key); 7954 int r; 7955 7956 /* LS, ES, VS are compiled on demand if the main part hasn't been 7957 * compiled for that stage. 7958 * 7959 * Vertex shaders are compiled on demand when a vertex fetch 7960 * workaround must be applied. 7961 */ 7962 if (shader->is_monolithic) { 7963 /* Monolithic shader (compiled as a whole, has many variants, 7964 * may take a long time to compile). 
7965 */ 7966 r = si_compile_tgsi_shader(sscreen, compiler, shader, debug); 7967 if (r) 7968 return r; 7969 } else { 7970 /* The shader consists of several parts: 7971 * 7972 * - the middle part is the user shader, it has 1 variant only 7973 * and it was compiled during the creation of the shader 7974 * selector 7975 * - the prolog part is inserted at the beginning 7976 * - the epilog part is inserted at the end 7977 * 7978 * The prolog and epilog have many (but simple) variants. 7979 * 7980 * Starting with gfx9, geometry and tessellation control 7981 * shaders also contain the prolog and user shader parts of 7982 * the previous shader stage. 7983 */ 7984 7985 if (!mainp) 7986 return -1; 7987 7988 /* Copy the compiled TGSI shader data over. */ 7989 shader->is_binary_shared = true; 7990 shader->binary = mainp->binary; 7991 shader->config = mainp->config; 7992 shader->info.num_input_sgprs = mainp->info.num_input_sgprs; 7993 shader->info.num_input_vgprs = mainp->info.num_input_vgprs; 7994 shader->info.face_vgpr_index = mainp->info.face_vgpr_index; 7995 shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index; 7996 memcpy(shader->info.vs_output_param_offset, 7997 mainp->info.vs_output_param_offset, 7998 sizeof(mainp->info.vs_output_param_offset)); 7999 shader->info.uses_instanceid = mainp->info.uses_instanceid; 8000 shader->info.nr_pos_exports = mainp->info.nr_pos_exports; 8001 shader->info.nr_param_exports = mainp->info.nr_param_exports; 8002 8003 /* Select prologs and/or epilogs. 
*/ 8004 switch (sel->type) { 8005 case PIPE_SHADER_VERTEX: 8006 if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug)) 8007 return -1; 8008 break; 8009 case PIPE_SHADER_TESS_CTRL: 8010 if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug)) 8011 return -1; 8012 break; 8013 case PIPE_SHADER_TESS_EVAL: 8014 break; 8015 case PIPE_SHADER_GEOMETRY: 8016 if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug)) 8017 return -1; 8018 break; 8019 case PIPE_SHADER_FRAGMENT: 8020 if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug)) 8021 return -1; 8022 8023 /* Make sure we have at least as many VGPRs as there 8024 * are allocated inputs. 8025 */ 8026 shader->config.num_vgprs = MAX2(shader->config.num_vgprs, 8027 shader->info.num_input_vgprs); 8028 break; 8029 } 8030 8031 /* Update SGPR and VGPR counts. */ 8032 if (shader->prolog) { 8033 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, 8034 shader->prolog->config.num_sgprs); 8035 shader->config.num_vgprs = MAX2(shader->config.num_vgprs, 8036 shader->prolog->config.num_vgprs); 8037 } 8038 if (shader->previous_stage) { 8039 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, 8040 shader->previous_stage->config.num_sgprs); 8041 shader->config.num_vgprs = MAX2(shader->config.num_vgprs, 8042 shader->previous_stage->config.num_vgprs); 8043 shader->config.spilled_sgprs = 8044 MAX2(shader->config.spilled_sgprs, 8045 shader->previous_stage->config.spilled_sgprs); 8046 shader->config.spilled_vgprs = 8047 MAX2(shader->config.spilled_vgprs, 8048 shader->previous_stage->config.spilled_vgprs); 8049 shader->config.private_mem_vgprs = 8050 MAX2(shader->config.private_mem_vgprs, 8051 shader->previous_stage->config.private_mem_vgprs); 8052 shader->config.scratch_bytes_per_wave = 8053 MAX2(shader->config.scratch_bytes_per_wave, 8054 shader->previous_stage->config.scratch_bytes_per_wave); 8055 shader->info.uses_instanceid |= 8056 shader->previous_stage->info.uses_instanceid; 8057 } 8058 if 
(shader->prolog2) { 8059 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, 8060 shader->prolog2->config.num_sgprs); 8061 shader->config.num_vgprs = MAX2(shader->config.num_vgprs, 8062 shader->prolog2->config.num_vgprs); 8063 } 8064 if (shader->epilog) { 8065 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, 8066 shader->epilog->config.num_sgprs); 8067 shader->config.num_vgprs = MAX2(shader->config.num_vgprs, 8068 shader->epilog->config.num_vgprs); 8069 } 8070 si_calculate_max_simd_waves(shader); 8071 } 8072 8073 si_fix_resource_usage(sscreen, shader); 8074 si_shader_dump(sscreen, shader, debug, sel->info.processor, 8075 stderr, true); 8076 8077 /* Upload. */ 8078 r = si_shader_binary_upload(sscreen, shader); 8079 if (r) { 8080 fprintf(stderr, "LLVM failed to upload shader\n"); 8081 return r; 8082 } 8083 8084 return 0; 8085} 8086 8087void si_shader_destroy(struct si_shader *shader) 8088{ 8089 if (shader->scratch_bo) 8090 si_resource_reference(&shader->scratch_bo, NULL); 8091 8092 si_resource_reference(&shader->bo, NULL); 8093 8094 if (!shader->is_binary_shared) 8095 ac_shader_binary_clean(&shader->binary); 8096 8097 free(shader->shader_log); 8098} 8099