1/* 2 * Copyright © 2016 Red Hat. 3 * Copyright © 2016 Bas Nieuwenhuizen 4 * 5 * based in part on anv driver which is: 6 * Copyright © 2015 Intel Corporation 7 * 8 * Permission is hereby granted, free of charge, to any person obtaining a 9 * copy of this software and associated documentation files (the "Software"), 10 * to deal in the Software without restriction, including without limitation 11 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 12 * and/or sell copies of the Software, and to permit persons to whom the 13 * Software is furnished to do so, subject to the following conditions: 14 * 15 * The above copyright notice and this permission notice (including the next 16 * paragraph) shall be included in all copies or substantial portions of the 17 * Software. 18 * 19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 * DEALINGS IN THE SOFTWARE. 26 */ 27 28#include "common/freedreno_guardband.h" 29#include "tu_private.h" 30 31#include "ir3/ir3_nir.h" 32#include "main/menums.h" 33#include "nir/nir.h" 34#include "nir/nir_builder.h" 35#include "spirv/nir_spirv.h" 36#include "util/debug.h" 37#include "util/mesa-sha1.h" 38#include "util/u_atomic.h" 39#include "vk_format.h" 40#include "vk_util.h" 41 42#include "tu_cs.h" 43 44/* Emit IB that preloads the descriptors that the shader uses */ 45 46static void 47emit_load_state(struct tu_cs *cs, unsigned opcode, enum a6xx_state_type st, 48 enum a6xx_state_block sb, unsigned base, unsigned offset, 49 unsigned count) 50{ 51 /* Note: just emit one packet, even if count overflows NUM_UNIT. It's not 52 * clear if emitting more packets will even help anything. Presumably the 53 * descriptor cache is relatively small, and these packets stop doing 54 * anything when there are too many descriptors. 55 */ 56 tu_cs_emit_pkt7(cs, opcode, 3); 57 tu_cs_emit(cs, 58 CP_LOAD_STATE6_0_STATE_TYPE(st) | 59 CP_LOAD_STATE6_0_STATE_SRC(SS6_BINDLESS) | 60 CP_LOAD_STATE6_0_STATE_BLOCK(sb) | 61 CP_LOAD_STATE6_0_NUM_UNIT(MIN2(count, 1024-1))); 62 tu_cs_emit_qw(cs, offset | (base << 28)); 63} 64 65static unsigned 66tu6_load_state_size(struct tu_pipeline *pipeline, bool compute) 67{ 68 const unsigned load_state_size = 4; 69 unsigned size = 0; 70 for (unsigned i = 0; i < pipeline->layout->num_sets; i++) { 71 if (!(pipeline->active_desc_sets & (1u << i))) 72 continue; 73 74 struct tu_descriptor_set_layout *set_layout = pipeline->layout->set[i].layout; 75 for (unsigned j = 0; j < set_layout->binding_count; j++) { 76 struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j]; 77 unsigned count = 0; 78 /* Note: some users, like amber for example, pass in 79 * VK_SHADER_STAGE_ALL which includes a bunch of extra bits, so 80 * filter these out by using VK_SHADER_STAGE_ALL_GRAPHICS explicitly. 81 */ 82 VkShaderStageFlags stages = compute ? 83 binding->shader_stages & VK_SHADER_STAGE_COMPUTE_BIT : 84 binding->shader_stages & VK_SHADER_STAGE_ALL_GRAPHICS; 85 unsigned stage_count = util_bitcount(stages); 86 87 if (!binding->array_size) 88 continue; 89 90 switch (binding->type) { 91 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: 92 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: 93 case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: 94 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: 95 /* IBO-backed resources only need one packet for all graphics stages */ 96 if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT) 97 count += 1; 98 if (stages & VK_SHADER_STAGE_COMPUTE_BIT) 99 count += 1; 100 break; 101 case VK_DESCRIPTOR_TYPE_SAMPLER: 102 case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: 103 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: 104 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: 105 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: 106 /* Textures and UBO's needs a packet for each stage */ 107 count = stage_count; 108 break; 109 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: 110 /* Because of how we pack combined images and samplers, we 111 * currently can't use one packet for the whole array. 112 */ 113 count = stage_count * binding->array_size * 2; 114 break; 115 case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: 116 case VK_DESCRIPTOR_TYPE_MUTABLE_VALVE: 117 break; 118 default: 119 unreachable("bad descriptor type"); 120 } 121 size += count * load_state_size; 122 } 123 } 124 return size; 125} 126 127static void 128tu6_emit_load_state(struct tu_pipeline *pipeline, bool compute) 129{ 130 unsigned size = tu6_load_state_size(pipeline, compute); 131 if (size == 0) 132 return; 133 134 struct tu_cs cs; 135 tu_cs_begin_sub_stream(&pipeline->cs, size, &cs); 136 137 struct tu_pipeline_layout *layout = pipeline->layout; 138 for (unsigned i = 0; i < layout->num_sets; i++) { 139 /* From 13.2.7. Descriptor Set Binding: 140 * 141 * A compatible descriptor set must be bound for all set numbers that 142 * any shaders in a pipeline access, at the time that a draw or 143 * dispatch command is recorded to execute using that pipeline. 144 * However, if none of the shaders in a pipeline statically use any 145 * bindings with a particular set number, then no descriptor set need 146 * be bound for that set number, even if the pipeline layout includes 147 * a non-trivial descriptor set layout for that set number. 148 * 149 * This means that descriptor sets unused by the pipeline may have a 150 * garbage or 0 BINDLESS_BASE register, which will cause context faults 151 * when prefetching descriptors from these sets. Skip prefetching for 152 * descriptors from them to avoid this. This is also an optimization, 153 * since these prefetches would be useless. 154 */ 155 if (!(pipeline->active_desc_sets & (1u << i))) 156 continue; 157 158 struct tu_descriptor_set_layout *set_layout = layout->set[i].layout; 159 for (unsigned j = 0; j < set_layout->binding_count; j++) { 160 struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j]; 161 unsigned base = i; 162 unsigned offset = binding->offset / 4; 163 /* Note: some users, like amber for example, pass in 164 * VK_SHADER_STAGE_ALL which includes a bunch of extra bits, so 165 * filter these out by using VK_SHADER_STAGE_ALL_GRAPHICS explicitly. 166 */ 167 VkShaderStageFlags stages = compute ? 168 binding->shader_stages & VK_SHADER_STAGE_COMPUTE_BIT : 169 binding->shader_stages & VK_SHADER_STAGE_ALL_GRAPHICS; 170 unsigned count = binding->array_size; 171 if (count == 0 || stages == 0) 172 continue; 173 switch (binding->type) { 174 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: 175 base = MAX_SETS; 176 offset = (layout->set[i].dynamic_offset_start + 177 binding->dynamic_offset_offset) * A6XX_TEX_CONST_DWORDS; 178 FALLTHROUGH; 179 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: 180 case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: 181 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: 182 /* IBO-backed resources only need one packet for all graphics stages */ 183 if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT) { 184 emit_load_state(&cs, CP_LOAD_STATE6, ST6_SHADER, SB6_IBO, 185 base, offset, count); 186 } 187 if (stages & VK_SHADER_STAGE_COMPUTE_BIT) { 188 emit_load_state(&cs, CP_LOAD_STATE6_FRAG, ST6_IBO, SB6_CS_SHADER, 189 base, offset, count); 190 } 191 break; 192 case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: 193 case VK_DESCRIPTOR_TYPE_MUTABLE_VALVE: 194 /* nothing - input attachment doesn't use bindless */ 195 break; 196 case VK_DESCRIPTOR_TYPE_SAMPLER: 197 case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: 198 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: { 199 tu_foreach_stage(stage, stages) { 200 emit_load_state(&cs, tu6_stage2opcode(stage), 201 binding->type == VK_DESCRIPTOR_TYPE_SAMPLER ? 202 ST6_SHADER : ST6_CONSTANTS, 203 tu6_stage2texsb(stage), base, offset, count); 204 } 205 break; 206 } 207 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: 208 base = MAX_SETS; 209 offset = (layout->set[i].dynamic_offset_start + 210 binding->dynamic_offset_offset) * A6XX_TEX_CONST_DWORDS; 211 FALLTHROUGH; 212 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: { 213 tu_foreach_stage(stage, stages) { 214 emit_load_state(&cs, tu6_stage2opcode(stage), ST6_UBO, 215 tu6_stage2shadersb(stage), base, offset, count); 216 } 217 break; 218 } 219 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: { 220 tu_foreach_stage(stage, stages) { 221 /* TODO: We could emit less CP_LOAD_STATE6 if we used 222 * struct-of-arrays instead of array-of-structs. 223 */ 224 for (unsigned i = 0; i < count; i++) { 225 unsigned tex_offset = offset + 2 * i * A6XX_TEX_CONST_DWORDS; 226 unsigned sam_offset = offset + (2 * i + 1) * A6XX_TEX_CONST_DWORDS; 227 emit_load_state(&cs, tu6_stage2opcode(stage), 228 ST6_CONSTANTS, tu6_stage2texsb(stage), 229 base, tex_offset, 1); 230 emit_load_state(&cs, tu6_stage2opcode(stage), 231 ST6_SHADER, tu6_stage2texsb(stage), 232 base, sam_offset, 1); 233 } 234 } 235 break; 236 } 237 default: 238 unreachable("bad descriptor type"); 239 } 240 } 241 } 242 243 pipeline->load_state = tu_cs_end_draw_state(&pipeline->cs, &cs); 244} 245 246struct tu_pipeline_builder 247{ 248 struct tu_device *device; 249 struct tu_pipeline_cache *cache; 250 struct tu_pipeline_layout *layout; 251 const VkAllocationCallbacks *alloc; 252 const VkGraphicsPipelineCreateInfo *create_info; 253 254 struct tu_shader *shaders[MESA_SHADER_FRAGMENT + 1]; 255 struct ir3_shader_variant *variants[MESA_SHADER_FRAGMENT + 1]; 256 struct ir3_shader_variant *binning_variant; 257 uint64_t shader_iova[MESA_SHADER_FRAGMENT + 1]; 258 uint64_t binning_vs_iova; 259 260 uint32_t additional_cs_reserve_size; 261 262 struct tu_pvtmem_config pvtmem; 263 264 bool rasterizer_discard; 265 /* these states are affectd by rasterizer_discard */ 266 bool emit_msaa_state; 267 VkSampleCountFlagBits samples; 268 bool use_color_attachments; 269 bool use_dual_src_blend; 270 bool alpha_to_coverage; 271 uint32_t color_attachment_count; 272 VkFormat color_attachment_formats[MAX_RTS]; 273 VkFormat depth_attachment_format; 274 uint32_t render_components; 275 uint32_t multiview_mask; 276}; 277 278static bool 279tu_logic_op_reads_dst(VkLogicOp op) 280{ 281 switch (op) { 282 case VK_LOGIC_OP_CLEAR: 283 case VK_LOGIC_OP_COPY: 284 case VK_LOGIC_OP_COPY_INVERTED: 285 case VK_LOGIC_OP_SET: 286 return false; 287 default: 288 return true; 289 } 290} 291 292static VkBlendFactor 293tu_blend_factor_no_dst_alpha(VkBlendFactor factor) 294{ 295 /* treat dst alpha as 1.0 and avoid reading it */ 296 switch (factor) { 297 case VK_BLEND_FACTOR_DST_ALPHA: 298 return VK_BLEND_FACTOR_ONE; 299 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA: 300 return VK_BLEND_FACTOR_ZERO; 301 default: 302 return factor; 303 } 304} 305 306static bool tu_blend_factor_is_dual_src(VkBlendFactor factor) 307{ 308 switch (factor) { 309 case VK_BLEND_FACTOR_SRC1_COLOR: 310 case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR: 311 case VK_BLEND_FACTOR_SRC1_ALPHA: 312 case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA: 313 return true; 314 default: 315 return false; 316 } 317} 318 319static bool 320tu_blend_state_is_dual_src(const VkPipelineColorBlendStateCreateInfo *info) 321{ 322 if (!info) 323 return false; 324 325 for (unsigned i = 0; i < info->attachmentCount; i++) { 326 const VkPipelineColorBlendAttachmentState *blend = &info->pAttachments[i]; 327 if (tu_blend_factor_is_dual_src(blend->srcColorBlendFactor) || 328 tu_blend_factor_is_dual_src(blend->dstColorBlendFactor) || 329 tu_blend_factor_is_dual_src(blend->srcAlphaBlendFactor) || 330 tu_blend_factor_is_dual_src(blend->dstAlphaBlendFactor)) 331 return true; 332 } 333 334 return false; 335} 336 337static const struct xs_config { 338 uint16_t reg_sp_xs_ctrl; 339 uint16_t reg_sp_xs_config; 340 uint16_t reg_sp_xs_instrlen; 341 uint16_t reg_hlsq_xs_ctrl; 342 uint16_t reg_sp_xs_first_exec_offset; 343 uint16_t reg_sp_xs_pvt_mem_hw_stack_offset; 344} xs_config[] = { 345 [MESA_SHADER_VERTEX] = { 346 REG_A6XX_SP_VS_CTRL_REG0, 347 REG_A6XX_SP_VS_CONFIG, 348 REG_A6XX_SP_VS_INSTRLEN, 349 REG_A6XX_HLSQ_VS_CNTL, 350 REG_A6XX_SP_VS_OBJ_FIRST_EXEC_OFFSET, 351 REG_A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET, 352 }, 353 [MESA_SHADER_TESS_CTRL] = { 354 REG_A6XX_SP_HS_CTRL_REG0, 355 REG_A6XX_SP_HS_CONFIG, 356 REG_A6XX_SP_HS_INSTRLEN, 357 REG_A6XX_HLSQ_HS_CNTL, 358 REG_A6XX_SP_HS_OBJ_FIRST_EXEC_OFFSET, 359 REG_A6XX_SP_HS_PVT_MEM_HW_STACK_OFFSET, 360 }, 361 [MESA_SHADER_TESS_EVAL] = { 362 REG_A6XX_SP_DS_CTRL_REG0, 363 REG_A6XX_SP_DS_CONFIG, 364 REG_A6XX_SP_DS_INSTRLEN, 365 REG_A6XX_HLSQ_DS_CNTL, 366 REG_A6XX_SP_DS_OBJ_FIRST_EXEC_OFFSET, 367 REG_A6XX_SP_DS_PVT_MEM_HW_STACK_OFFSET, 368 }, 369 [MESA_SHADER_GEOMETRY] = { 370 REG_A6XX_SP_GS_CTRL_REG0, 371 REG_A6XX_SP_GS_CONFIG, 372 REG_A6XX_SP_GS_INSTRLEN, 373 REG_A6XX_HLSQ_GS_CNTL, 374 REG_A6XX_SP_GS_OBJ_FIRST_EXEC_OFFSET, 375 REG_A6XX_SP_GS_PVT_MEM_HW_STACK_OFFSET, 376 }, 377 [MESA_SHADER_FRAGMENT] = { 378 REG_A6XX_SP_FS_CTRL_REG0, 379 REG_A6XX_SP_FS_CONFIG, 380 REG_A6XX_SP_FS_INSTRLEN, 381 REG_A6XX_HLSQ_FS_CNTL, 382 REG_A6XX_SP_FS_OBJ_FIRST_EXEC_OFFSET, 383 REG_A6XX_SP_FS_PVT_MEM_HW_STACK_OFFSET, 384 }, 385 [MESA_SHADER_COMPUTE] = { 386 REG_A6XX_SP_CS_CTRL_REG0, 387 REG_A6XX_SP_CS_CONFIG, 388 REG_A6XX_SP_CS_INSTRLEN, 389 REG_A6XX_HLSQ_CS_CNTL, 390 REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET, 391 REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET, 392 }, 393}; 394 395static uint32_t 396tu_xs_get_immediates_packet_size_dwords(const struct ir3_shader_variant *xs) 397{ 398 const struct ir3_const_state *const_state = ir3_const_state(xs); 399 uint32_t base = const_state->offsets.immediate; 400 int32_t size = DIV_ROUND_UP(const_state->immediates_count, 4); 401 402 /* truncate size to avoid writing constants that shader 403 * does not use: 404 */ 405 size = MIN2(size + base, xs->constlen) - base; 406 407 return MAX2(size, 0) * 4; 408} 409 410/* We allocate fixed-length substreams for shader state, however some 411 * parts of the state may have unbound length. Their additional space 412 * requirements should be calculated here. 413 */ 414static uint32_t 415tu_xs_get_additional_cs_size_dwords(const struct ir3_shader_variant *xs) 416{ 417 uint32_t size = tu_xs_get_immediates_packet_size_dwords(xs); 418 return size; 419} 420 421void 422tu6_emit_xs_config(struct tu_cs *cs, 423 gl_shader_stage stage, /* xs->type, but xs may be NULL */ 424 const struct ir3_shader_variant *xs) 425{ 426 const struct xs_config *cfg = &xs_config[stage]; 427 428 if (!xs) { 429 /* shader stage disabled */ 430 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1); 431 tu_cs_emit(cs, 0); 432 433 tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1); 434 tu_cs_emit(cs, 0); 435 return; 436 } 437 438 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1); 439 tu_cs_emit(cs, A6XX_SP_VS_CONFIG_ENABLED | 440 COND(xs->bindless_tex, A6XX_SP_VS_CONFIG_BINDLESS_TEX) | 441 COND(xs->bindless_samp, A6XX_SP_VS_CONFIG_BINDLESS_SAMP) | 442 COND(xs->bindless_ibo, A6XX_SP_VS_CONFIG_BINDLESS_IBO) | 443 COND(xs->bindless_ubo, A6XX_SP_VS_CONFIG_BINDLESS_UBO) | 444 A6XX_SP_VS_CONFIG_NTEX(xs->num_samp) | 445 A6XX_SP_VS_CONFIG_NSAMP(xs->num_samp)); 446 447 tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1); 448 tu_cs_emit(cs, A6XX_HLSQ_VS_CNTL_CONSTLEN(xs->constlen) | 449 A6XX_HLSQ_VS_CNTL_ENABLED); 450} 451 452void 453tu6_emit_xs(struct tu_cs *cs, 454 gl_shader_stage stage, /* xs->type, but xs may be NULL */ 455 const struct ir3_shader_variant *xs, 456 const struct tu_pvtmem_config *pvtmem, 457 uint64_t binary_iova) 458{ 459 const struct xs_config *cfg = &xs_config[stage]; 460 461 if (!xs) { 462 /* shader stage disabled */ 463 return; 464 } 465 466 enum a6xx_threadsize thrsz = 467 xs->info.double_threadsize ? THREAD128 : THREAD64; 468 switch (stage) { 469 case MESA_SHADER_VERTEX: 470 tu_cs_emit_regs(cs, A6XX_SP_VS_CTRL_REG0( 471 .fullregfootprint = xs->info.max_reg + 1, 472 .halfregfootprint = xs->info.max_half_reg + 1, 473 .branchstack = ir3_shader_branchstack_hw(xs), 474 .mergedregs = xs->mergedregs, 475 )); 476 break; 477 case MESA_SHADER_TESS_CTRL: 478 tu_cs_emit_regs(cs, A6XX_SP_HS_CTRL_REG0( 479 .fullregfootprint = xs->info.max_reg + 1, 480 .halfregfootprint = xs->info.max_half_reg + 1, 481 .branchstack = ir3_shader_branchstack_hw(xs), 482 )); 483 break; 484 case MESA_SHADER_TESS_EVAL: 485 tu_cs_emit_regs(cs, A6XX_SP_DS_CTRL_REG0( 486 .fullregfootprint = xs->info.max_reg + 1, 487 .halfregfootprint = xs->info.max_half_reg + 1, 488 .branchstack = ir3_shader_branchstack_hw(xs), 489 .mergedregs = xs->mergedregs, 490 )); 491 break; 492 case MESA_SHADER_GEOMETRY: 493 tu_cs_emit_regs(cs, A6XX_SP_GS_CTRL_REG0( 494 .fullregfootprint = xs->info.max_reg + 1, 495 .halfregfootprint = xs->info.max_half_reg + 1, 496 .branchstack = ir3_shader_branchstack_hw(xs), 497 )); 498 break; 499 case MESA_SHADER_FRAGMENT: 500 tu_cs_emit_regs(cs, A6XX_SP_FS_CTRL_REG0( 501 .fullregfootprint = xs->info.max_reg + 1, 502 .halfregfootprint = xs->info.max_half_reg + 1, 503 .branchstack = ir3_shader_branchstack_hw(xs), 504 .mergedregs = xs->mergedregs, 505 .threadsize = thrsz, 506 .pixlodenable = xs->need_pixlod, 507 .diff_fine = xs->need_fine_derivatives, 508 .varying = xs->total_in != 0, 509 /* unknown bit, seems unnecessary */ 510 .unk24 = true, 511 )); 512 break; 513 case MESA_SHADER_COMPUTE: 514 tu_cs_emit_regs(cs, A6XX_SP_CS_CTRL_REG0( 515 .fullregfootprint = xs->info.max_reg + 1, 516 .halfregfootprint = xs->info.max_half_reg + 1, 517 .branchstack = ir3_shader_branchstack_hw(xs), 518 .mergedregs = xs->mergedregs, 519 .threadsize = thrsz, 520 )); 521 break; 522 default: 523 unreachable("bad shader stage"); 524 } 525 526 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_instrlen, 1); 527 tu_cs_emit(cs, xs->instrlen); 528 529 /* emit program binary & private memory layout 530 * binary_iova should be aligned to 1 instrlen unit (128 bytes) 531 */ 532 533 assert((binary_iova & 0x7f) == 0); 534 assert((pvtmem->iova & 0x1f) == 0); 535 536 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_first_exec_offset, 7); 537 tu_cs_emit(cs, 0); 538 tu_cs_emit_qw(cs, binary_iova); 539 tu_cs_emit(cs, 540 A6XX_SP_VS_PVT_MEM_PARAM_MEMSIZEPERITEM(pvtmem->per_fiber_size)); 541 tu_cs_emit_qw(cs, pvtmem->iova); 542 tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(pvtmem->per_sp_size) | 543 COND(pvtmem->per_wave, A6XX_SP_VS_PVT_MEM_SIZE_PERWAVEMEMLAYOUT)); 544 545 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_pvt_mem_hw_stack_offset, 1); 546 tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET_OFFSET(pvtmem->per_sp_size)); 547 548 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3); 549 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) | 550 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | 551 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | 552 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) | 553 CP_LOAD_STATE6_0_NUM_UNIT(xs->instrlen)); 554 tu_cs_emit_qw(cs, binary_iova); 555 556 /* emit immediates */ 557 558 const struct ir3_const_state *const_state = ir3_const_state(xs); 559 uint32_t base = const_state->offsets.immediate; 560 unsigned immediate_size = tu_xs_get_immediates_packet_size_dwords(xs); 561 562 if (immediate_size > 0) { 563 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + immediate_size); 564 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) | 565 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | 566 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | 567 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) | 568 CP_LOAD_STATE6_0_NUM_UNIT(immediate_size / 4)); 569 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); 570 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); 571 572 tu_cs_emit_array(cs, const_state->immediates, immediate_size); 573 } 574 575 if (const_state->constant_data_ubo != -1) { 576 uint64_t iova = binary_iova + xs->info.constant_data_offset; 577 578 /* Upload UBO state for the constant data. */ 579 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5); 580 tu_cs_emit(cs, 581 CP_LOAD_STATE6_0_DST_OFF(const_state->constant_data_ubo) | 582 CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO)| 583 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | 584 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) | 585 CP_LOAD_STATE6_0_NUM_UNIT(1)); 586 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); 587 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); 588 int size_vec4s = DIV_ROUND_UP(xs->constant_data_size, 16); 589 tu_cs_emit_qw(cs, 590 iova | 591 (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32); 592 593 /* Upload the constant data to the const file if needed. */ 594 const struct ir3_ubo_analysis_state *ubo_state = &const_state->ubo_state; 595 596 for (int i = 0; i < ubo_state->num_enabled; i++) { 597 if (ubo_state->range[i].ubo.block != const_state->constant_data_ubo || 598 ubo_state->range[i].ubo.bindless) { 599 continue; 600 } 601 602 uint32_t start = ubo_state->range[i].start; 603 uint32_t end = ubo_state->range[i].end; 604 uint32_t size = MIN2(end - start, 605 (16 * xs->constlen) - ubo_state->range[i].offset); 606 607 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3); 608 tu_cs_emit(cs, 609 CP_LOAD_STATE6_0_DST_OFF(ubo_state->range[i].offset / 16) | 610 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | 611 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | 612 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) | 613 CP_LOAD_STATE6_0_NUM_UNIT(size / 16)); 614 tu_cs_emit_qw(cs, iova + start); 615 } 616 } 617} 618 619static void 620tu6_emit_cs_config(struct tu_cs *cs, const struct tu_shader *shader, 621 const struct ir3_shader_variant *v, 622 const struct tu_pvtmem_config *pvtmem, 623 uint64_t binary_iova) 624{ 625 tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD( 626 .cs_state = true, 627 .cs_ibo = true)); 628 629 tu6_emit_xs_config(cs, MESA_SHADER_COMPUTE, v); 630 tu6_emit_xs(cs, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova); 631 632 uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1); 633 tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1); 634 tu_cs_emit(cs, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) | 635 A6XX_SP_CS_UNKNOWN_A9B1_UNK6); 636 637 if (cs->device->physical_device->info->a6xx.has_lpac) { 638 tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_UNKNOWN_B9D0, 1); 639 tu_cs_emit(cs, A6XX_HLSQ_CS_UNKNOWN_B9D0_SHARED_SIZE(shared_size) | 640 A6XX_HLSQ_CS_UNKNOWN_B9D0_UNK6); 641 } 642 643 uint32_t local_invocation_id = 644 ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID); 645 uint32_t work_group_id = 646 ir3_find_sysval_regid(v, SYSTEM_VALUE_WORKGROUP_ID); 647 648 enum a6xx_threadsize thrsz = v->info.double_threadsize ? THREAD128 : THREAD64; 649 tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL_0, 2); 650 tu_cs_emit(cs, 651 A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) | 652 A6XX_HLSQ_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) | 653 A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) | 654 A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id)); 655 tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) | 656 A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz)); 657 658 if (cs->device->physical_device->info->a6xx.has_lpac) { 659 tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 2); 660 tu_cs_emit(cs, 661 A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) | 662 A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) | 663 A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) | 664 A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id)); 665 tu_cs_emit(cs, A6XX_SP_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) | 666 A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz)); 667 } 668} 669 670static void 671tu6_emit_vs_system_values(struct tu_cs *cs, 672 const struct ir3_shader_variant *vs, 673 const struct ir3_shader_variant *hs, 674 const struct ir3_shader_variant *ds, 675 const struct ir3_shader_variant *gs, 676 bool primid_passthru) 677{ 678 const uint32_t vertexid_regid = 679 ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID); 680 const uint32_t instanceid_regid = 681 ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID); 682 const uint32_t tess_coord_x_regid = hs ? 683 ir3_find_sysval_regid(ds, SYSTEM_VALUE_TESS_COORD) : 684 regid(63, 0); 685 const uint32_t tess_coord_y_regid = VALIDREG(tess_coord_x_regid) ? 686 tess_coord_x_regid + 1 : 687 regid(63, 0); 688 const uint32_t hs_rel_patch_regid = hs ? 689 ir3_find_sysval_regid(hs, SYSTEM_VALUE_REL_PATCH_ID_IR3) : 690 regid(63, 0); 691 const uint32_t ds_rel_patch_regid = hs ? 692 ir3_find_sysval_regid(ds, SYSTEM_VALUE_REL_PATCH_ID_IR3) : 693 regid(63, 0); 694 const uint32_t hs_invocation_regid = hs ? 695 ir3_find_sysval_regid(hs, SYSTEM_VALUE_TCS_HEADER_IR3) : 696 regid(63, 0); 697 const uint32_t gs_primitiveid_regid = gs ? 698 ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID) : 699 regid(63, 0); 700 const uint32_t vs_primitiveid_regid = hs ? 701 ir3_find_sysval_regid(hs, SYSTEM_VALUE_PRIMITIVE_ID) : 702 gs_primitiveid_regid; 703 const uint32_t ds_primitiveid_regid = ds ? 704 ir3_find_sysval_regid(ds, SYSTEM_VALUE_PRIMITIVE_ID) : 705 regid(63, 0); 706 const uint32_t gsheader_regid = gs ? 707 ir3_find_sysval_regid(gs, SYSTEM_VALUE_GS_HEADER_IR3) : 708 regid(63, 0); 709 710 /* Note: we currently don't support multiview with tess or GS. If we did, 711 * and the HW actually works, then we'd have to somehow share this across 712 * stages. Note that the blob doesn't support this either. 713 */ 714 const uint32_t viewid_regid = 715 ir3_find_sysval_regid(vs, SYSTEM_VALUE_VIEW_INDEX); 716 717 tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_1, 6); 718 tu_cs_emit(cs, A6XX_VFD_CONTROL_1_REGID4VTX(vertexid_regid) | 719 A6XX_VFD_CONTROL_1_REGID4INST(instanceid_regid) | 720 A6XX_VFD_CONTROL_1_REGID4PRIMID(vs_primitiveid_regid) | 721 A6XX_VFD_CONTROL_1_REGID4VIEWID(viewid_regid)); 722 tu_cs_emit(cs, A6XX_VFD_CONTROL_2_REGID_HSRELPATCHID(hs_rel_patch_regid) | 723 A6XX_VFD_CONTROL_2_REGID_INVOCATIONID(hs_invocation_regid)); 724 tu_cs_emit(cs, A6XX_VFD_CONTROL_3_REGID_DSRELPATCHID(ds_rel_patch_regid) | 725 A6XX_VFD_CONTROL_3_REGID_TESSX(tess_coord_x_regid) | 726 A6XX_VFD_CONTROL_3_REGID_TESSY(tess_coord_y_regid) | 727 A6XX_VFD_CONTROL_3_REGID_DSPRIMID(ds_primitiveid_regid)); 728 tu_cs_emit(cs, 0x000000fc); /* VFD_CONTROL_4 */ 729 tu_cs_emit(cs, A6XX_VFD_CONTROL_5_REGID_GSHEADER(gsheader_regid) | 730 0xfc00); /* VFD_CONTROL_5 */ 731 tu_cs_emit(cs, COND(primid_passthru, A6XX_VFD_CONTROL_6_PRIMID_PASSTHRU)); /* VFD_CONTROL_6 */ 732} 733 734static void 735tu6_setup_streamout(struct tu_cs *cs, 736 const struct ir3_shader_variant *v, 737 struct ir3_shader_linkage *l) 738{ 739 const struct ir3_stream_output_info *info = &v->shader->stream_output; 740 /* Note: 64 here comes from the HW layout of the program RAM. The program 741 * for stream N is at DWORD 64 * N. 742 */ 743#define A6XX_SO_PROG_DWORDS 64 744 uint32_t prog[A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS] = {}; 745 BITSET_DECLARE(valid_dwords, A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) = {0}; 746 uint32_t ncomp[IR3_MAX_SO_BUFFERS] = {}; 747 748 /* TODO: streamout state should be in a non-GMEM draw state */ 749 750 /* no streamout: */ 751 if (info->num_outputs == 0) { 752 tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 4); 753 tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL); 754 tu_cs_emit(cs, 0); 755 tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL); 756 tu_cs_emit(cs, 0); 757 return; 758 } 759 760 /* is there something to do with info->stride[i]? */ 761 762 for (unsigned i = 0; i < info->num_outputs; i++) { 763 const struct ir3_stream_output *out = &info->output[i]; 764 unsigned k = out->register_index; 765 unsigned idx; 766 767 /* Skip it, if it's an output that was never assigned a register. */ 768 if (k >= v->outputs_count || v->outputs[k].regid == INVALID_REG) 769 continue; 770 771 ncomp[out->output_buffer] += out->num_components; 772 773 /* linkage map sorted by order frag shader wants things, so 774 * a bit less ideal here.. 775 */ 776 for (idx = 0; idx < l->cnt; idx++) 777 if (l->var[idx].regid == v->outputs[k].regid) 778 break; 779 780 debug_assert(idx < l->cnt); 781 782 for (unsigned j = 0; j < out->num_components; j++) { 783 unsigned c = j + out->start_component; 784 unsigned loc = l->var[idx].loc + c; 785 unsigned off = j + out->dst_offset; /* in dwords */ 786 787 assert(loc < A6XX_SO_PROG_DWORDS * 2); 788 unsigned dword = out->stream * A6XX_SO_PROG_DWORDS + loc/2; 789 if (loc & 1) { 790 prog[dword] |= A6XX_VPC_SO_PROG_B_EN | 791 A6XX_VPC_SO_PROG_B_BUF(out->output_buffer) | 792 A6XX_VPC_SO_PROG_B_OFF(off * 4); 793 } else { 794 prog[dword] |= A6XX_VPC_SO_PROG_A_EN | 795 A6XX_VPC_SO_PROG_A_BUF(out->output_buffer) | 796 A6XX_VPC_SO_PROG_A_OFF(off * 4); 797 } 798 BITSET_SET(valid_dwords, dword); 799 } 800 } 801 802 unsigned prog_count = 0; 803 unsigned start, end; 804 BITSET_FOREACH_RANGE(start, end, valid_dwords, 805 A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) { 806 prog_count += end - start + 1; 807 } 808 809 tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 10 + 2 * prog_count); 810 tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL); 811 tu_cs_emit(cs, A6XX_VPC_SO_STREAM_CNTL_STREAM_ENABLE(info->streams_written) | 812 COND(ncomp[0] > 0, 813 A6XX_VPC_SO_STREAM_CNTL_BUF0_STREAM(1 + info->buffer_to_stream[0])) | 814 COND(ncomp[1] > 0, 815 A6XX_VPC_SO_STREAM_CNTL_BUF1_STREAM(1 + info->buffer_to_stream[1])) | 816 COND(ncomp[2] > 0, 817 A6XX_VPC_SO_STREAM_CNTL_BUF2_STREAM(1 + info->buffer_to_stream[2])) | 818 COND(ncomp[3] > 0, 819 A6XX_VPC_SO_STREAM_CNTL_BUF3_STREAM(1 + info->buffer_to_stream[3]))); 820 for (uint32_t i = 0; i < 4; i++) { 821 tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(i)); 822 tu_cs_emit(cs, ncomp[i]); 823 } 824 bool first = true; 825 BITSET_FOREACH_RANGE(start, end, valid_dwords, 826 A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) { 827 tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL); 828 tu_cs_emit(cs, COND(first, A6XX_VPC_SO_CNTL_RESET) | 829 A6XX_VPC_SO_CNTL_ADDR(start)); 830 for (unsigned i = start; i < end; i++) { 831 tu_cs_emit(cs, REG_A6XX_VPC_SO_PROG); 832 tu_cs_emit(cs, prog[i]); 833 } 834 first = false; 835 } 836} 837 838static void 839tu6_emit_const(struct tu_cs *cs, uint32_t opcode, uint32_t base, 840 enum a6xx_state_block block, uint32_t offset, 841 uint32_t size, const uint32_t *dwords) { 842 assert(size % 4 == 0); 843 844 tu_cs_emit_pkt7(cs, opcode, 3 + size); 845 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) | 846 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | 847 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | 848 CP_LOAD_STATE6_0_STATE_BLOCK(block) | 849 CP_LOAD_STATE6_0_NUM_UNIT(size / 4)); 850 851 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); 852 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); 853 dwords = (uint32_t *)&((uint8_t *)dwords)[offset]; 854 855 tu_cs_emit_array(cs, dwords, size); 856} 857 858static void 859tu6_emit_link_map(struct tu_cs *cs, 860 const struct ir3_shader_variant *producer, 861 const struct ir3_shader_variant *consumer, 862 enum a6xx_state_block sb) 863{ 864 const struct ir3_const_state *const_state = ir3_const_state(consumer); 865 uint32_t base = const_state->offsets.primitive_map; 866 int size = DIV_ROUND_UP(consumer->input_size, 4); 867 868 size = (MIN2(size + base, consumer->constlen) - base) * 4; 869 if (size <= 0) 870 return; 871 872 tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, base, sb, 0, size, 873 producer->output_loc); 874} 875 876static uint16_t 877gl_primitive_to_tess(uint16_t primitive) { 878 switch (primitive) { 879 case GL_POINTS: 880 return TESS_POINTS; 881 case GL_LINE_STRIP: 882 return TESS_LINES; 883 case GL_TRIANGLE_STRIP: 884 return TESS_CW_TRIS; 885 default: 886 unreachable(""); 887 } 888} 889 890void 891tu6_emit_vpc(struct tu_cs *cs, 892 const struct ir3_shader_variant *vs, 893 const struct ir3_shader_variant *hs, 894 const struct ir3_shader_variant *ds, 895 const struct ir3_shader_variant *gs, 896 const struct ir3_shader_variant *fs, 897 uint32_t patch_control_points) 898{ 899 /* note: doesn't compile as static because of the array regs.. */ 900 const struct reg_config { 901 uint16_t reg_sp_xs_out_reg; 902 uint16_t reg_sp_xs_vpc_dst_reg; 903 uint16_t reg_vpc_xs_pack; 904 uint16_t reg_vpc_xs_clip_cntl; 905 uint16_t reg_gras_xs_cl_cntl; 906 uint16_t reg_pc_xs_out_cntl; 907 uint16_t reg_sp_xs_primitive_cntl; 908 uint16_t reg_vpc_xs_layer_cntl; 909 uint16_t reg_gras_xs_layer_cntl; 910 } reg_config[] = { 911 [MESA_SHADER_VERTEX] = { 912 REG_A6XX_SP_VS_OUT_REG(0), 913 REG_A6XX_SP_VS_VPC_DST_REG(0), 914 REG_A6XX_VPC_VS_PACK, 915 REG_A6XX_VPC_VS_CLIP_CNTL, 916 REG_A6XX_GRAS_VS_CL_CNTL, 917 REG_A6XX_PC_VS_OUT_CNTL, 918 REG_A6XX_SP_VS_PRIMITIVE_CNTL, 919 REG_A6XX_VPC_VS_LAYER_CNTL, 920 REG_A6XX_GRAS_VS_LAYER_CNTL 921 }, 922 [MESA_SHADER_TESS_CTRL] = { 923 0, 924 0, 925 0, 926 0, 927 0, 928 REG_A6XX_PC_HS_OUT_CNTL, 929 0, 930 0, 931 0 932 }, 933 [MESA_SHADER_TESS_EVAL] = { 934 REG_A6XX_SP_DS_OUT_REG(0), 935 REG_A6XX_SP_DS_VPC_DST_REG(0), 936 REG_A6XX_VPC_DS_PACK, 937 REG_A6XX_VPC_DS_CLIP_CNTL, 938 REG_A6XX_GRAS_DS_CL_CNTL, 939 REG_A6XX_PC_DS_OUT_CNTL, 940 REG_A6XX_SP_DS_PRIMITIVE_CNTL, 941 REG_A6XX_VPC_DS_LAYER_CNTL, 942 REG_A6XX_GRAS_DS_LAYER_CNTL 943 }, 944 [MESA_SHADER_GEOMETRY] = { 945 REG_A6XX_SP_GS_OUT_REG(0), 946 REG_A6XX_SP_GS_VPC_DST_REG(0), 947 REG_A6XX_VPC_GS_PACK, 948 REG_A6XX_VPC_GS_CLIP_CNTL, 949 REG_A6XX_GRAS_GS_CL_CNTL, 950 REG_A6XX_PC_GS_OUT_CNTL, 951 REG_A6XX_SP_GS_PRIMITIVE_CNTL, 952 REG_A6XX_VPC_GS_LAYER_CNTL, 953 REG_A6XX_GRAS_GS_LAYER_CNTL 954 }, 955 }; 956 957 const struct ir3_shader_variant *last_shader; 958 if (gs) { 959 last_shader = gs; 960 } else if (hs) { 961 last_shader = ds; 962 } else { 963 last_shader = vs; 964 } 965 966 const struct reg_config *cfg = ®_config[last_shader->type]; 967 968 struct ir3_shader_linkage linkage = { 969 .primid_loc = 0xff, 970 .clip0_loc = 0xff, 971 .clip1_loc = 0xff, 972 }; 973 if (fs) 974 ir3_link_shaders(&linkage, last_shader, fs, true); 975 976 if (last_shader->shader->stream_output.num_outputs) 977 ir3_link_stream_out(&linkage, last_shader); 978 979 /* We do this after linking shaders in order to know whether PrimID 980 * passthrough needs to be enabled. 981 */ 982 bool primid_passthru = linkage.primid_loc != 0xff; 983 tu6_emit_vs_system_values(cs, vs, hs, ds, gs, primid_passthru); 984 985 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VAR_DISABLE(0), 4); 986 tu_cs_emit(cs, ~linkage.varmask[0]); 987 tu_cs_emit(cs, ~linkage.varmask[1]); 988 tu_cs_emit(cs, ~linkage.varmask[2]); 989 tu_cs_emit(cs, ~linkage.varmask[3]); 990 991 /* a6xx finds position/pointsize at the end */ 992 const uint32_t pointsize_regid = 993 ir3_find_output_regid(last_shader, VARYING_SLOT_PSIZ); 994 const uint32_t layer_regid = 995 ir3_find_output_regid(last_shader, VARYING_SLOT_LAYER); 996 const uint32_t view_regid = 997 ir3_find_output_regid(last_shader, VARYING_SLOT_VIEWPORT); 998 const uint32_t clip0_regid = 999 ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST0); 1000 const uint32_t clip1_regid = 1001 ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST1); 1002 uint32_t flags_regid = gs ? 1003 ir3_find_output_regid(gs, VARYING_SLOT_GS_VERTEX_FLAGS_IR3) : 0; 1004 1005 uint32_t pointsize_loc = 0xff, position_loc = 0xff, layer_loc = 0xff, view_loc = 0xff; 1006 1007 if (layer_regid != regid(63, 0)) { 1008 layer_loc = linkage.max_loc; 1009 ir3_link_add(&linkage, layer_regid, 0x1, linkage.max_loc); 1010 } 1011 1012 if (view_regid != regid(63, 0)) { 1013 view_loc = linkage.max_loc; 1014 ir3_link_add(&linkage, view_regid, 0x1, linkage.max_loc); 1015 } 1016 1017 unsigned extra_pos = 0; 1018 1019 for (unsigned i = 0; i < last_shader->outputs_count; i++) { 1020 if (last_shader->outputs[i].slot != VARYING_SLOT_POS) 1021 continue; 1022 1023 if (position_loc == 0xff) 1024 position_loc = linkage.max_loc; 1025 1026 ir3_link_add(&linkage, last_shader->outputs[i].regid, 1027 0xf, position_loc + 4 * last_shader->outputs[i].view); 1028 extra_pos = MAX2(extra_pos, last_shader->outputs[i].view); 1029 } 1030 1031 if (pointsize_regid != regid(63, 0)) { 1032 pointsize_loc = linkage.max_loc; 1033 ir3_link_add(&linkage, pointsize_regid, 0x1, linkage.max_loc); 1034 } 1035 1036 uint8_t clip_cull_mask = last_shader->clip_mask | last_shader->cull_mask; 1037 1038 /* Handle the case where clip/cull distances aren't read by the FS */ 1039 uint32_t clip0_loc = linkage.clip0_loc, clip1_loc = linkage.clip1_loc; 1040 if (clip0_loc == 0xff && clip0_regid != regid(63, 0)) { 1041 clip0_loc = linkage.max_loc; 1042 ir3_link_add(&linkage, clip0_regid, clip_cull_mask & 0xf, linkage.max_loc); 1043 } 1044 if (clip1_loc == 0xff && clip1_regid != regid(63, 0)) { 1045 clip1_loc = linkage.max_loc; 1046 ir3_link_add(&linkage, clip1_regid, clip_cull_mask >> 4, linkage.max_loc); 1047 } 1048 1049 tu6_setup_streamout(cs, last_shader, &linkage); 1050 1051 /* The GPU hangs on some models when there are no outputs (xs_pack::CNT), 1052 * at least when a DS is the last stage, so add a dummy output to keep it 1053 * happy if there aren't any. We do this late in order to avoid emitting 1054 * any unused code and make sure that optimizations don't remove it. 1055 */ 1056 if (linkage.cnt == 0) 1057 ir3_link_add(&linkage, 0, 0x1, linkage.max_loc); 1058 1059 /* map outputs of the last shader to VPC */ 1060 assert(linkage.cnt <= 32); 1061 const uint32_t sp_out_count = DIV_ROUND_UP(linkage.cnt, 2); 1062 const uint32_t sp_vpc_dst_count = DIV_ROUND_UP(linkage.cnt, 4); 1063 uint32_t sp_out[16] = {0}; 1064 uint32_t sp_vpc_dst[8] = {0}; 1065 for (uint32_t i = 0; i < linkage.cnt; i++) { 1066 ((uint16_t *) sp_out)[i] = 1067 A6XX_SP_VS_OUT_REG_A_REGID(linkage.var[i].regid) | 1068 A6XX_SP_VS_OUT_REG_A_COMPMASK(linkage.var[i].compmask); 1069 ((uint8_t *) sp_vpc_dst)[i] = 1070 A6XX_SP_VS_VPC_DST_REG_OUTLOC0(linkage.var[i].loc); 1071 } 1072 1073 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_out_reg, sp_out_count); 1074 tu_cs_emit_array(cs, sp_out, sp_out_count); 1075 1076 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_vpc_dst_reg, sp_vpc_dst_count); 1077 tu_cs_emit_array(cs, sp_vpc_dst, sp_vpc_dst_count); 1078 1079 tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_pack, 1); 1080 tu_cs_emit(cs, A6XX_VPC_VS_PACK_POSITIONLOC(position_loc) | 1081 A6XX_VPC_VS_PACK_PSIZELOC(pointsize_loc) | 1082 A6XX_VPC_VS_PACK_STRIDE_IN_VPC(linkage.max_loc) | 1083 A6XX_VPC_VS_PACK_EXTRAPOS(extra_pos)); 1084 1085 tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_clip_cntl, 1); 1086 tu_cs_emit(cs, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) | 1087 A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) | 1088 A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc)); 1089 1090 tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_cl_cntl, 1); 1091 tu_cs_emit(cs, A6XX_GRAS_VS_CL_CNTL_CLIP_MASK(last_shader->clip_mask) | 1092 A6XX_GRAS_VS_CL_CNTL_CULL_MASK(last_shader->cull_mask)); 1093 1094 const struct ir3_shader_variant *geom_shaders[] = { vs, hs, ds, gs }; 1095 1096 for (unsigned i = 0; i < ARRAY_SIZE(geom_shaders); i++) { 1097 const struct ir3_shader_variant *shader = geom_shaders[i]; 1098 if (!shader) 1099 continue; 1100 1101 bool primid = shader->type != MESA_SHADER_VERTEX && 1102 VALIDREG(ir3_find_sysval_regid(shader, SYSTEM_VALUE_PRIMITIVE_ID)); 1103 1104 tu_cs_emit_pkt4(cs, reg_config[shader->type].reg_pc_xs_out_cntl, 1); 1105 if (shader == last_shader) { 1106 tu_cs_emit(cs, A6XX_PC_VS_OUT_CNTL_STRIDE_IN_VPC(linkage.max_loc) | 1107 CONDREG(pointsize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE) | 1108 CONDREG(layer_regid, A6XX_PC_VS_OUT_CNTL_LAYER) | 1109 CONDREG(view_regid, A6XX_PC_VS_OUT_CNTL_VIEW) | 1110 COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID) | 1111 A6XX_PC_VS_OUT_CNTL_CLIP_MASK(clip_cull_mask)); 1112 } else { 1113 tu_cs_emit(cs, COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID)); 1114 } 1115 } 1116 1117 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_primitive_cntl, 1); 1118 tu_cs_emit(cs, A6XX_SP_VS_PRIMITIVE_CNTL_OUT(linkage.cnt) | 1119 A6XX_SP_GS_PRIMITIVE_CNTL_FLAGS_REGID(flags_regid)); 1120 1121 tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_layer_cntl, 1); 1122 tu_cs_emit(cs, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) | 1123 A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(view_loc)); 1124 1125 tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_layer_cntl, 1); 1126 tu_cs_emit(cs, CONDREG(layer_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_LAYER) | 1127 CONDREG(view_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_VIEW)); 1128 1129 tu_cs_emit_regs(cs, A6XX_PC_PRIMID_PASSTHRU(primid_passthru)); 1130 1131 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_CNTL_0, 1); 1132 tu_cs_emit(cs, A6XX_VPC_CNTL_0_NUMNONPOSVAR(fs ? fs->total_in : 0) | 1133 COND(fs && fs->total_in, A6XX_VPC_CNTL_0_VARYING) | 1134 A6XX_VPC_CNTL_0_PRIMIDLOC(linkage.primid_loc) | 1135 A6XX_VPC_CNTL_0_VIEWIDLOC(linkage.viewid_loc)); 1136 1137 if (hs) { 1138 shader_info *hs_info = &hs->shader->nir->info; 1139 1140 tu_cs_emit_pkt4(cs, REG_A6XX_PC_TESS_NUM_VERTEX, 1); 1141 tu_cs_emit(cs, hs_info->tess.tcs_vertices_out); 1142 1143 /* Total attribute slots in HS incoming patch. */ 1144 tu_cs_emit_pkt4(cs, REG_A6XX_PC_HS_INPUT_SIZE, 1); 1145 tu_cs_emit(cs, patch_control_points * vs->output_size / 4); 1146 1147 const uint32_t wavesize = 64; 1148 const uint32_t max_wave_input_size = 64; 1149 1150 /* note: if HS is really just the VS extended, then this 1151 * should be by MAX2(patch_control_points, hs_info->tess.tcs_vertices_out) 1152 * however that doesn't match the blob, and fails some dEQP tests. 1153 */ 1154 uint32_t prims_per_wave = wavesize / hs_info->tess.tcs_vertices_out; 1155 uint32_t max_prims_per_wave = 1156 max_wave_input_size * wavesize / (vs->output_size * patch_control_points); 1157 prims_per_wave = MIN2(prims_per_wave, max_prims_per_wave); 1158 1159 uint32_t total_size = vs->output_size * patch_control_points * prims_per_wave; 1160 uint32_t wave_input_size = DIV_ROUND_UP(total_size, wavesize); 1161 1162 tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); 1163 tu_cs_emit(cs, wave_input_size); 1164 1165 /* In SPIR-V generated from GLSL, the tessellation primitive params are 1166 * are specified in the tess eval shader, but in SPIR-V generated from 1167 * HLSL, they are specified in the tess control shader. */ 1168 shader_info *tess_info = 1169 ds->shader->nir->info.tess.spacing == TESS_SPACING_UNSPECIFIED ? 1170 &hs->shader->nir->info : &ds->shader->nir->info; 1171 tu_cs_emit_pkt4(cs, REG_A6XX_PC_TESS_CNTL, 1); 1172 uint32_t output; 1173 if (tess_info->tess.point_mode) 1174 output = TESS_POINTS; 1175 else if (tess_info->tess.primitive_mode == GL_ISOLINES) 1176 output = TESS_LINES; 1177 else if (tess_info->tess.ccw) 1178 output = TESS_CCW_TRIS; 1179 else 1180 output = TESS_CW_TRIS; 1181 1182 enum a6xx_tess_spacing spacing; 1183 switch (tess_info->tess.spacing) { 1184 case TESS_SPACING_EQUAL: 1185 spacing = TESS_EQUAL; 1186 break; 1187 case TESS_SPACING_FRACTIONAL_ODD: 1188 spacing = TESS_FRACTIONAL_ODD; 1189 break; 1190 case TESS_SPACING_FRACTIONAL_EVEN: 1191 spacing = TESS_FRACTIONAL_EVEN; 1192 break; 1193 case TESS_SPACING_UNSPECIFIED: 1194 default: 1195 unreachable("invalid tess spacing"); 1196 } 1197 tu_cs_emit(cs, A6XX_PC_TESS_CNTL_SPACING(spacing) | 1198 A6XX_PC_TESS_CNTL_OUTPUT(output)); 1199 1200 tu6_emit_link_map(cs, vs, hs, SB6_HS_SHADER); 1201 tu6_emit_link_map(cs, hs, ds, SB6_DS_SHADER); 1202 } 1203 1204 1205 if (gs) { 1206 uint32_t vertices_out, invocations, output, vec4_size; 1207 uint32_t prev_stage_output_size = ds ? ds->output_size : vs->output_size; 1208 1209 /* this detects the tu_clear_blit path, which doesn't set ->nir */ 1210 if (gs->shader->nir) { 1211 if (hs) { 1212 tu6_emit_link_map(cs, ds, gs, SB6_GS_SHADER); 1213 } else { 1214 tu6_emit_link_map(cs, vs, gs, SB6_GS_SHADER); 1215 } 1216 vertices_out = gs->shader->nir->info.gs.vertices_out - 1; 1217 output = gl_primitive_to_tess(gs->shader->nir->info.gs.output_primitive); 1218 invocations = gs->shader->nir->info.gs.invocations - 1; 1219 /* Size of per-primitive alloction in ldlw memory in vec4s. */ 1220 vec4_size = gs->shader->nir->info.gs.vertices_in * 1221 DIV_ROUND_UP(prev_stage_output_size, 4); 1222 } else { 1223 vertices_out = 3; 1224 output = TESS_CW_TRIS; 1225 invocations = 0; 1226 vec4_size = 0; 1227 } 1228 1229 tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1); 1230 tu_cs_emit(cs, 1231 A6XX_PC_PRIMITIVE_CNTL_5_GS_VERTICES_OUT(vertices_out) | 1232 A6XX_PC_PRIMITIVE_CNTL_5_GS_OUTPUT(output) | 1233 A6XX_PC_PRIMITIVE_CNTL_5_GS_INVOCATIONS(invocations)); 1234 1235 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_GS_PARAM, 1); 1236 tu_cs_emit(cs, 0xff); 1237 1238 tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1); 1239 tu_cs_emit(cs, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size)); 1240 1241 uint32_t prim_size = prev_stage_output_size; 1242 if (prim_size > 64) 1243 prim_size = 64; 1244 else if (prim_size == 64) 1245 prim_size = 63; 1246 tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_PRIM_SIZE, 1); 1247 tu_cs_emit(cs, prim_size); 1248 } 1249} 1250 1251static int 1252tu6_vpc_varying_mode(const struct ir3_shader_variant *fs, 1253 uint32_t index, 1254 uint8_t *interp_mode, 1255 uint8_t *ps_repl_mode) 1256{ 1257 enum 1258 { 1259 INTERP_SMOOTH = 0, 1260 INTERP_FLAT = 1, 1261 INTERP_ZERO = 2, 1262 INTERP_ONE = 3, 1263 }; 1264 enum 1265 { 1266 PS_REPL_NONE = 0, 1267 PS_REPL_S = 1, 1268 PS_REPL_T = 2, 1269 PS_REPL_ONE_MINUS_T = 3, 1270 }; 1271 1272 const uint32_t compmask = fs->inputs[index].compmask; 1273 1274 /* NOTE: varyings are packed, so if compmask is 0xb then first, second, and 1275 * fourth component occupy three consecutive varying slots 1276 */ 1277 int shift = 0; 1278 *interp_mode = 0; 1279 *ps_repl_mode = 0; 1280 if (fs->inputs[index].slot == VARYING_SLOT_PNTC) { 1281 if (compmask & 0x1) { 1282 *ps_repl_mode |= PS_REPL_S << shift; 1283 shift += 2; 1284 } 1285 if (compmask & 0x2) { 1286 *ps_repl_mode |= PS_REPL_T << shift; 1287 shift += 2; 1288 } 1289 if (compmask & 0x4) { 1290 *interp_mode |= INTERP_ZERO << shift; 1291 shift += 2; 1292 } 1293 if (compmask & 0x8) { 1294 *interp_mode |= INTERP_ONE << 6; 1295 shift += 2; 1296 } 1297 } else if (fs->inputs[index].flat) { 1298 for (int i = 0; i < 4; i++) { 1299 if (compmask & (1 << i)) { 1300 *interp_mode |= INTERP_FLAT << shift; 1301 shift += 2; 1302 } 1303 } 1304 } 1305 1306 return shift; 1307} 1308 1309static void 1310tu6_emit_vpc_varying_modes(struct tu_cs *cs, 1311 const struct ir3_shader_variant *fs) 1312{ 1313 uint32_t interp_modes[8] = { 0 }; 1314 uint32_t ps_repl_modes[8] = { 0 }; 1315 1316 if (fs) { 1317 for (int i = -1; 1318 (i = ir3_next_varying(fs, i)) < (int) fs->inputs_count;) { 1319 1320 /* get the mode for input i */ 1321 uint8_t interp_mode; 1322 uint8_t ps_repl_mode; 1323 const int bits = 1324 tu6_vpc_varying_mode(fs, i, &interp_mode, &ps_repl_mode); 1325 1326 /* OR the mode into the array */ 1327 const uint32_t inloc = fs->inputs[i].inloc * 2; 1328 uint32_t n = inloc / 32; 1329 uint32_t shift = inloc % 32; 1330 interp_modes[n] |= interp_mode << shift; 1331 ps_repl_modes[n] |= ps_repl_mode << shift; 1332 if (shift + bits > 32) { 1333 n++; 1334 shift = 32 - shift; 1335 1336 interp_modes[n] |= interp_mode >> shift; 1337 ps_repl_modes[n] |= ps_repl_mode >> shift; 1338 } 1339 } 1340 } 1341 1342 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8); 1343 tu_cs_emit_array(cs, interp_modes, 8); 1344 1345 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8); 1346 tu_cs_emit_array(cs, ps_repl_modes, 8); 1347} 1348 1349void 1350tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs) 1351{ 1352 uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid; 1353 uint32_t ij_regid[IJ_COUNT]; 1354 uint32_t smask_in_regid; 1355 1356 bool sample_shading = fs->per_samp | fs->key.sample_shading; 1357 bool enable_varyings = fs->total_in > 0; 1358 1359 samp_id_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID); 1360 smask_in_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN); 1361 face_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE); 1362 coord_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD); 1363 zwcoord_regid = VALIDREG(coord_regid) ? coord_regid + 2 : regid(63, 0); 1364 for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++) 1365 ij_regid[i] = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i); 1366 1367 if (fs->num_sampler_prefetch > 0) { 1368 assert(VALIDREG(ij_regid[IJ_PERSP_PIXEL])); 1369 /* also, it seems like ij_pix is *required* to be r0.x */ 1370 assert(ij_regid[IJ_PERSP_PIXEL] == regid(0, 0)); 1371 } 1372 1373 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch); 1374 tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) | 1375 A6XX_SP_FS_PREFETCH_CNTL_UNK4(regid(63, 0)) | 1376 0x7000); // XXX); 1377 for (int i = 0; i < fs->num_sampler_prefetch; i++) { 1378 const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i]; 1379 tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CMD_SRC(prefetch->src) | 1380 A6XX_SP_FS_PREFETCH_CMD_SAMP_ID(prefetch->samp_id) | 1381 A6XX_SP_FS_PREFETCH_CMD_TEX_ID(prefetch->tex_id) | 1382 A6XX_SP_FS_PREFETCH_CMD_DST(prefetch->dst) | 1383 A6XX_SP_FS_PREFETCH_CMD_WRMASK(prefetch->wrmask) | 1384 COND(prefetch->half_precision, A6XX_SP_FS_PREFETCH_CMD_HALF) | 1385 A6XX_SP_FS_PREFETCH_CMD_CMD(prefetch->cmd)); 1386 } 1387 1388 if (fs->num_sampler_prefetch > 0) { 1389 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_BINDLESS_PREFETCH_CMD(0), fs->num_sampler_prefetch); 1390 for (int i = 0; i < fs->num_sampler_prefetch; i++) { 1391 const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i]; 1392 tu_cs_emit(cs, 1393 A6XX_SP_FS_BINDLESS_PREFETCH_CMD_SAMP_ID(prefetch->samp_bindless_id) | 1394 A6XX_SP_FS_BINDLESS_PREFETCH_CMD_TEX_ID(prefetch->tex_bindless_id)); 1395 } 1396 } 1397 1398 tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CONTROL_1_REG, 5); 1399 tu_cs_emit(cs, 0x7); 1400 tu_cs_emit(cs, A6XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) | 1401 A6XX_HLSQ_CONTROL_2_REG_SAMPLEID(samp_id_regid) | 1402 A6XX_HLSQ_CONTROL_2_REG_SAMPLEMASK(smask_in_regid) | 1403 A6XX_HLSQ_CONTROL_2_REG_SIZE(ij_regid[IJ_PERSP_SIZE])); 1404 tu_cs_emit(cs, A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_PIXEL(ij_regid[IJ_PERSP_PIXEL]) | 1405 A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_PIXEL(ij_regid[IJ_LINEAR_PIXEL]) | 1406 A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_CENTROID(ij_regid[IJ_PERSP_CENTROID]) | 1407 A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_CENTROID(ij_regid[IJ_LINEAR_CENTROID])); 1408 tu_cs_emit(cs, A6XX_HLSQ_CONTROL_4_REG_XYCOORDREGID(coord_regid) | 1409 A6XX_HLSQ_CONTROL_4_REG_ZWCOORDREGID(zwcoord_regid) | 1410 A6XX_HLSQ_CONTROL_4_REG_IJ_PERSP_SAMPLE(ij_regid[IJ_PERSP_SAMPLE]) | 1411 A6XX_HLSQ_CONTROL_4_REG_IJ_LINEAR_SAMPLE(ij_regid[IJ_LINEAR_SAMPLE])); 1412 tu_cs_emit(cs, 0xfcfc); 1413 1414 enum a6xx_threadsize thrsz = fs->info.double_threadsize ? THREAD128 : THREAD64; 1415 tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL_0, 1); 1416 tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(thrsz) | 1417 COND(enable_varyings, A6XX_HLSQ_FS_CNTL_0_VARYINGS)); 1418 1419 bool need_size = fs->frag_face || fs->fragcoord_compmask != 0; 1420 bool need_size_persamp = false; 1421 if (VALIDREG(ij_regid[IJ_PERSP_SIZE])) { 1422 if (sample_shading) 1423 need_size_persamp = true; 1424 else 1425 need_size = true; 1426 } 1427 1428 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CNTL, 1); 1429 tu_cs_emit(cs, 1430 CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_GRAS_CNTL_IJ_PERSP_PIXEL) | 1431 CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_GRAS_CNTL_IJ_PERSP_CENTROID) | 1432 CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_GRAS_CNTL_IJ_PERSP_SAMPLE) | 1433 CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) | 1434 CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_GRAS_CNTL_IJ_LINEAR_CENTROID) | 1435 CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) | 1436 COND(need_size, A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) | 1437 COND(need_size_persamp, A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) | 1438 COND(fs->fragcoord_compmask != 0, A6XX_GRAS_CNTL_COORD_MASK(fs->fragcoord_compmask))); 1439 1440 tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_CONTROL0, 2); 1441 tu_cs_emit(cs, 1442 CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) | 1443 CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) | 1444 CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) | 1445 CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) | 1446 CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_CENTROID) | 1447 CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) | 1448 COND(need_size, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) | 1449 COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) | 1450 COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) | 1451 COND(fs->fragcoord_compmask != 0, 1452 A6XX_RB_RENDER_CONTROL0_COORD_MASK(fs->fragcoord_compmask))); 1453 tu_cs_emit(cs, 1454 A6XX_RB_RENDER_CONTROL1_FRAGCOORDSAMPLEMODE( 1455 sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER) | 1456 CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) | 1457 CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) | 1458 CONDREG(ij_regid[IJ_PERSP_SIZE], A6XX_RB_RENDER_CONTROL1_SIZE) | 1459 COND(fs->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS)); 1460 1461 tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CNTL, 1); 1462 tu_cs_emit(cs, COND(sample_shading, A6XX_RB_SAMPLE_CNTL_PER_SAMP_MODE)); 1463 1464 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 1); 1465 tu_cs_emit(cs, CONDREG(samp_id_regid, A6XX_GRAS_LRZ_PS_INPUT_CNTL_SAMPLEID) | 1466 A6XX_GRAS_LRZ_PS_INPUT_CNTL_FRAGCOORDSAMPLEMODE( 1467 sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER)); 1468 1469 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CNTL, 1); 1470 tu_cs_emit(cs, COND(sample_shading, A6XX_GRAS_SAMPLE_CNTL_PER_SAMP_MODE)); 1471} 1472 1473static void 1474tu6_emit_fs_outputs(struct tu_cs *cs, 1475 const struct ir3_shader_variant *fs, 1476 uint32_t mrt_count, bool dual_src_blend, 1477 uint32_t render_components, 1478 bool no_earlyz, 1479 struct tu_pipeline *pipeline) 1480{ 1481 uint32_t smask_regid, posz_regid, stencilref_regid; 1482 1483 posz_regid = ir3_find_output_regid(fs, FRAG_RESULT_DEPTH); 1484 smask_regid = ir3_find_output_regid(fs, FRAG_RESULT_SAMPLE_MASK); 1485 stencilref_regid = ir3_find_output_regid(fs, FRAG_RESULT_STENCIL); 1486 1487 uint32_t fragdata_regid[8]; 1488 if (fs->color0_mrt) { 1489 fragdata_regid[0] = ir3_find_output_regid(fs, FRAG_RESULT_COLOR); 1490 for (uint32_t i = 1; i < ARRAY_SIZE(fragdata_regid); i++) 1491 fragdata_regid[i] = fragdata_regid[0]; 1492 } else { 1493 for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++) 1494 fragdata_regid[i] = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + i); 1495 } 1496 1497 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2); 1498 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) | 1499 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) | 1500 A6XX_SP_FS_OUTPUT_CNTL0_STENCILREF_REGID(stencilref_regid) | 1501 COND(dual_src_blend, A6XX_SP_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE)); 1502 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count)); 1503 1504 uint32_t fs_render_components = 0; 1505 1506 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 8); 1507 for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++) { 1508 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(fragdata_regid[i]) | 1509 (COND(fragdata_regid[i] & HALF_REG_ID, 1510 A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION))); 1511 1512 if (VALIDREG(fragdata_regid[i])) { 1513 fs_render_components |= 0xf << (i * 4); 1514 } 1515 } 1516 1517 /* dual source blending has an extra fs output in the 2nd slot */ 1518 if (dual_src_blend) { 1519 fs_render_components |= 0xf << 4; 1520 } 1521 1522 /* There is no point in having component enabled which is not written 1523 * by the shader. Per VK spec it is an UB, however a few apps depend on 1524 * attachment not being changed if FS doesn't have corresponding output. 1525 */ 1526 fs_render_components &= render_components; 1527 1528 tu_cs_emit_regs(cs, 1529 A6XX_SP_FS_RENDER_COMPONENTS(.dword = fs_render_components)); 1530 1531 tu_cs_emit_pkt4(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 2); 1532 tu_cs_emit(cs, COND(fs->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) | 1533 COND(fs->writes_smask, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_SAMPMASK) | 1534 COND(fs->writes_stencilref, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_STENCILREF) | 1535 COND(dual_src_blend, A6XX_RB_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE)); 1536 tu_cs_emit(cs, A6XX_RB_FS_OUTPUT_CNTL1_MRT(mrt_count)); 1537 1538 tu_cs_emit_regs(cs, 1539 A6XX_RB_RENDER_COMPONENTS(.dword = fs_render_components)); 1540 1541 if (pipeline) { 1542 pipeline->lrz.fs_has_kill = fs->has_kill; 1543 pipeline->lrz.early_fragment_tests = fs->shader->nir->info.fs.early_fragment_tests; 1544 1545 if ((fs->shader && !fs->shader->nir->info.fs.early_fragment_tests) && 1546 (fs->no_earlyz || fs->has_kill || fs->writes_pos || fs->writes_stencilref || no_earlyz || fs->writes_smask)) { 1547 pipeline->lrz.force_late_z = true; 1548 } 1549 } 1550} 1551 1552static void 1553tu6_emit_geom_tess_consts(struct tu_cs *cs, 1554 const struct ir3_shader_variant *vs, 1555 const struct ir3_shader_variant *hs, 1556 const struct ir3_shader_variant *ds, 1557 const struct ir3_shader_variant *gs, 1558 uint32_t cps_per_patch) 1559{ 1560 uint32_t num_vertices = 1561 hs ? cps_per_patch : gs->shader->nir->info.gs.vertices_in; 1562 1563 uint32_t vs_params[4] = { 1564 vs->output_size * num_vertices * 4, /* vs primitive stride */ 1565 vs->output_size * 4, /* vs vertex stride */ 1566 0, 1567 0, 1568 }; 1569 uint32_t vs_base = ir3_const_state(vs)->offsets.primitive_param; 1570 tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, vs_base, SB6_VS_SHADER, 0, 1571 ARRAY_SIZE(vs_params), vs_params); 1572 1573 if (hs) { 1574 assert(ds->type != MESA_SHADER_NONE); 1575 uint32_t hs_params[4] = { 1576 vs->output_size * num_vertices * 4, /* hs primitive stride */ 1577 vs->output_size * 4, /* hs vertex stride */ 1578 hs->output_size, 1579 cps_per_patch, 1580 }; 1581 1582 uint32_t hs_base = hs->const_state->offsets.primitive_param; 1583 tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, hs_base, SB6_HS_SHADER, 0, 1584 ARRAY_SIZE(hs_params), hs_params); 1585 if (gs) 1586 num_vertices = gs->shader->nir->info.gs.vertices_in; 1587 1588 uint32_t ds_params[4] = { 1589 ds->output_size * num_vertices * 4, /* ds primitive stride */ 1590 ds->output_size * 4, /* ds vertex stride */ 1591 hs->output_size, /* hs vertex stride (dwords) */ 1592 hs->shader->nir->info.tess.tcs_vertices_out 1593 }; 1594 1595 uint32_t ds_base = ds->const_state->offsets.primitive_param; 1596 tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, ds_base, SB6_DS_SHADER, 0, 1597 ARRAY_SIZE(ds_params), ds_params); 1598 } 1599 1600 if (gs) { 1601 const struct ir3_shader_variant *prev = ds ? ds : vs; 1602 uint32_t gs_params[4] = { 1603 prev->output_size * num_vertices * 4, /* gs primitive stride */ 1604 prev->output_size * 4, /* gs vertex stride */ 1605 0, 1606 0, 1607 }; 1608 uint32_t gs_base = gs->const_state->offsets.primitive_param; 1609 tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, gs_base, SB6_GS_SHADER, 0, 1610 ARRAY_SIZE(gs_params), gs_params); 1611 } 1612} 1613 1614static void 1615tu6_emit_program_config(struct tu_cs *cs, 1616 struct tu_pipeline_builder *builder) 1617{ 1618 gl_shader_stage stage = MESA_SHADER_VERTEX; 1619 1620 STATIC_ASSERT(MESA_SHADER_VERTEX == 0); 1621 1622 tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD( 1623 .vs_state = true, 1624 .hs_state = true, 1625 .ds_state = true, 1626 .gs_state = true, 1627 .fs_state = true, 1628 .gfx_ibo = true)); 1629 for (; stage < ARRAY_SIZE(builder->shaders); stage++) { 1630 tu6_emit_xs_config(cs, stage, builder->variants[stage]); 1631 } 1632} 1633 1634static void 1635tu6_emit_program(struct tu_cs *cs, 1636 struct tu_pipeline_builder *builder, 1637 bool binning_pass, 1638 struct tu_pipeline *pipeline) 1639{ 1640 const struct ir3_shader_variant *vs = builder->variants[MESA_SHADER_VERTEX]; 1641 const struct ir3_shader_variant *bs = builder->binning_variant; 1642 const struct ir3_shader_variant *hs = builder->variants[MESA_SHADER_TESS_CTRL]; 1643 const struct ir3_shader_variant *ds = builder->variants[MESA_SHADER_TESS_EVAL]; 1644 const struct ir3_shader_variant *gs = builder->variants[MESA_SHADER_GEOMETRY]; 1645 const struct ir3_shader_variant *fs = builder->variants[MESA_SHADER_FRAGMENT]; 1646 gl_shader_stage stage = MESA_SHADER_VERTEX; 1647 uint32_t cps_per_patch = builder->create_info->pTessellationState ? 1648 builder->create_info->pTessellationState->patchControlPoints : 0; 1649 bool multi_pos_output = builder->shaders[MESA_SHADER_VERTEX]->multi_pos_output; 1650 1651 /* Don't use the binning pass variant when GS is present because we don't 1652 * support compiling correct binning pass variants with GS. 1653 */ 1654 if (binning_pass && !gs) { 1655 vs = bs; 1656 tu6_emit_xs(cs, stage, bs, &builder->pvtmem, builder->binning_vs_iova); 1657 stage++; 1658 } 1659 1660 for (; stage < ARRAY_SIZE(builder->shaders); stage++) { 1661 const struct ir3_shader_variant *xs = builder->variants[stage]; 1662 1663 if (stage == MESA_SHADER_FRAGMENT && binning_pass) 1664 fs = xs = NULL; 1665 1666 tu6_emit_xs(cs, stage, xs, &builder->pvtmem, builder->shader_iova[stage]); 1667 } 1668 1669 uint32_t multiview_views = util_logbase2(builder->multiview_mask) + 1; 1670 uint32_t multiview_cntl = builder->multiview_mask ? 1671 A6XX_PC_MULTIVIEW_CNTL_ENABLE | 1672 A6XX_PC_MULTIVIEW_CNTL_VIEWS(multiview_views) | 1673 COND(!multi_pos_output, A6XX_PC_MULTIVIEW_CNTL_DISABLEMULTIPOS) 1674 : 0; 1675 1676 /* Copy what the blob does here. This will emit an extra 0x3f 1677 * CP_EVENT_WRITE when multiview is disabled. I'm not exactly sure what 1678 * this is working around yet. 1679 */ 1680 if (builder->device->physical_device->info->a6xx.has_cp_reg_write) { 1681 tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3); 1682 tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE)); 1683 tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL); 1684 } else { 1685 tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_CNTL, 1); 1686 } 1687 tu_cs_emit(cs, multiview_cntl); 1688 1689 tu_cs_emit_pkt4(cs, REG_A6XX_VFD_MULTIVIEW_CNTL, 1); 1690 tu_cs_emit(cs, multiview_cntl); 1691 1692 if (multiview_cntl && 1693 builder->device->physical_device->info->a6xx.supports_multiview_mask) { 1694 tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_MASK, 1); 1695 tu_cs_emit(cs, builder->multiview_mask); 1696 } 1697 1698 tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); 1699 tu_cs_emit(cs, 0); 1700 1701 tu6_emit_vpc(cs, vs, hs, ds, gs, fs, cps_per_patch); 1702 tu6_emit_vpc_varying_modes(cs, fs); 1703 1704 bool no_earlyz = builder->depth_attachment_format == VK_FORMAT_S8_UINT; 1705 uint32_t mrt_count = builder->color_attachment_count; 1706 uint32_t render_components = builder->render_components; 1707 1708 if (builder->alpha_to_coverage) { 1709 /* alpha to coverage can behave like a discard */ 1710 no_earlyz = true; 1711 /* alpha value comes from first mrt */ 1712 render_components |= 0xf; 1713 if (!mrt_count) { 1714 mrt_count = 1; 1715 /* Disable memory write for dummy mrt because it doesn't get set otherwise */ 1716 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0, .component_enable = 0)); 1717 } 1718 } 1719 1720 if (fs) { 1721 tu6_emit_fs_inputs(cs, fs); 1722 tu6_emit_fs_outputs(cs, fs, mrt_count, 1723 builder->use_dual_src_blend, 1724 render_components, 1725 no_earlyz, 1726 pipeline); 1727 } else { 1728 /* TODO: check if these can be skipped if fs is disabled */ 1729 struct ir3_shader_variant dummy_variant = {}; 1730 tu6_emit_fs_inputs(cs, &dummy_variant); 1731 tu6_emit_fs_outputs(cs, &dummy_variant, mrt_count, 1732 builder->use_dual_src_blend, 1733 render_components, 1734 no_earlyz, 1735 NULL); 1736 } 1737 1738 if (gs || hs) { 1739 tu6_emit_geom_tess_consts(cs, vs, hs, ds, gs, cps_per_patch); 1740 } 1741} 1742 1743static void 1744tu6_emit_vertex_input(struct tu_pipeline *pipeline, 1745 struct tu_cs *cs, 1746 const struct ir3_shader_variant *vs, 1747 const VkPipelineVertexInputStateCreateInfo *info) 1748{ 1749 uint32_t vfd_decode_idx = 0; 1750 uint32_t binding_instanced = 0; /* bitmask of instanced bindings */ 1751 uint32_t step_rate[MAX_VBS]; 1752 1753 for (uint32_t i = 0; i < info->vertexBindingDescriptionCount; i++) { 1754 const VkVertexInputBindingDescription *binding = 1755 &info->pVertexBindingDescriptions[i]; 1756 1757 if (!(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_VB_STRIDE))) { 1758 tu_cs_emit_regs(cs, 1759 A6XX_VFD_FETCH_STRIDE(binding->binding, binding->stride)); 1760 } 1761 1762 if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE) 1763 binding_instanced |= 1 << binding->binding; 1764 1765 step_rate[binding->binding] = 1; 1766 } 1767 1768 const VkPipelineVertexInputDivisorStateCreateInfoEXT *div_state = 1769 vk_find_struct_const(info->pNext, PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT); 1770 if (div_state) { 1771 for (uint32_t i = 0; i < div_state->vertexBindingDivisorCount; i++) { 1772 const VkVertexInputBindingDivisorDescriptionEXT *desc = 1773 &div_state->pVertexBindingDivisors[i]; 1774 step_rate[desc->binding] = desc->divisor; 1775 } 1776 } 1777 1778 /* TODO: emit all VFD_DECODE/VFD_DEST_CNTL in same (two) pkt4 */ 1779 1780 for (uint32_t i = 0; i < info->vertexAttributeDescriptionCount; i++) { 1781 const VkVertexInputAttributeDescription *attr = 1782 &info->pVertexAttributeDescriptions[i]; 1783 uint32_t input_idx; 1784 1785 for (input_idx = 0; input_idx < vs->inputs_count; input_idx++) { 1786 if ((vs->inputs[input_idx].slot - VERT_ATTRIB_GENERIC0) == attr->location) 1787 break; 1788 } 1789 1790 /* attribute not used, skip it */ 1791 if (input_idx == vs->inputs_count) 1792 continue; 1793 1794 const struct tu_native_format format = tu6_format_vtx(attr->format); 1795 tu_cs_emit_regs(cs, 1796 A6XX_VFD_DECODE_INSTR(vfd_decode_idx, 1797 .idx = attr->binding, 1798 .offset = attr->offset, 1799 .instanced = binding_instanced & (1 << attr->binding), 1800 .format = format.fmt, 1801 .swap = format.swap, 1802 .unk30 = 1, 1803 ._float = !vk_format_is_int(attr->format)), 1804 A6XX_VFD_DECODE_STEP_RATE(vfd_decode_idx, step_rate[attr->binding])); 1805 1806 tu_cs_emit_regs(cs, 1807 A6XX_VFD_DEST_CNTL_INSTR(vfd_decode_idx, 1808 .writemask = vs->inputs[input_idx].compmask, 1809 .regid = vs->inputs[input_idx].regid)); 1810 1811 vfd_decode_idx++; 1812 } 1813 1814 tu_cs_emit_regs(cs, 1815 A6XX_VFD_CONTROL_0( 1816 .fetch_cnt = vfd_decode_idx, /* decode_cnt for binning pass ? */ 1817 .decode_cnt = vfd_decode_idx)); 1818} 1819 1820void 1821tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewports, uint32_t num_viewport) 1822{ 1823 VkExtent2D guardband = {511, 511}; 1824 1825 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_VPORT_XOFFSET(0), num_viewport * 6); 1826 for (uint32_t i = 0; i < num_viewport; i++) { 1827 const VkViewport *viewport = &viewports[i]; 1828 float offsets[3]; 1829 float scales[3]; 1830 scales[0] = viewport->width / 2.0f; 1831 scales[1] = viewport->height / 2.0f; 1832 scales[2] = viewport->maxDepth - viewport->minDepth; 1833 offsets[0] = viewport->x + scales[0]; 1834 offsets[1] = viewport->y + scales[1]; 1835 offsets[2] = viewport->minDepth; 1836 for (uint32_t j = 0; j < 3; j++) { 1837 tu_cs_emit(cs, fui(offsets[j])); 1838 tu_cs_emit(cs, fui(scales[j])); 1839 } 1840 1841 guardband.width = 1842 MIN2(guardband.width, fd_calc_guardband(offsets[0], scales[0], false)); 1843 guardband.height = 1844 MIN2(guardband.height, fd_calc_guardband(offsets[1], scales[1], false)); 1845 } 1846 1847 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0), num_viewport * 2); 1848 for (uint32_t i = 0; i < num_viewport; i++) { 1849 const VkViewport *viewport = &viewports[i]; 1850 VkOffset2D min; 1851 VkOffset2D max; 1852 min.x = (int32_t) viewport->x; 1853 max.x = (int32_t) ceilf(viewport->x + viewport->width); 1854 if (viewport->height >= 0.0f) { 1855 min.y = (int32_t) viewport->y; 1856 max.y = (int32_t) ceilf(viewport->y + viewport->height); 1857 } else { 1858 min.y = (int32_t)(viewport->y + viewport->height); 1859 max.y = (int32_t) ceilf(viewport->y); 1860 } 1861 /* the spec allows viewport->height to be 0.0f */ 1862 if (min.y == max.y) 1863 max.y++; 1864 /* allow viewport->width = 0.0f for un-initialized viewports: */ 1865 if (min.x == max.x) 1866 max.x++; 1867 1868 min.x = MAX2(min.x, 0); 1869 min.y = MAX2(min.y, 0); 1870 1871 assert(min.x < max.x); 1872 assert(min.y < max.y); 1873 tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_X(min.x) | 1874 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_Y(min.y)); 1875 tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_X(max.x - 1) | 1876 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_Y(max.y - 1)); 1877 } 1878 1879 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_Z_CLAMP(0), num_viewport * 2); 1880 for (uint32_t i = 0; i < num_viewport; i++) { 1881 const VkViewport *viewport = &viewports[i]; 1882 tu_cs_emit(cs, fui(MIN2(viewport->minDepth, viewport->maxDepth))); 1883 tu_cs_emit(cs, fui(MAX2(viewport->minDepth, viewport->maxDepth))); 1884 } 1885 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ, 1); 1886 tu_cs_emit(cs, A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_HORZ(guardband.width) | 1887 A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_VERT(guardband.height)); 1888 1889 /* TODO: what to do about this and multi viewport ? */ 1890 float z_clamp_min = num_viewport ? MIN2(viewports[0].minDepth, viewports[0].maxDepth) : 0; 1891 float z_clamp_max = num_viewport ? MAX2(viewports[0].minDepth, viewports[0].maxDepth) : 0; 1892 1893 tu_cs_emit_regs(cs, 1894 A6XX_RB_Z_CLAMP_MIN(z_clamp_min), 1895 A6XX_RB_Z_CLAMP_MAX(z_clamp_max)); 1896} 1897 1898void 1899tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scissors, uint32_t scissor_count) 1900{ 1901 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0), scissor_count * 2); 1902 1903 for (uint32_t i = 0; i < scissor_count; i++) { 1904 const VkRect2D *scissor = &scissors[i]; 1905 1906 uint32_t min_x = scissor->offset.x; 1907 uint32_t min_y = scissor->offset.y; 1908 uint32_t max_x = min_x + scissor->extent.width - 1; 1909 uint32_t max_y = min_y + scissor->extent.height - 1; 1910 1911 if (!scissor->extent.width || !scissor->extent.height) { 1912 min_x = min_y = 1; 1913 max_x = max_y = 0; 1914 } else { 1915 /* avoid overflow */ 1916 uint32_t scissor_max = BITFIELD_MASK(15); 1917 min_x = MIN2(scissor_max, min_x); 1918 min_y = MIN2(scissor_max, min_y); 1919 max_x = MIN2(scissor_max, max_x); 1920 max_y = MIN2(scissor_max, max_y); 1921 } 1922 1923 tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_TL_X(min_x) | 1924 A6XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(min_y)); 1925 tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_BR_X(max_x) | 1926 A6XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(max_y)); 1927 } 1928} 1929 1930void 1931tu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT *samp_loc) 1932{ 1933 if (!samp_loc) { 1934 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CONFIG, 1); 1935 tu_cs_emit(cs, 0); 1936 1937 tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CONFIG, 1); 1938 tu_cs_emit(cs, 0); 1939 1940 tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_CONFIG, 1); 1941 tu_cs_emit(cs, 0); 1942 return; 1943 } 1944 1945 assert(samp_loc->sampleLocationsPerPixel == samp_loc->sampleLocationsCount); 1946 assert(samp_loc->sampleLocationGridSize.width == 1); 1947 assert(samp_loc->sampleLocationGridSize.height == 1); 1948 1949 uint32_t sample_config = 1950 A6XX_RB_SAMPLE_CONFIG_LOCATION_ENABLE; 1951 uint32_t sample_locations = 0; 1952 for (uint32_t i = 0; i < samp_loc->sampleLocationsCount; i++) { 1953 sample_locations |= 1954 (A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_X(samp_loc->pSampleLocations[i].x) | 1955 A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_Y(samp_loc->pSampleLocations[i].y)) << i*8; 1956 } 1957 1958 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CONFIG, 2); 1959 tu_cs_emit(cs, sample_config); 1960 tu_cs_emit(cs, sample_locations); 1961 1962 tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CONFIG, 2); 1963 tu_cs_emit(cs, sample_config); 1964 tu_cs_emit(cs, sample_locations); 1965 1966 tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_CONFIG, 2); 1967 tu_cs_emit(cs, sample_config); 1968 tu_cs_emit(cs, sample_locations); 1969} 1970 1971static uint32_t 1972tu6_gras_su_cntl(const VkPipelineRasterizationStateCreateInfo *rast_info, 1973 enum a5xx_line_mode line_mode, 1974 bool multiview) 1975{ 1976 uint32_t gras_su_cntl = 0; 1977 1978 if (rast_info->cullMode & VK_CULL_MODE_FRONT_BIT) 1979 gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_FRONT; 1980 if (rast_info->cullMode & VK_CULL_MODE_BACK_BIT) 1981 gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_BACK; 1982 1983 if (rast_info->frontFace == VK_FRONT_FACE_CLOCKWISE) 1984 gras_su_cntl |= A6XX_GRAS_SU_CNTL_FRONT_CW; 1985 1986 gras_su_cntl |= 1987 A6XX_GRAS_SU_CNTL_LINEHALFWIDTH(rast_info->lineWidth / 2.0f); 1988 1989 if (rast_info->depthBiasEnable) 1990 gras_su_cntl |= A6XX_GRAS_SU_CNTL_POLY_OFFSET; 1991 1992 gras_su_cntl |= A6XX_GRAS_SU_CNTL_LINE_MODE(line_mode); 1993 1994 if (multiview) { 1995 gras_su_cntl |= 1996 A6XX_GRAS_SU_CNTL_UNK17 | 1997 A6XX_GRAS_SU_CNTL_MULTIVIEW_ENABLE; 1998 } 1999 2000 return gras_su_cntl; 2001} 2002 2003void 2004tu6_emit_depth_bias(struct tu_cs *cs, 2005 float constant_factor, 2006 float clamp, 2007 float slope_factor) 2008{ 2009 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_POLY_OFFSET_SCALE, 3); 2010 tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_SCALE(slope_factor).value); 2011 tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET(constant_factor).value); 2012 tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET_CLAMP(clamp).value); 2013} 2014 2015static uint32_t 2016tu6_rb_mrt_blend_control(const VkPipelineColorBlendAttachmentState *att, 2017 bool has_alpha) 2018{ 2019 const enum a3xx_rb_blend_opcode color_op = tu6_blend_op(att->colorBlendOp); 2020 const enum adreno_rb_blend_factor src_color_factor = tu6_blend_factor( 2021 has_alpha ? att->srcColorBlendFactor 2022 : tu_blend_factor_no_dst_alpha(att->srcColorBlendFactor)); 2023 const enum adreno_rb_blend_factor dst_color_factor = tu6_blend_factor( 2024 has_alpha ? att->dstColorBlendFactor 2025 : tu_blend_factor_no_dst_alpha(att->dstColorBlendFactor)); 2026 const enum a3xx_rb_blend_opcode alpha_op = tu6_blend_op(att->alphaBlendOp); 2027 const enum adreno_rb_blend_factor src_alpha_factor = 2028 tu6_blend_factor(att->srcAlphaBlendFactor); 2029 const enum adreno_rb_blend_factor dst_alpha_factor = 2030 tu6_blend_factor(att->dstAlphaBlendFactor); 2031 2032 return A6XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(src_color_factor) | 2033 A6XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(color_op) | 2034 A6XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(dst_color_factor) | 2035 A6XX_RB_MRT_BLEND_CONTROL_ALPHA_SRC_FACTOR(src_alpha_factor) | 2036 A6XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE(alpha_op) | 2037 A6XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(dst_alpha_factor); 2038} 2039 2040static uint32_t 2041tu6_rb_mrt_control(const VkPipelineColorBlendAttachmentState *att, 2042 uint32_t rb_mrt_control_rop, 2043 bool has_alpha) 2044{ 2045 uint32_t rb_mrt_control = 2046 A6XX_RB_MRT_CONTROL_COMPONENT_ENABLE(att->colorWriteMask); 2047 2048 rb_mrt_control |= rb_mrt_control_rop; 2049 2050 if (att->blendEnable) { 2051 rb_mrt_control |= A6XX_RB_MRT_CONTROL_BLEND; 2052 2053 if (has_alpha) 2054 rb_mrt_control |= A6XX_RB_MRT_CONTROL_BLEND2; 2055 } 2056 2057 return rb_mrt_control; 2058} 2059 2060static void 2061tu6_emit_rb_mrt_controls(struct tu_cs *cs, 2062 const VkPipelineColorBlendStateCreateInfo *blend_info, 2063 const VkFormat attachment_formats[MAX_RTS], 2064 uint32_t *blend_enable_mask) 2065{ 2066 *blend_enable_mask = 0; 2067 2068 bool rop_reads_dst = false; 2069 uint32_t rb_mrt_control_rop = 0; 2070 if (blend_info->logicOpEnable) { 2071 rop_reads_dst = tu_logic_op_reads_dst(blend_info->logicOp); 2072 rb_mrt_control_rop = 2073 A6XX_RB_MRT_CONTROL_ROP_ENABLE | 2074 A6XX_RB_MRT_CONTROL_ROP_CODE(tu6_rop(blend_info->logicOp)); 2075 } 2076 2077 for (uint32_t i = 0; i < blend_info->attachmentCount; i++) { 2078 const VkPipelineColorBlendAttachmentState *att = 2079 &blend_info->pAttachments[i]; 2080 const VkFormat format = attachment_formats[i]; 2081 2082 uint32_t rb_mrt_control = 0; 2083 uint32_t rb_mrt_blend_control = 0; 2084 if (format != VK_FORMAT_UNDEFINED) { 2085 const bool has_alpha = vk_format_has_alpha(format); 2086 2087 rb_mrt_control = 2088 tu6_rb_mrt_control(att, rb_mrt_control_rop, has_alpha); 2089 rb_mrt_blend_control = tu6_rb_mrt_blend_control(att, has_alpha); 2090 2091 if (att->blendEnable || rop_reads_dst) 2092 *blend_enable_mask |= 1 << i; 2093 } 2094 2095 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_CONTROL(i), 2); 2096 tu_cs_emit(cs, rb_mrt_control); 2097 tu_cs_emit(cs, rb_mrt_blend_control); 2098 } 2099} 2100 2101static void 2102tu6_emit_blend_control(struct tu_cs *cs, 2103 uint32_t blend_enable_mask, 2104 bool dual_src_blend, 2105 const VkPipelineMultisampleStateCreateInfo *msaa_info) 2106{ 2107 const uint32_t sample_mask = 2108 msaa_info->pSampleMask ? (*msaa_info->pSampleMask & 0xffff) 2109 : ((1 << msaa_info->rasterizationSamples) - 1); 2110 2111 tu_cs_emit_regs(cs, 2112 A6XX_SP_BLEND_CNTL(.enable_blend = blend_enable_mask, 2113 .dual_color_in_enable = dual_src_blend, 2114 .alpha_to_coverage = msaa_info->alphaToCoverageEnable, 2115 .unk8 = true)); 2116 2117 /* set A6XX_RB_BLEND_CNTL_INDEPENDENT_BLEND only when enabled? */ 2118 tu_cs_emit_regs(cs, 2119 A6XX_RB_BLEND_CNTL(.enable_blend = blend_enable_mask, 2120 .independent_blend = true, 2121 .sample_mask = sample_mask, 2122 .dual_color_in_enable = dual_src_blend, 2123 .alpha_to_coverage = msaa_info->alphaToCoverageEnable, 2124 .alpha_to_one = msaa_info->alphaToOneEnable)); 2125} 2126 2127static uint32_t 2128calc_pvtmem_size(struct tu_device *dev, struct tu_pvtmem_config *config, 2129 uint32_t pvtmem_bytes) 2130{ 2131 uint32_t per_fiber_size = ALIGN(pvtmem_bytes, 512); 2132 uint32_t per_sp_size = 2133 ALIGN(per_fiber_size * dev->physical_device->info->a6xx.fibers_per_sp, 1 << 12); 2134 2135 if (config) { 2136 config->per_fiber_size = per_fiber_size; 2137 config->per_sp_size = per_sp_size; 2138 } 2139 2140 return dev->physical_device->info->num_sp_cores * per_sp_size; 2141} 2142 2143static VkResult 2144tu_setup_pvtmem(struct tu_device *dev, 2145 struct tu_pipeline *pipeline, 2146 struct tu_pvtmem_config *config, 2147 uint32_t pvtmem_bytes, bool per_wave) 2148{ 2149 if (!pvtmem_bytes) { 2150 memset(config, 0, sizeof(*config)); 2151 return VK_SUCCESS; 2152 } 2153 2154 uint32_t total_size = calc_pvtmem_size(dev, config, pvtmem_bytes); 2155 config->per_wave = per_wave; 2156 2157 VkResult result = 2158 tu_bo_init_new(dev, &pipeline->pvtmem_bo, total_size, 2159 TU_BO_ALLOC_NO_FLAGS); 2160 if (result != VK_SUCCESS) 2161 return result; 2162 2163 config->iova = pipeline->pvtmem_bo.iova; 2164 2165 return result; 2166} 2167 2168 2169static VkResult 2170tu_pipeline_allocate_cs(struct tu_device *dev, 2171 struct tu_pipeline *pipeline, 2172 struct tu_pipeline_builder *builder, 2173 struct ir3_shader_variant *compute) 2174{ 2175 uint32_t size = 2048 + tu6_load_state_size(pipeline, compute); 2176 2177 /* graphics case: */ 2178 if (builder) { 2179 uint32_t pvtmem_bytes = 0; 2180 for (uint32_t i = 0; i < ARRAY_SIZE(builder->variants); i++) { 2181 if (builder->variants[i]) { 2182 size += builder->variants[i]->info.size / 4; 2183 pvtmem_bytes = MAX2(pvtmem_bytes, builder->variants[i]->pvtmem_size); 2184 } 2185 } 2186 2187 size += builder->binning_variant->info.size / 4; 2188 pvtmem_bytes = MAX2(pvtmem_bytes, builder->binning_variant->pvtmem_size); 2189 2190 size += calc_pvtmem_size(dev, NULL, pvtmem_bytes) / 4; 2191 2192 builder->additional_cs_reserve_size = 0; 2193 for (unsigned i = 0; i < ARRAY_SIZE(builder->variants); i++) { 2194 struct ir3_shader_variant *variant = builder->variants[i]; 2195 if (variant) { 2196 builder->additional_cs_reserve_size += 2197 tu_xs_get_additional_cs_size_dwords(variant); 2198 2199 if (variant->binning) { 2200 builder->additional_cs_reserve_size += 2201 tu_xs_get_additional_cs_size_dwords(variant->binning); 2202 } 2203 } 2204 } 2205 2206 size += builder->additional_cs_reserve_size; 2207 } else { 2208 size += compute->info.size / 4; 2209 size += calc_pvtmem_size(dev, NULL, compute->pvtmem_size) / 4; 2210 2211 size += tu_xs_get_additional_cs_size_dwords(compute); 2212 } 2213 2214 tu_cs_init(&pipeline->cs, dev, TU_CS_MODE_SUB_STREAM, size); 2215 2216 /* Reserve the space now such that tu_cs_begin_sub_stream never fails. Note 2217 * that LOAD_STATE can potentially take up a large amount of space so we 2218 * calculate its size explicitly. 2219 */ 2220 return tu_cs_reserve_space(&pipeline->cs, size); 2221} 2222 2223static void 2224tu_pipeline_shader_key_init(struct ir3_shader_key *key, 2225 const struct tu_pipeline *pipeline, 2226 const VkGraphicsPipelineCreateInfo *pipeline_info) 2227{ 2228 for (uint32_t i = 0; i < pipeline_info->stageCount; i++) { 2229 if (pipeline_info->pStages[i].stage == VK_SHADER_STAGE_GEOMETRY_BIT) { 2230 key->has_gs = true; 2231 break; 2232 } 2233 } 2234 2235 if (pipeline_info->pRasterizationState->rasterizerDiscardEnable && 2236 !(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_RASTERIZER_DISCARD))) 2237 return; 2238 2239 const VkPipelineMultisampleStateCreateInfo *msaa_info = pipeline_info->pMultisampleState; 2240 const struct VkPipelineSampleLocationsStateCreateInfoEXT *sample_locations = 2241 vk_find_struct_const(msaa_info->pNext, PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT); 2242 if (msaa_info->rasterizationSamples > 1 || 2243 /* also set msaa key when sample location is not the default 2244 * since this affects varying interpolation */ 2245 (sample_locations && sample_locations->sampleLocationsEnable)) { 2246 key->msaa = true; 2247 } 2248 2249 /* note: not actually used by ir3, just checked in tu6_emit_fs_inputs */ 2250 if (msaa_info->sampleShadingEnable) 2251 key->sample_shading = true; 2252 2253 /* We set this after we compile to NIR because we need the prim mode */ 2254 key->tessellation = IR3_TESS_NONE; 2255} 2256 2257static uint32_t 2258tu6_get_tessmode(struct tu_shader* shader) 2259{ 2260 uint32_t primitive_mode = shader->ir3_shader->nir->info.tess.primitive_mode; 2261 switch (primitive_mode) { 2262 case GL_ISOLINES: 2263 return IR3_TESS_ISOLINES; 2264 case GL_TRIANGLES: 2265 return IR3_TESS_TRIANGLES; 2266 case GL_QUADS: 2267 return IR3_TESS_QUADS; 2268 case GL_NONE: 2269 return IR3_TESS_NONE; 2270 default: 2271 unreachable("bad tessmode"); 2272 } 2273} 2274 2275static uint64_t 2276tu_upload_variant(struct tu_pipeline *pipeline, 2277 const struct ir3_shader_variant *variant) 2278{ 2279 struct tu_cs_memory memory; 2280 2281 if (!variant) 2282 return 0; 2283 2284 /* this expects to get enough alignment because shaders are allocated first 2285 * and total size is always aligned correctly 2286 * note: an assert in tu6_emit_xs_config validates the alignment 2287 */ 2288 tu_cs_alloc(&pipeline->cs, variant->info.size / 4, 1, &memory); 2289 2290 memcpy(memory.map, variant->bin, variant->info.size); 2291 return memory.iova; 2292} 2293 2294static void 2295tu_append_executable(struct tu_pipeline *pipeline, struct ir3_shader_variant *variant, 2296 char *nir_from_spirv) 2297{ 2298 ralloc_steal(pipeline->executables_mem_ctx, variant->disasm_info.nir); 2299 ralloc_steal(pipeline->executables_mem_ctx, variant->disasm_info.disasm); 2300 2301 struct tu_pipeline_executable exe = { 2302 .stage = variant->shader->type, 2303 .nir_from_spirv = nir_from_spirv, 2304 .nir_final = variant->disasm_info.nir, 2305 .disasm = variant->disasm_info.disasm, 2306 .stats = variant->info, 2307 .is_binning = variant->binning_pass, 2308 }; 2309 2310 util_dynarray_append(&pipeline->executables, struct tu_pipeline_executable, exe); 2311} 2312 2313static VkResult 2314tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder, 2315 struct tu_pipeline *pipeline) 2316{ 2317 const struct ir3_compiler *compiler = builder->device->compiler; 2318 const VkPipelineShaderStageCreateInfo *stage_infos[MESA_SHADER_STAGES] = { 2319 NULL 2320 }; 2321 for (uint32_t i = 0; i < builder->create_info->stageCount; i++) { 2322 gl_shader_stage stage = 2323 vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage); 2324 stage_infos[stage] = &builder->create_info->pStages[i]; 2325 } 2326 2327 struct ir3_shader_key key = {}; 2328 tu_pipeline_shader_key_init(&key, pipeline, builder->create_info); 2329 2330 nir_shader *nir[ARRAY_SIZE(builder->shaders)] = { NULL }; 2331 2332 for (gl_shader_stage stage = MESA_SHADER_VERTEX; 2333 stage < ARRAY_SIZE(nir); stage++) { 2334 const VkPipelineShaderStageCreateInfo *stage_info = stage_infos[stage]; 2335 if (!stage_info) 2336 continue; 2337 2338 nir[stage] = tu_spirv_to_nir(builder->device, stage_info, stage); 2339 if (!nir[stage]) 2340 return VK_ERROR_OUT_OF_HOST_MEMORY; 2341 } 2342 2343 if (!nir[MESA_SHADER_FRAGMENT]) { 2344 const nir_shader_compiler_options *nir_options = 2345 ir3_get_compiler_options(builder->device->compiler); 2346 nir_builder fs_b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, 2347 nir_options, 2348 "noop_fs"); 2349 nir[MESA_SHADER_FRAGMENT] = fs_b.shader; 2350 } 2351 2352 const bool executable_info = builder->create_info->flags & 2353 VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR; 2354 2355 char *nir_initial_disasm[ARRAY_SIZE(builder->shaders)] = { NULL }; 2356 2357 if (executable_info) { 2358 for (gl_shader_stage stage = MESA_SHADER_VERTEX; 2359 stage < ARRAY_SIZE(nir); stage++) { 2360 if (!nir[stage]) 2361 continue; 2362 2363 nir_initial_disasm[stage] = 2364 nir_shader_as_str(nir[stage], pipeline->executables_mem_ctx); 2365 } 2366 } 2367 2368 /* TODO do intra-stage linking here */ 2369 2370 uint32_t desc_sets = 0; 2371 for (gl_shader_stage stage = MESA_SHADER_VERTEX; 2372 stage < ARRAY_SIZE(nir); stage++) { 2373 if (!nir[stage]) 2374 continue; 2375 2376 struct tu_shader *shader = 2377 tu_shader_create(builder->device, nir[stage], 2378 builder->multiview_mask, builder->layout, 2379 builder->alloc); 2380 if (!shader) 2381 return VK_ERROR_OUT_OF_HOST_MEMORY; 2382 2383 /* In SPIR-V generated from GLSL, the primitive mode is specified in the 2384 * tessellation evaluation shader, but in SPIR-V generated from HLSL, 2385 * the mode is specified in the tessellation control shader. */ 2386 if ((stage == MESA_SHADER_TESS_EVAL || stage == MESA_SHADER_TESS_CTRL) && 2387 key.tessellation == IR3_TESS_NONE) { 2388 key.tessellation = tu6_get_tessmode(shader); 2389 } 2390 2391 if (stage > MESA_SHADER_TESS_CTRL) { 2392 if (stage == MESA_SHADER_FRAGMENT) { 2393 key.tcs_store_primid = key.tcs_store_primid || 2394 (nir[stage]->info.inputs_read & (1ull << VARYING_SLOT_PRIMITIVE_ID)); 2395 } else { 2396 key.tcs_store_primid = key.tcs_store_primid || 2397 BITSET_TEST(nir[stage]->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID); 2398 } 2399 } 2400 2401 /* Keep track of the status of each shader's active descriptor sets, 2402 * which is set in tu_lower_io. */ 2403 desc_sets |= shader->active_desc_sets; 2404 2405 builder->shaders[stage] = shader; 2406 } 2407 pipeline->active_desc_sets = desc_sets; 2408 2409 struct tu_shader *last_shader = builder->shaders[MESA_SHADER_GEOMETRY]; 2410 if (!last_shader) 2411 last_shader = builder->shaders[MESA_SHADER_TESS_EVAL]; 2412 if (!last_shader) 2413 last_shader = builder->shaders[MESA_SHADER_VERTEX]; 2414 2415 uint64_t outputs_written = last_shader->ir3_shader->nir->info.outputs_written; 2416 2417 key.layer_zero = !(outputs_written & VARYING_BIT_LAYER); 2418 key.view_zero = !(outputs_written & VARYING_BIT_VIEWPORT); 2419 2420 pipeline->tess.patch_type = key.tessellation; 2421 2422 for (gl_shader_stage stage = MESA_SHADER_VERTEX; 2423 stage < ARRAY_SIZE(builder->shaders); stage++) { 2424 if (!builder->shaders[stage]) 2425 continue; 2426 2427 bool created; 2428 builder->variants[stage] = 2429 ir3_shader_get_variant(builder->shaders[stage]->ir3_shader, 2430 &key, false, executable_info, &created); 2431 if (!builder->variants[stage]) 2432 return VK_ERROR_OUT_OF_HOST_MEMORY; 2433 } 2434 2435 uint32_t safe_constlens = ir3_trim_constlen(builder->variants, compiler); 2436 2437 key.safe_constlen = true; 2438 2439 for (gl_shader_stage stage = MESA_SHADER_VERTEX; 2440 stage < ARRAY_SIZE(builder->shaders); stage++) { 2441 if (!builder->shaders[stage]) 2442 continue; 2443 2444 if (safe_constlens & (1 << stage)) { 2445 bool created; 2446 builder->variants[stage] = 2447 ir3_shader_get_variant(builder->shaders[stage]->ir3_shader, 2448 &key, false, executable_info, &created); 2449 if (!builder->variants[stage]) 2450 return VK_ERROR_OUT_OF_HOST_MEMORY; 2451 } 2452 } 2453 2454 const struct tu_shader *vs = builder->shaders[MESA_SHADER_VERTEX]; 2455 struct ir3_shader_variant *variant; 2456 2457 if (vs->ir3_shader->stream_output.num_outputs || 2458 !ir3_has_binning_vs(&key)) { 2459 variant = builder->variants[MESA_SHADER_VERTEX]; 2460 } else { 2461 bool created; 2462 key.safe_constlen = !!(safe_constlens & (1 << MESA_SHADER_VERTEX)); 2463 variant = ir3_shader_get_variant(vs->ir3_shader, &key, 2464 true, executable_info, &created); 2465 if (!variant) 2466 return VK_ERROR_OUT_OF_HOST_MEMORY; 2467 } 2468 2469 builder->binning_variant = variant; 2470 2471 for (gl_shader_stage stage = MESA_SHADER_VERTEX; 2472 stage < ARRAY_SIZE(nir); stage++) { 2473 if (builder->variants[stage]) { 2474 tu_append_executable(pipeline, builder->variants[stage], 2475 nir_initial_disasm[stage]); 2476 } 2477 } 2478 2479 if (builder->binning_variant != builder->variants[MESA_SHADER_VERTEX]) { 2480 tu_append_executable(pipeline, builder->binning_variant, NULL); 2481 } 2482 2483 return VK_SUCCESS; 2484} 2485 2486static void 2487tu_pipeline_builder_parse_dynamic(struct tu_pipeline_builder *builder, 2488 struct tu_pipeline *pipeline) 2489{ 2490 const VkPipelineDynamicStateCreateInfo *dynamic_info = 2491 builder->create_info->pDynamicState; 2492 2493 pipeline->gras_su_cntl_mask = ~0u; 2494 pipeline->rb_depth_cntl_mask = ~0u; 2495 pipeline->rb_stencil_cntl_mask = ~0u; 2496 pipeline->pc_raster_cntl_mask = ~0u; 2497 pipeline->vpc_unknown_9107_mask = ~0u; 2498 2499 if (!dynamic_info) 2500 return; 2501 2502 for (uint32_t i = 0; i < dynamic_info->dynamicStateCount; i++) { 2503 VkDynamicState state = dynamic_info->pDynamicStates[i]; 2504 switch (state) { 2505 case VK_DYNAMIC_STATE_VIEWPORT ... VK_DYNAMIC_STATE_STENCIL_REFERENCE: 2506 if (state == VK_DYNAMIC_STATE_LINE_WIDTH) 2507 pipeline->gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_LINEHALFWIDTH__MASK; 2508 pipeline->dynamic_state_mask |= BIT(state); 2509 break; 2510 case VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT: 2511 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_SAMPLE_LOCATIONS); 2512 break; 2513 case VK_DYNAMIC_STATE_CULL_MODE_EXT: 2514 pipeline->gras_su_cntl_mask &= 2515 ~(A6XX_GRAS_SU_CNTL_CULL_BACK | A6XX_GRAS_SU_CNTL_CULL_FRONT); 2516 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL); 2517 break; 2518 case VK_DYNAMIC_STATE_FRONT_FACE_EXT: 2519 pipeline->gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_FRONT_CW; 2520 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL); 2521 break; 2522 case VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY_EXT: 2523 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY); 2524 break; 2525 case VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT: 2526 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_VB_STRIDE); 2527 break; 2528 case VK_DYNAMIC_STATE_VIEWPORT_WITH_COUNT_EXT: 2529 pipeline->dynamic_state_mask |= BIT(VK_DYNAMIC_STATE_VIEWPORT); 2530 break; 2531 case VK_DYNAMIC_STATE_SCISSOR_WITH_COUNT_EXT: 2532 pipeline->dynamic_state_mask |= BIT(VK_DYNAMIC_STATE_SCISSOR); 2533 break; 2534 case VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE_EXT: 2535 pipeline->rb_depth_cntl_mask &= 2536 ~(A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE); 2537 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL); 2538 break; 2539 case VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE_EXT: 2540 pipeline->rb_depth_cntl_mask &= ~A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE; 2541 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL); 2542 break; 2543 case VK_DYNAMIC_STATE_DEPTH_COMPARE_OP_EXT: 2544 pipeline->rb_depth_cntl_mask &= ~A6XX_RB_DEPTH_CNTL_ZFUNC__MASK; 2545 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL); 2546 break; 2547 case VK_DYNAMIC_STATE_DEPTH_BOUNDS_TEST_ENABLE_EXT: 2548 pipeline->rb_depth_cntl_mask &= 2549 ~(A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE); 2550 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL); 2551 break; 2552 case VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE_EXT: 2553 pipeline->rb_stencil_cntl_mask &= ~(A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE | 2554 A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF | 2555 A6XX_RB_STENCIL_CONTROL_STENCIL_READ); 2556 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_STENCIL_CNTL); 2557 break; 2558 case VK_DYNAMIC_STATE_STENCIL_OP_EXT: 2559 pipeline->rb_stencil_cntl_mask &= ~(A6XX_RB_STENCIL_CONTROL_FUNC__MASK | 2560 A6XX_RB_STENCIL_CONTROL_FAIL__MASK | 2561 A6XX_RB_STENCIL_CONTROL_ZPASS__MASK | 2562 A6XX_RB_STENCIL_CONTROL_ZFAIL__MASK | 2563 A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK | 2564 A6XX_RB_STENCIL_CONTROL_FAIL_BF__MASK | 2565 A6XX_RB_STENCIL_CONTROL_ZPASS_BF__MASK | 2566 A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__MASK); 2567 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_STENCIL_CNTL); 2568 break; 2569 case VK_DYNAMIC_STATE_DEPTH_BIAS_ENABLE_EXT: 2570 pipeline->gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_POLY_OFFSET; 2571 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL); 2572 break; 2573 case VK_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE_EXT: 2574 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE); 2575 break; 2576 case VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT: 2577 pipeline->pc_raster_cntl_mask &= ~A6XX_PC_RASTER_CNTL_DISCARD; 2578 pipeline->vpc_unknown_9107_mask &= ~A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD; 2579 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RASTERIZER_DISCARD); 2580 break; 2581 default: 2582 assert(!"unsupported dynamic state"); 2583 break; 2584 } 2585 } 2586} 2587 2588static void 2589tu_pipeline_set_linkage(struct tu_program_descriptor_linkage *link, 2590 struct tu_shader *shader, 2591 struct ir3_shader_variant *v) 2592{ 2593 link->const_state = *ir3_const_state(v); 2594 link->constlen = v->constlen; 2595 link->push_consts = shader->push_consts; 2596} 2597 2598static void 2599tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder, 2600 struct tu_pipeline *pipeline) 2601{ 2602 struct tu_cs prog_cs; 2603 2604 /* Emit HLSQ_xS_CNTL/HLSQ_SP_xS_CONFIG *first*, before emitting anything 2605 * else that could depend on that state (like push constants) 2606 * 2607 * Note also that this always uses the full VS even in binning pass. The 2608 * binning pass variant has the same const layout as the full VS, and 2609 * the constlen for the VS will be the same or greater than the constlen 2610 * for the binning pass variant. It is required that the constlen state 2611 * matches between binning and draw passes, as some parts of the push 2612 * consts are emitted in state groups that are shared between the binning 2613 * and draw passes. 2614 */ 2615 tu_cs_begin_sub_stream(&pipeline->cs, 512, &prog_cs); 2616 tu6_emit_program_config(&prog_cs, builder); 2617 pipeline->program.config_state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs); 2618 2619 tu_cs_begin_sub_stream(&pipeline->cs, 512 + builder->additional_cs_reserve_size, &prog_cs); 2620 tu6_emit_program(&prog_cs, builder, false, pipeline); 2621 pipeline->program.state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs); 2622 2623 tu_cs_begin_sub_stream(&pipeline->cs, 512 + builder->additional_cs_reserve_size, &prog_cs); 2624 tu6_emit_program(&prog_cs, builder, true, pipeline); 2625 pipeline->program.binning_state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs); 2626 2627 VkShaderStageFlags stages = 0; 2628 for (unsigned i = 0; i < builder->create_info->stageCount; i++) { 2629 stages |= builder->create_info->pStages[i].stage; 2630 } 2631 pipeline->active_stages = stages; 2632 2633 for (unsigned i = 0; i < ARRAY_SIZE(builder->shaders); i++) { 2634 if (!builder->shaders[i]) 2635 continue; 2636 2637 tu_pipeline_set_linkage(&pipeline->program.link[i], 2638 builder->shaders[i], 2639 builder->variants[i]); 2640 } 2641} 2642 2643static void 2644tu_pipeline_builder_parse_vertex_input(struct tu_pipeline_builder *builder, 2645 struct tu_pipeline *pipeline) 2646{ 2647 const VkPipelineVertexInputStateCreateInfo *vi_info = 2648 builder->create_info->pVertexInputState; 2649 const struct ir3_shader_variant *vs = builder->variants[MESA_SHADER_VERTEX]; 2650 const struct ir3_shader_variant *bs = builder->binning_variant; 2651 2652 /* Bindings may contain holes */ 2653 for (unsigned i = 0; i < vi_info->vertexBindingDescriptionCount; i++) { 2654 pipeline->num_vbs = 2655 MAX2(pipeline->num_vbs, vi_info->pVertexBindingDescriptions[i].binding + 1); 2656 } 2657 2658 struct tu_cs vi_cs; 2659 tu_cs_begin_sub_stream(&pipeline->cs, 2660 MAX_VERTEX_ATTRIBS * 7 + 2, &vi_cs); 2661 tu6_emit_vertex_input(pipeline, &vi_cs, vs, vi_info); 2662 pipeline->vi.state = tu_cs_end_draw_state(&pipeline->cs, &vi_cs); 2663 2664 if (bs) { 2665 tu_cs_begin_sub_stream(&pipeline->cs, 2666 MAX_VERTEX_ATTRIBS * 7 + 2, &vi_cs); 2667 tu6_emit_vertex_input(pipeline, &vi_cs, bs, vi_info); 2668 pipeline->vi.binning_state = 2669 tu_cs_end_draw_state(&pipeline->cs, &vi_cs); 2670 } 2671} 2672 2673static void 2674tu_pipeline_builder_parse_input_assembly(struct tu_pipeline_builder *builder, 2675 struct tu_pipeline *pipeline) 2676{ 2677 const VkPipelineInputAssemblyStateCreateInfo *ia_info = 2678 builder->create_info->pInputAssemblyState; 2679 2680 pipeline->ia.primtype = tu6_primtype(ia_info->topology); 2681 pipeline->ia.primitive_restart = ia_info->primitiveRestartEnable; 2682} 2683 2684static bool 2685tu_pipeline_static_state(struct tu_pipeline *pipeline, struct tu_cs *cs, 2686 uint32_t id, uint32_t size) 2687{ 2688 assert(id < ARRAY_SIZE(pipeline->dynamic_state)); 2689 2690 if (pipeline->dynamic_state_mask & BIT(id)) 2691 return false; 2692 2693 pipeline->dynamic_state[id] = tu_cs_draw_state(&pipeline->cs, cs, size); 2694 return true; 2695} 2696 2697static void 2698tu_pipeline_builder_parse_tessellation(struct tu_pipeline_builder *builder, 2699 struct tu_pipeline *pipeline) 2700{ 2701 if (!(pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) || 2702 !(pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT)) 2703 return; 2704 2705 const VkPipelineTessellationStateCreateInfo *tess_info = 2706 builder->create_info->pTessellationState; 2707 2708 assert(pipeline->ia.primtype == DI_PT_PATCHES0); 2709 assert(tess_info->patchControlPoints <= 32); 2710 pipeline->ia.primtype += tess_info->patchControlPoints; 2711 const VkPipelineTessellationDomainOriginStateCreateInfo *domain_info = 2712 vk_find_struct_const(tess_info->pNext, PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO); 2713 pipeline->tess.upper_left_domain_origin = !domain_info || 2714 domain_info->domainOrigin == VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT; 2715 const struct ir3_shader_variant *hs = builder->variants[MESA_SHADER_TESS_CTRL]; 2716 const struct ir3_shader_variant *ds = builder->variants[MESA_SHADER_TESS_EVAL]; 2717 pipeline->tess.param_stride = hs->output_size * 4; 2718 pipeline->tess.hs_bo_regid = hs->const_state->offsets.primitive_param + 1; 2719 pipeline->tess.ds_bo_regid = ds->const_state->offsets.primitive_param + 1; 2720} 2721 2722static void 2723tu_pipeline_builder_parse_viewport(struct tu_pipeline_builder *builder, 2724 struct tu_pipeline *pipeline) 2725{ 2726 /* The spec says: 2727 * 2728 * pViewportState is a pointer to an instance of the 2729 * VkPipelineViewportStateCreateInfo structure, and is ignored if the 2730 * pipeline has rasterization disabled." 2731 * 2732 * We leave the relevant registers stale in that case. 2733 */ 2734 if (builder->rasterizer_discard) 2735 return; 2736 2737 const VkPipelineViewportStateCreateInfo *vp_info = 2738 builder->create_info->pViewportState; 2739 2740 struct tu_cs cs; 2741 2742 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_VIEWPORT, 8 + 10 * vp_info->viewportCount)) 2743 tu6_emit_viewport(&cs, vp_info->pViewports, vp_info->viewportCount); 2744 2745 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_SCISSOR, 1 + 2 * vp_info->scissorCount)) 2746 tu6_emit_scissor(&cs, vp_info->pScissors, vp_info->scissorCount); 2747} 2748 2749static void 2750tu_pipeline_builder_parse_rasterization(struct tu_pipeline_builder *builder, 2751 struct tu_pipeline *pipeline) 2752{ 2753 const VkPipelineRasterizationStateCreateInfo *rast_info = 2754 builder->create_info->pRasterizationState; 2755 2756 enum a6xx_polygon_mode mode = tu6_polygon_mode(rast_info->polygonMode); 2757 2758 bool depth_clip_disable = rast_info->depthClampEnable; 2759 2760 const VkPipelineRasterizationDepthClipStateCreateInfoEXT *depth_clip_state = 2761 vk_find_struct_const(rast_info, PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT); 2762 if (depth_clip_state) 2763 depth_clip_disable = !depth_clip_state->depthClipEnable; 2764 2765 pipeline->line_mode = RECTANGULAR; 2766 2767 if (tu6_primtype_line(pipeline->ia.primtype)) { 2768 const VkPipelineRasterizationLineStateCreateInfoEXT *rast_line_state = 2769 vk_find_struct_const(rast_info->pNext, 2770 PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT); 2771 2772 if (rast_line_state && rast_line_state->lineRasterizationMode == 2773 VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT) { 2774 pipeline->line_mode = BRESENHAM; 2775 } 2776 } 2777 2778 struct tu_cs cs; 2779 uint32_t cs_size = 9 + 2780 (builder->device->physical_device->info->a6xx.has_shading_rate ? 8 : 0) + 2781 (builder->emit_msaa_state ? 11 : 0); 2782 pipeline->rast_state = tu_cs_draw_state(&pipeline->cs, &cs, cs_size); 2783 2784 tu_cs_emit_regs(&cs, 2785 A6XX_GRAS_CL_CNTL( 2786 .znear_clip_disable = depth_clip_disable, 2787 .zfar_clip_disable = depth_clip_disable, 2788 /* TODO should this be depth_clip_disable instead? */ 2789 .unk5 = rast_info->depthClampEnable, 2790 .zero_gb_scale_z = 1, 2791 .vp_clip_code_ignore = 1)); 2792 2793 tu_cs_emit_regs(&cs, 2794 A6XX_VPC_POLYGON_MODE(mode)); 2795 2796 tu_cs_emit_regs(&cs, 2797 A6XX_PC_POLYGON_MODE(mode)); 2798 2799 /* move to hw ctx init? */ 2800 tu_cs_emit_regs(&cs, 2801 A6XX_GRAS_SU_POINT_MINMAX(.min = 1.0f / 16.0f, .max = 4092.0f), 2802 A6XX_GRAS_SU_POINT_SIZE(1.0f)); 2803 2804 if (builder->device->physical_device->info->a6xx.has_shading_rate) { 2805 tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A00()); 2806 tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A10()); 2807 tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A20()); 2808 tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A30()); 2809 } 2810 2811 /* If samples count couldn't be devised from the subpass, we should emit it here. 2812 * It happens when subpass doesn't use any color/depth attachment. 2813 */ 2814 if (builder->emit_msaa_state) 2815 tu6_emit_msaa(&cs, builder->samples, pipeline->line_mode); 2816 2817 const VkPipelineRasterizationStateStreamCreateInfoEXT *stream_info = 2818 vk_find_struct_const(rast_info->pNext, 2819 PIPELINE_RASTERIZATION_STATE_STREAM_CREATE_INFO_EXT); 2820 unsigned stream = stream_info ? stream_info->rasterizationStream : 0; 2821 2822 pipeline->pc_raster_cntl = A6XX_PC_RASTER_CNTL_STREAM(stream); 2823 pipeline->vpc_unknown_9107 = 0; 2824 if (rast_info->rasterizerDiscardEnable) { 2825 pipeline->pc_raster_cntl |= A6XX_PC_RASTER_CNTL_DISCARD; 2826 pipeline->vpc_unknown_9107 |= A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD; 2827 } 2828 2829 if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RASTERIZER_DISCARD, 4)) { 2830 tu_cs_emit_regs(&cs, A6XX_PC_RASTER_CNTL(.dword = pipeline->pc_raster_cntl)); 2831 tu_cs_emit_regs(&cs, A6XX_VPC_UNKNOWN_9107(.dword = pipeline->vpc_unknown_9107)); 2832 } 2833 2834 pipeline->gras_su_cntl = 2835 tu6_gras_su_cntl(rast_info, pipeline->line_mode, builder->multiview_mask != 0); 2836 2837 if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_GRAS_SU_CNTL, 2)) 2838 tu_cs_emit_regs(&cs, A6XX_GRAS_SU_CNTL(.dword = pipeline->gras_su_cntl)); 2839 2840 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_DEPTH_BIAS, 4)) { 2841 tu6_emit_depth_bias(&cs, rast_info->depthBiasConstantFactor, 2842 rast_info->depthBiasClamp, 2843 rast_info->depthBiasSlopeFactor); 2844 } 2845 2846 const struct VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *provoking_vtx_state = 2847 vk_find_struct_const(rast_info->pNext, PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT); 2848 pipeline->provoking_vertex_last = provoking_vtx_state && 2849 provoking_vtx_state->provokingVertexMode == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT; 2850} 2851 2852static void 2853tu_pipeline_builder_parse_depth_stencil(struct tu_pipeline_builder *builder, 2854 struct tu_pipeline *pipeline) 2855{ 2856 /* The spec says: 2857 * 2858 * pDepthStencilState is a pointer to an instance of the 2859 * VkPipelineDepthStencilStateCreateInfo structure, and is ignored if 2860 * the pipeline has rasterization disabled or if the subpass of the 2861 * render pass the pipeline is created against does not use a 2862 * depth/stencil attachment. 2863 */ 2864 const VkPipelineDepthStencilStateCreateInfo *ds_info = 2865 builder->create_info->pDepthStencilState; 2866 const VkPipelineRasterizationStateCreateInfo *rast_info = 2867 builder->create_info->pRasterizationState; 2868 uint32_t rb_depth_cntl = 0, rb_stencil_cntl = 0; 2869 struct tu_cs cs; 2870 2871 if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED && 2872 builder->depth_attachment_format != VK_FORMAT_S8_UINT) { 2873 if (ds_info->depthTestEnable) { 2874 rb_depth_cntl |= 2875 A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE | 2876 A6XX_RB_DEPTH_CNTL_ZFUNC(tu6_compare_func(ds_info->depthCompareOp)) | 2877 A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE; /* TODO: don't set for ALWAYS/NEVER */ 2878 2879 if (rast_info->depthClampEnable) 2880 rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_CLAMP_ENABLE; 2881 2882 if (ds_info->depthWriteEnable) 2883 rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE; 2884 } 2885 2886 if (ds_info->depthBoundsTestEnable) 2887 rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE; 2888 2889 if (ds_info->depthBoundsTestEnable && !ds_info->depthTestEnable) 2890 tu6_apply_depth_bounds_workaround(builder->device, &rb_depth_cntl); 2891 } else { 2892 /* if RB_DEPTH_CNTL is set dynamically, we need to make sure it is set 2893 * to 0 when this pipeline is used, as enabling depth test when there 2894 * is no depth attachment is a problem (at least for the S8_UINT case) 2895 */ 2896 if (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL)) 2897 pipeline->rb_depth_cntl_disable = true; 2898 } 2899 2900 if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED) { 2901 const VkStencilOpState *front = &ds_info->front; 2902 const VkStencilOpState *back = &ds_info->back; 2903 2904 rb_stencil_cntl |= 2905 A6XX_RB_STENCIL_CONTROL_FUNC(tu6_compare_func(front->compareOp)) | 2906 A6XX_RB_STENCIL_CONTROL_FAIL(tu6_stencil_op(front->failOp)) | 2907 A6XX_RB_STENCIL_CONTROL_ZPASS(tu6_stencil_op(front->passOp)) | 2908 A6XX_RB_STENCIL_CONTROL_ZFAIL(tu6_stencil_op(front->depthFailOp)) | 2909 A6XX_RB_STENCIL_CONTROL_FUNC_BF(tu6_compare_func(back->compareOp)) | 2910 A6XX_RB_STENCIL_CONTROL_FAIL_BF(tu6_stencil_op(back->failOp)) | 2911 A6XX_RB_STENCIL_CONTROL_ZPASS_BF(tu6_stencil_op(back->passOp)) | 2912 A6XX_RB_STENCIL_CONTROL_ZFAIL_BF(tu6_stencil_op(back->depthFailOp)); 2913 2914 if (ds_info->stencilTestEnable) { 2915 rb_stencil_cntl |= 2916 A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE | 2917 A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF | 2918 A6XX_RB_STENCIL_CONTROL_STENCIL_READ; 2919 } 2920 } 2921 2922 if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RB_DEPTH_CNTL, 2)) { 2923 tu_cs_emit_pkt4(&cs, REG_A6XX_RB_DEPTH_CNTL, 1); 2924 tu_cs_emit(&cs, rb_depth_cntl); 2925 } 2926 pipeline->rb_depth_cntl = rb_depth_cntl; 2927 2928 if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RB_STENCIL_CNTL, 2)) { 2929 tu_cs_emit_pkt4(&cs, REG_A6XX_RB_STENCIL_CONTROL, 1); 2930 tu_cs_emit(&cs, rb_stencil_cntl); 2931 } 2932 pipeline->rb_stencil_cntl = rb_stencil_cntl; 2933 2934 /* the remaining draw states arent used if there is no d/s, leave them empty */ 2935 if (builder->depth_attachment_format == VK_FORMAT_UNDEFINED) 2936 return; 2937 2938 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_DEPTH_BOUNDS, 3)) { 2939 tu_cs_emit_regs(&cs, 2940 A6XX_RB_Z_BOUNDS_MIN(ds_info->minDepthBounds), 2941 A6XX_RB_Z_BOUNDS_MAX(ds_info->maxDepthBounds)); 2942 } 2943 2944 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, 2)) { 2945 tu_cs_emit_regs(&cs, A6XX_RB_STENCILMASK(.mask = ds_info->front.compareMask & 0xff, 2946 .bfmask = ds_info->back.compareMask & 0xff)); 2947 } 2948 2949 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, 2)) { 2950 update_stencil_mask(&pipeline->stencil_wrmask, VK_STENCIL_FACE_FRONT_BIT, ds_info->front.writeMask); 2951 update_stencil_mask(&pipeline->stencil_wrmask, VK_STENCIL_FACE_BACK_BIT, ds_info->back.writeMask); 2952 tu_cs_emit_regs(&cs, A6XX_RB_STENCILWRMASK(.dword = pipeline->stencil_wrmask)); 2953 } 2954 2955 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_REFERENCE, 2)) { 2956 tu_cs_emit_regs(&cs, A6XX_RB_STENCILREF(.ref = ds_info->front.reference & 0xff, 2957 .bfref = ds_info->back.reference & 0xff)); 2958 } 2959 2960 if (builder->shaders[MESA_SHADER_FRAGMENT]) { 2961 const struct ir3_shader_variant *fs = &builder->shaders[MESA_SHADER_FRAGMENT]->ir3_shader->variants[0]; 2962 if (fs->has_kill || fs->no_earlyz || fs->writes_pos) { 2963 pipeline->lrz.force_disable_mask |= TU_LRZ_FORCE_DISABLE_WRITE; 2964 } 2965 if (fs->no_earlyz || fs->writes_pos) { 2966 pipeline->lrz.force_disable_mask = TU_LRZ_FORCE_DISABLE_LRZ; 2967 } 2968 } 2969} 2970 2971static void 2972tu_pipeline_builder_parse_multisample_and_color_blend( 2973 struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline) 2974{ 2975 /* The spec says: 2976 * 2977 * pMultisampleState is a pointer to an instance of the 2978 * VkPipelineMultisampleStateCreateInfo, and is ignored if the pipeline 2979 * has rasterization disabled. 2980 * 2981 * Also, 2982 * 2983 * pColorBlendState is a pointer to an instance of the 2984 * VkPipelineColorBlendStateCreateInfo structure, and is ignored if the 2985 * pipeline has rasterization disabled or if the subpass of the render 2986 * pass the pipeline is created against does not use any color 2987 * attachments. 2988 * 2989 * We leave the relevant registers stale when rasterization is disabled. 2990 */ 2991 if (builder->rasterizer_discard) 2992 return; 2993 2994 static const VkPipelineColorBlendStateCreateInfo dummy_blend_info; 2995 const VkPipelineMultisampleStateCreateInfo *msaa_info = 2996 builder->create_info->pMultisampleState; 2997 const VkPipelineColorBlendStateCreateInfo *blend_info = 2998 builder->use_color_attachments ? builder->create_info->pColorBlendState 2999 : &dummy_blend_info; 3000 3001 struct tu_cs cs; 3002 pipeline->blend_state = 3003 tu_cs_draw_state(&pipeline->cs, &cs, blend_info->attachmentCount * 3 + 4); 3004 3005 uint32_t blend_enable_mask; 3006 tu6_emit_rb_mrt_controls(&cs, blend_info, 3007 builder->color_attachment_formats, 3008 &blend_enable_mask); 3009 3010 tu6_emit_blend_control(&cs, blend_enable_mask, 3011 builder->use_dual_src_blend, msaa_info); 3012 3013 assert(cs.cur == cs.end); /* validate draw state size */ 3014 3015 if (blend_enable_mask) { 3016 for (int i = 0; i < blend_info->attachmentCount; i++) { 3017 VkPipelineColorBlendAttachmentState blendAttachment = blend_info->pAttachments[i]; 3018 /* Disable LRZ writes when blend is enabled, since the 3019 * resulting pixel value from the blend-draw 3020 * depends on an earlier draw, which LRZ in the draw pass 3021 * could early-reject if the previous blend-enabled draw wrote LRZ. 3022 * 3023 * From the PoV of LRZ, having masked color channels is 3024 * the same as having blend enabled, in that the draw will 3025 * care about the fragments from an earlier draw. 3026 * 3027 * TODO: We need to disable LRZ writes only for the binning pass. 3028 * Therefore, we need to emit it in a separate draw state. We keep 3029 * it disabled for sysmem path as well for the moment. 3030 */ 3031 if (blendAttachment.blendEnable || blendAttachment.colorWriteMask != 0xf) { 3032 pipeline->lrz.force_disable_mask |= TU_LRZ_FORCE_DISABLE_WRITE; 3033 } 3034 } 3035 } 3036 3037 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_BLEND_CONSTANTS, 5)) { 3038 tu_cs_emit_pkt4(&cs, REG_A6XX_RB_BLEND_RED_F32, 4); 3039 tu_cs_emit_array(&cs, (const uint32_t *) blend_info->blendConstants, 4); 3040 } 3041 3042 const struct VkPipelineSampleLocationsStateCreateInfoEXT *sample_locations = 3043 vk_find_struct_const(msaa_info->pNext, PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT); 3044 const VkSampleLocationsInfoEXT *samp_loc = NULL; 3045 3046 if (sample_locations && sample_locations->sampleLocationsEnable) 3047 samp_loc = &sample_locations->sampleLocationsInfo; 3048 3049 if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_SAMPLE_LOCATIONS, 3050 samp_loc ? 9 : 6)) { 3051 tu6_emit_sample_locations(&cs, samp_loc); 3052 } 3053} 3054 3055static void 3056tu_pipeline_finish(struct tu_pipeline *pipeline, 3057 struct tu_device *dev, 3058 const VkAllocationCallbacks *alloc) 3059{ 3060 tu_cs_finish(&pipeline->cs); 3061 3062 if (pipeline->pvtmem_bo.size) 3063 tu_bo_finish(dev, &pipeline->pvtmem_bo); 3064 3065 ralloc_free(pipeline->executables_mem_ctx); 3066} 3067 3068static VkResult 3069tu_pipeline_builder_build(struct tu_pipeline_builder *builder, 3070 struct tu_pipeline **pipeline) 3071{ 3072 VkResult result; 3073 3074 *pipeline = vk_object_zalloc(&builder->device->vk, builder->alloc, 3075 sizeof(**pipeline), VK_OBJECT_TYPE_PIPELINE); 3076 if (!*pipeline) 3077 return VK_ERROR_OUT_OF_HOST_MEMORY; 3078 3079 (*pipeline)->layout = builder->layout; 3080 (*pipeline)->executables_mem_ctx = ralloc_context(NULL); 3081 util_dynarray_init(&(*pipeline)->executables, (*pipeline)->executables_mem_ctx); 3082 3083 /* compile and upload shaders */ 3084 result = tu_pipeline_builder_compile_shaders(builder, *pipeline); 3085 if (result != VK_SUCCESS) { 3086 vk_object_free(&builder->device->vk, builder->alloc, *pipeline); 3087 return result; 3088 } 3089 3090 result = tu_pipeline_allocate_cs(builder->device, *pipeline, builder, NULL); 3091 if (result != VK_SUCCESS) { 3092 vk_object_free(&builder->device->vk, builder->alloc, *pipeline); 3093 return result; 3094 } 3095 3096 for (uint32_t i = 0; i < ARRAY_SIZE(builder->variants); i++) 3097 builder->shader_iova[i] = tu_upload_variant(*pipeline, builder->variants[i]); 3098 3099 builder->binning_vs_iova = 3100 tu_upload_variant(*pipeline, builder->binning_variant); 3101 3102 /* Setup private memory. Note that because we're sharing the same private 3103 * memory for all stages, all stages must use the same config, or else 3104 * fibers from one stage might overwrite fibers in another. 3105 */ 3106 3107 uint32_t pvtmem_size = 0; 3108 bool per_wave = true; 3109 for (uint32_t i = 0; i < ARRAY_SIZE(builder->variants); i++) { 3110 if (builder->variants[i]) { 3111 pvtmem_size = MAX2(pvtmem_size, builder->variants[i]->pvtmem_size); 3112 if (!builder->variants[i]->pvtmem_per_wave) 3113 per_wave = false; 3114 } 3115 } 3116 3117 if (builder->binning_variant) { 3118 pvtmem_size = MAX2(pvtmem_size, builder->binning_variant->pvtmem_size); 3119 if (!builder->binning_variant->pvtmem_per_wave) 3120 per_wave = false; 3121 } 3122 3123 result = tu_setup_pvtmem(builder->device, *pipeline, &builder->pvtmem, 3124 pvtmem_size, per_wave); 3125 if (result != VK_SUCCESS) { 3126 vk_object_free(&builder->device->vk, builder->alloc, *pipeline); 3127 return result; 3128 } 3129 3130 tu_pipeline_builder_parse_dynamic(builder, *pipeline); 3131 tu_pipeline_builder_parse_shader_stages(builder, *pipeline); 3132 tu_pipeline_builder_parse_vertex_input(builder, *pipeline); 3133 tu_pipeline_builder_parse_input_assembly(builder, *pipeline); 3134 tu_pipeline_builder_parse_tessellation(builder, *pipeline); 3135 tu_pipeline_builder_parse_viewport(builder, *pipeline); 3136 tu_pipeline_builder_parse_rasterization(builder, *pipeline); 3137 tu_pipeline_builder_parse_depth_stencil(builder, *pipeline); 3138 tu_pipeline_builder_parse_multisample_and_color_blend(builder, *pipeline); 3139 tu6_emit_load_state(*pipeline, false); 3140 3141 /* we should have reserved enough space upfront such that the CS never 3142 * grows 3143 */ 3144 assert((*pipeline)->cs.bo_count == 1); 3145 3146 return VK_SUCCESS; 3147} 3148 3149static void 3150tu_pipeline_builder_finish(struct tu_pipeline_builder *builder) 3151{ 3152 for (uint32_t i = 0; i < ARRAY_SIZE(builder->shaders); i++) { 3153 if (!builder->shaders[i]) 3154 continue; 3155 tu_shader_destroy(builder->device, builder->shaders[i], builder->alloc); 3156 } 3157} 3158 3159static void 3160tu_pipeline_builder_init_graphics( 3161 struct tu_pipeline_builder *builder, 3162 struct tu_device *dev, 3163 struct tu_pipeline_cache *cache, 3164 const VkGraphicsPipelineCreateInfo *create_info, 3165 const VkAllocationCallbacks *alloc) 3166{ 3167 TU_FROM_HANDLE(tu_pipeline_layout, layout, create_info->layout); 3168 3169 *builder = (struct tu_pipeline_builder) { 3170 .device = dev, 3171 .cache = cache, 3172 .create_info = create_info, 3173 .alloc = alloc, 3174 .layout = layout, 3175 }; 3176 3177 bool rasterizer_discard_dynamic = false; 3178 if (create_info->pDynamicState) { 3179 for (uint32_t i = 0; i < create_info->pDynamicState->dynamicStateCount; i++) { 3180 if (create_info->pDynamicState->pDynamicStates[i] == 3181 VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT) { 3182 rasterizer_discard_dynamic = true; 3183 break; 3184 } 3185 } 3186 } 3187 3188 const struct tu_render_pass *pass = 3189 tu_render_pass_from_handle(create_info->renderPass); 3190 const struct tu_subpass *subpass = 3191 &pass->subpasses[create_info->subpass]; 3192 3193 builder->multiview_mask = subpass->multiview_mask; 3194 3195 builder->rasterizer_discard = 3196 builder->create_info->pRasterizationState->rasterizerDiscardEnable && 3197 !rasterizer_discard_dynamic; 3198 3199 /* variableMultisampleRate support */ 3200 builder->emit_msaa_state = (subpass->samples == 0) && !builder->rasterizer_discard; 3201 3202 if (builder->rasterizer_discard) { 3203 builder->samples = VK_SAMPLE_COUNT_1_BIT; 3204 } else { 3205 builder->samples = create_info->pMultisampleState->rasterizationSamples; 3206 builder->alpha_to_coverage = create_info->pMultisampleState->alphaToCoverageEnable; 3207 3208 const uint32_t a = subpass->depth_stencil_attachment.attachment; 3209 builder->depth_attachment_format = (a != VK_ATTACHMENT_UNUSED) ? 3210 pass->attachments[a].format : VK_FORMAT_UNDEFINED; 3211 3212 assert(subpass->color_count == 0 || 3213 !create_info->pColorBlendState || 3214 subpass->color_count == create_info->pColorBlendState->attachmentCount); 3215 builder->color_attachment_count = subpass->color_count; 3216 for (uint32_t i = 0; i < subpass->color_count; i++) { 3217 const uint32_t a = subpass->color_attachments[i].attachment; 3218 if (a == VK_ATTACHMENT_UNUSED) 3219 continue; 3220 3221 builder->color_attachment_formats[i] = pass->attachments[a].format; 3222 builder->use_color_attachments = true; 3223 builder->render_components |= 0xf << (i * 4); 3224 } 3225 3226 if (tu_blend_state_is_dual_src(create_info->pColorBlendState)) { 3227 builder->color_attachment_count++; 3228 builder->use_dual_src_blend = true; 3229 /* dual source blending has an extra fs output in the 2nd slot */ 3230 if (subpass->color_attachments[0].attachment != VK_ATTACHMENT_UNUSED) 3231 builder->render_components |= 0xf << 4; 3232 } 3233 } 3234} 3235 3236static VkResult 3237tu_graphics_pipeline_create(VkDevice device, 3238 VkPipelineCache pipelineCache, 3239 const VkGraphicsPipelineCreateInfo *pCreateInfo, 3240 const VkAllocationCallbacks *pAllocator, 3241 VkPipeline *pPipeline) 3242{ 3243 TU_FROM_HANDLE(tu_device, dev, device); 3244 TU_FROM_HANDLE(tu_pipeline_cache, cache, pipelineCache); 3245 3246 struct tu_pipeline_builder builder; 3247 tu_pipeline_builder_init_graphics(&builder, dev, cache, 3248 pCreateInfo, pAllocator); 3249 3250 struct tu_pipeline *pipeline = NULL; 3251 VkResult result = tu_pipeline_builder_build(&builder, &pipeline); 3252 tu_pipeline_builder_finish(&builder); 3253 3254 if (result == VK_SUCCESS) 3255 *pPipeline = tu_pipeline_to_handle(pipeline); 3256 else 3257 *pPipeline = VK_NULL_HANDLE; 3258 3259 return result; 3260} 3261 3262VKAPI_ATTR VkResult VKAPI_CALL 3263tu_CreateGraphicsPipelines(VkDevice device, 3264 VkPipelineCache pipelineCache, 3265 uint32_t count, 3266 const VkGraphicsPipelineCreateInfo *pCreateInfos, 3267 const VkAllocationCallbacks *pAllocator, 3268 VkPipeline *pPipelines) 3269{ 3270 VkResult final_result = VK_SUCCESS; 3271 3272 for (uint32_t i = 0; i < count; i++) { 3273 VkResult result = tu_graphics_pipeline_create(device, pipelineCache, 3274 &pCreateInfos[i], pAllocator, 3275 &pPipelines[i]); 3276 3277 if (result != VK_SUCCESS) 3278 final_result = result; 3279 } 3280 3281 return final_result; 3282} 3283 3284static VkResult 3285tu_compute_pipeline_create(VkDevice device, 3286 VkPipelineCache _cache, 3287 const VkComputePipelineCreateInfo *pCreateInfo, 3288 const VkAllocationCallbacks *pAllocator, 3289 VkPipeline *pPipeline) 3290{ 3291 TU_FROM_HANDLE(tu_device, dev, device); 3292 TU_FROM_HANDLE(tu_pipeline_layout, layout, pCreateInfo->layout); 3293 const VkPipelineShaderStageCreateInfo *stage_info = &pCreateInfo->stage; 3294 VkResult result; 3295 3296 struct tu_pipeline *pipeline; 3297 3298 *pPipeline = VK_NULL_HANDLE; 3299 3300 pipeline = vk_object_zalloc(&dev->vk, pAllocator, sizeof(*pipeline), 3301 VK_OBJECT_TYPE_PIPELINE); 3302 if (!pipeline) 3303 return VK_ERROR_OUT_OF_HOST_MEMORY; 3304 3305 pipeline->layout = layout; 3306 3307 pipeline->executables_mem_ctx = ralloc_context(NULL); 3308 util_dynarray_init(&pipeline->executables, pipeline->executables_mem_ctx); 3309 3310 struct ir3_shader_key key = {}; 3311 3312 nir_shader *nir = tu_spirv_to_nir(dev, stage_info, MESA_SHADER_COMPUTE); 3313 3314 const bool executable_info = pCreateInfo->flags & 3315 VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR; 3316 3317 char *nir_initial_disasm = executable_info ? 3318 nir_shader_as_str(nir, pipeline->executables_mem_ctx) : NULL; 3319 3320 struct tu_shader *shader = 3321 tu_shader_create(dev, nir, 0, layout, pAllocator); 3322 if (!shader) { 3323 result = VK_ERROR_OUT_OF_HOST_MEMORY; 3324 goto fail; 3325 } 3326 3327 pipeline->active_desc_sets = shader->active_desc_sets; 3328 3329 bool created; 3330 struct ir3_shader_variant *v = 3331 ir3_shader_get_variant(shader->ir3_shader, &key, false, executable_info, &created); 3332 if (!v) { 3333 result = VK_ERROR_OUT_OF_HOST_MEMORY; 3334 goto fail; 3335 } 3336 3337 tu_pipeline_set_linkage(&pipeline->program.link[MESA_SHADER_COMPUTE], 3338 shader, v); 3339 3340 result = tu_pipeline_allocate_cs(dev, pipeline, NULL, v); 3341 if (result != VK_SUCCESS) 3342 goto fail; 3343 3344 uint64_t shader_iova = tu_upload_variant(pipeline, v); 3345 3346 struct tu_pvtmem_config pvtmem; 3347 tu_setup_pvtmem(dev, pipeline, &pvtmem, v->pvtmem_size, v->pvtmem_per_wave); 3348 3349 for (int i = 0; i < 3; i++) 3350 pipeline->compute.local_size[i] = v->local_size[i]; 3351 3352 pipeline->compute.subgroup_size = v->info.double_threadsize ? 128 : 64; 3353 3354 struct tu_cs prog_cs; 3355 uint32_t additional_reserve_size = tu_xs_get_additional_cs_size_dwords(v); 3356 tu_cs_begin_sub_stream(&pipeline->cs, 64 + additional_reserve_size, &prog_cs); 3357 tu6_emit_cs_config(&prog_cs, shader, v, &pvtmem, shader_iova); 3358 pipeline->program.state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs); 3359 3360 tu6_emit_load_state(pipeline, true); 3361 3362 tu_append_executable(pipeline, v, nir_initial_disasm); 3363 3364 tu_shader_destroy(dev, shader, pAllocator); 3365 3366 *pPipeline = tu_pipeline_to_handle(pipeline); 3367 3368 return VK_SUCCESS; 3369 3370fail: 3371 if (shader) 3372 tu_shader_destroy(dev, shader, pAllocator); 3373 3374 vk_object_free(&dev->vk, pAllocator, pipeline); 3375 3376 return result; 3377} 3378 3379VKAPI_ATTR VkResult VKAPI_CALL 3380tu_CreateComputePipelines(VkDevice device, 3381 VkPipelineCache pipelineCache, 3382 uint32_t count, 3383 const VkComputePipelineCreateInfo *pCreateInfos, 3384 const VkAllocationCallbacks *pAllocator, 3385 VkPipeline *pPipelines) 3386{ 3387 VkResult final_result = VK_SUCCESS; 3388 3389 for (uint32_t i = 0; i < count; i++) { 3390 VkResult result = tu_compute_pipeline_create(device, pipelineCache, 3391 &pCreateInfos[i], 3392 pAllocator, &pPipelines[i]); 3393 if (result != VK_SUCCESS) 3394 final_result = result; 3395 } 3396 3397 return final_result; 3398} 3399 3400VKAPI_ATTR void VKAPI_CALL 3401tu_DestroyPipeline(VkDevice _device, 3402 VkPipeline _pipeline, 3403 const VkAllocationCallbacks *pAllocator) 3404{ 3405 TU_FROM_HANDLE(tu_device, dev, _device); 3406 TU_FROM_HANDLE(tu_pipeline, pipeline, _pipeline); 3407 3408 if (!_pipeline) 3409 return; 3410 3411 tu_pipeline_finish(pipeline, dev, pAllocator); 3412 vk_object_free(&dev->vk, pAllocator, pipeline); 3413} 3414 3415#define WRITE_STR(field, ...) ({ \ 3416 memset(field, 0, sizeof(field)); \ 3417 UNUSED int _i = snprintf(field, sizeof(field), __VA_ARGS__); \ 3418 assert(_i > 0 && _i < sizeof(field)); \ 3419}) 3420 3421static const struct tu_pipeline_executable * 3422tu_pipeline_get_executable(struct tu_pipeline *pipeline, uint32_t index) 3423{ 3424 assert(index < util_dynarray_num_elements(&pipeline->executables, 3425 struct tu_pipeline_executable)); 3426 return util_dynarray_element( 3427 &pipeline->executables, struct tu_pipeline_executable, index); 3428} 3429 3430VKAPI_ATTR VkResult VKAPI_CALL 3431tu_GetPipelineExecutablePropertiesKHR( 3432 VkDevice _device, 3433 const VkPipelineInfoKHR* pPipelineInfo, 3434 uint32_t* pExecutableCount, 3435 VkPipelineExecutablePropertiesKHR* pProperties) 3436{ 3437 TU_FROM_HANDLE(tu_device, dev, _device); 3438 TU_FROM_HANDLE(tu_pipeline, pipeline, pPipelineInfo->pipeline); 3439 VK_OUTARRAY_MAKE(out, pProperties, pExecutableCount); 3440 3441 util_dynarray_foreach (&pipeline->executables, struct tu_pipeline_executable, exe) { 3442 vk_outarray_append(&out, props) { 3443 gl_shader_stage stage = exe->stage; 3444 props->stages = mesa_to_vk_shader_stage(stage); 3445 3446 if (!exe->is_binning) 3447 WRITE_STR(props->name, "%s", _mesa_shader_stage_to_abbrev(stage)); 3448 else 3449 WRITE_STR(props->name, "Binning VS"); 3450 3451 WRITE_STR(props->description, "%s", _mesa_shader_stage_to_string(stage)); 3452 3453 props->subgroupSize = 3454 dev->compiler->threadsize_base * (exe->stats.double_threadsize ? 2 : 1); 3455 } 3456 } 3457 3458 return vk_outarray_status(&out); 3459} 3460 3461VKAPI_ATTR VkResult VKAPI_CALL 3462tu_GetPipelineExecutableStatisticsKHR( 3463 VkDevice _device, 3464 const VkPipelineExecutableInfoKHR* pExecutableInfo, 3465 uint32_t* pStatisticCount, 3466 VkPipelineExecutableStatisticKHR* pStatistics) 3467{ 3468 TU_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline); 3469 VK_OUTARRAY_MAKE(out, pStatistics, pStatisticCount); 3470 3471 const struct tu_pipeline_executable *exe = 3472 tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex); 3473 3474 vk_outarray_append(&out, stat) { 3475 WRITE_STR(stat->name, "Max Waves Per Core"); 3476 WRITE_STR(stat->description, 3477 "Maximum number of simultaneous waves per core."); 3478 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 3479 stat->value.u64 = exe->stats.max_waves; 3480 } 3481 3482 vk_outarray_append(&out, stat) { 3483 WRITE_STR(stat->name, "Instruction Count"); 3484 WRITE_STR(stat->description, 3485 "Total number of IR3 instructions in the final generated " 3486 "shader executable."); 3487 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 3488 stat->value.u64 = exe->stats.instrs_count; 3489 } 3490 3491 vk_outarray_append(&out, stat) { 3492 WRITE_STR(stat->name, "NOPs Count"); 3493 WRITE_STR(stat->description, 3494 "Number of NOP instructions in the final generated " 3495 "shader executable."); 3496 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 3497 stat->value.u64 = exe->stats.nops_count; 3498 } 3499 3500 vk_outarray_append(&out, stat) { 3501 WRITE_STR(stat->name, "MOV Count"); 3502 WRITE_STR(stat->description, 3503 "Number of MOV instructions in the final generated " 3504 "shader executable."); 3505 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 3506 stat->value.u64 = exe->stats.mov_count; 3507 } 3508 3509 vk_outarray_append(&out, stat) { 3510 WRITE_STR(stat->name, "COV Count"); 3511 WRITE_STR(stat->description, 3512 "Number of COV instructions in the final generated " 3513 "shader executable."); 3514 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 3515 stat->value.u64 = exe->stats.cov_count; 3516 } 3517 3518 vk_outarray_append(&out, stat) { 3519 WRITE_STR(stat->name, "Registers used"); 3520 WRITE_STR(stat->description, 3521 "Number of registers used in the final generated " 3522 "shader executable."); 3523 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 3524 stat->value.u64 = exe->stats.max_reg + 1; 3525 } 3526 3527 vk_outarray_append(&out, stat) { 3528 WRITE_STR(stat->name, "Half-registers used"); 3529 WRITE_STR(stat->description, 3530 "Number of half-registers used in the final generated " 3531 "shader executable."); 3532 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 3533 stat->value.u64 = exe->stats.max_half_reg + 1; 3534 } 3535 3536 vk_outarray_append(&out, stat) { 3537 WRITE_STR(stat->name, "Instructions with SS sync bit"); 3538 WRITE_STR(stat->description, 3539 "SS bit is set for instructions which depend on a result " 3540 "of \"long\" instructions to prevent RAW hazard."); 3541 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 3542 stat->value.u64 = exe->stats.ss; 3543 } 3544 3545 vk_outarray_append(&out, stat) { 3546 WRITE_STR(stat->name, "Instructions with SY sync bit"); 3547 WRITE_STR(stat->description, 3548 "SY bit is set for instructions which depend on a result " 3549 "of loads from global memory to prevent RAW hazard."); 3550 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 3551 stat->value.u64 = exe->stats.sy; 3552 } 3553 3554 vk_outarray_append(&out, stat) { 3555 WRITE_STR(stat->name, "Estimated cycles stalled on SS"); 3556 WRITE_STR(stat->description, 3557 "A better metric to estimate the impact of SS syncs."); 3558 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 3559 stat->value.u64 = exe->stats.sstall; 3560 } 3561 3562 for (int i = 0; i < ARRAY_SIZE(exe->stats.instrs_per_cat); i++) { 3563 vk_outarray_append(&out, stat) { 3564 WRITE_STR(stat->name, "cat%d instructions", i); 3565 WRITE_STR(stat->description, 3566 "Number of cat%d instructions.", i); 3567 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 3568 stat->value.u64 = exe->stats.instrs_per_cat[i]; 3569 } 3570 } 3571 3572 vk_outarray_append(&out, stat) { 3573 WRITE_STR(stat->name, "STP Count"); 3574 WRITE_STR(stat->description, 3575 "Number of STore Private instructions in the final generated " 3576 "shader executable."); 3577 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 3578 stat->value.u64 = exe->stats.stp_count; 3579 } 3580 3581 vk_outarray_append(&out, stat) { 3582 WRITE_STR(stat->name, "LDP Count"); 3583 WRITE_STR(stat->description, 3584 "Number of LoaD Private instructions in the final generated " 3585 "shader executable."); 3586 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 3587 stat->value.u64 = exe->stats.ldp_count; 3588 } 3589 3590 return vk_outarray_status(&out); 3591} 3592 3593static bool 3594write_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir, 3595 const char *data) 3596{ 3597 ir->isText = VK_TRUE; 3598 3599 size_t data_len = strlen(data) + 1; 3600 3601 if (ir->pData == NULL) { 3602 ir->dataSize = data_len; 3603 return true; 3604 } 3605 3606 strncpy(ir->pData, data, ir->dataSize); 3607 if (ir->dataSize < data_len) 3608 return false; 3609 3610 ir->dataSize = data_len; 3611 return true; 3612} 3613 3614VKAPI_ATTR VkResult VKAPI_CALL 3615tu_GetPipelineExecutableInternalRepresentationsKHR( 3616 VkDevice _device, 3617 const VkPipelineExecutableInfoKHR* pExecutableInfo, 3618 uint32_t* pInternalRepresentationCount, 3619 VkPipelineExecutableInternalRepresentationKHR* pInternalRepresentations) 3620{ 3621 TU_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline); 3622 VK_OUTARRAY_MAKE(out, pInternalRepresentations, pInternalRepresentationCount); 3623 bool incomplete_text = false; 3624 3625 const struct tu_pipeline_executable *exe = 3626 tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex); 3627 3628 if (exe->nir_from_spirv) { 3629 vk_outarray_append(&out, ir) { 3630 WRITE_STR(ir->name, "NIR from SPIRV"); 3631 WRITE_STR(ir->description, 3632 "Initial NIR before any optimizations"); 3633 3634 if (!write_ir_text(ir, exe->nir_from_spirv)) 3635 incomplete_text = true; 3636 } 3637 } 3638 3639 if (exe->nir_final) { 3640 vk_outarray_append(&out, ir) { 3641 WRITE_STR(ir->name, "Final NIR"); 3642 WRITE_STR(ir->description, 3643 "Final NIR before going into the back-end compiler"); 3644 3645 if (!write_ir_text(ir, exe->nir_final)) 3646 incomplete_text = true; 3647 } 3648 } 3649 3650 if (exe->disasm) { 3651 vk_outarray_append(&out, ir) { 3652 WRITE_STR(ir->name, "IR3 Assembly"); 3653 WRITE_STR(ir->description, 3654 "Final IR3 assembly for the generated shader binary"); 3655 3656 if (!write_ir_text(ir, exe->disasm)) 3657 incomplete_text = true; 3658 } 3659 } 3660 3661 return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out); 3662} 3663