1361fc4cbSmaya/* 2361fc4cbSmaya * Copyright © 2016 Red Hat. 3361fc4cbSmaya * Copyright © 2016 Bas Nieuwenhuizen 4361fc4cbSmaya * 5361fc4cbSmaya * based in part on anv driver which is: 6361fc4cbSmaya * Copyright © 2015 Intel Corporation 7361fc4cbSmaya * 8361fc4cbSmaya * Permission is hereby granted, free of charge, to any person obtaining a 9361fc4cbSmaya * copy of this software and associated documentation files (the "Software"), 10361fc4cbSmaya * to deal in the Software without restriction, including without limitation 11361fc4cbSmaya * the rights to use, copy, modify, merge, publish, distribute, sublicense, 12361fc4cbSmaya * and/or sell copies of the Software, and to permit persons to whom the 13361fc4cbSmaya * Software is furnished to do so, subject to the following conditions: 14361fc4cbSmaya * 15361fc4cbSmaya * The above copyright notice and this permission notice (including the next 16361fc4cbSmaya * paragraph) shall be included in all copies or substantial portions of the 17361fc4cbSmaya * Software. 18361fc4cbSmaya * 19361fc4cbSmaya * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20361fc4cbSmaya * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21361fc4cbSmaya * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22361fc4cbSmaya * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23361fc4cbSmaya * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24361fc4cbSmaya * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25361fc4cbSmaya * DEALINGS IN THE SOFTWARE. 
26361fc4cbSmaya */ 27361fc4cbSmaya 287ec681f3Smrg#include "common/freedreno_guardband.h" 29361fc4cbSmaya#include "tu_private.h" 30361fc4cbSmaya 317ec681f3Smrg#include "ir3/ir3_nir.h" 32361fc4cbSmaya#include "main/menums.h" 33361fc4cbSmaya#include "nir/nir.h" 34361fc4cbSmaya#include "nir/nir_builder.h" 35361fc4cbSmaya#include "spirv/nir_spirv.h" 36361fc4cbSmaya#include "util/debug.h" 37361fc4cbSmaya#include "util/mesa-sha1.h" 38361fc4cbSmaya#include "util/u_atomic.h" 39361fc4cbSmaya#include "vk_format.h" 40361fc4cbSmaya#include "vk_util.h" 41361fc4cbSmaya 42361fc4cbSmaya#include "tu_cs.h" 43361fc4cbSmaya 447ec681f3Smrg/* Emit IB that preloads the descriptors that the shader uses */ 457ec681f3Smrg 467ec681f3Smrgstatic void 477ec681f3Smrgemit_load_state(struct tu_cs *cs, unsigned opcode, enum a6xx_state_type st, 487ec681f3Smrg enum a6xx_state_block sb, unsigned base, unsigned offset, 497ec681f3Smrg unsigned count) 507ec681f3Smrg{ 517ec681f3Smrg /* Note: just emit one packet, even if count overflows NUM_UNIT. It's not 527ec681f3Smrg * clear if emitting more packets will even help anything. Presumably the 537ec681f3Smrg * descriptor cache is relatively small, and these packets stop doing 547ec681f3Smrg * anything when there are too many descriptors. 
557ec681f3Smrg */ 567ec681f3Smrg tu_cs_emit_pkt7(cs, opcode, 3); 577ec681f3Smrg tu_cs_emit(cs, 587ec681f3Smrg CP_LOAD_STATE6_0_STATE_TYPE(st) | 597ec681f3Smrg CP_LOAD_STATE6_0_STATE_SRC(SS6_BINDLESS) | 607ec681f3Smrg CP_LOAD_STATE6_0_STATE_BLOCK(sb) | 617ec681f3Smrg CP_LOAD_STATE6_0_NUM_UNIT(MIN2(count, 1024-1))); 627ec681f3Smrg tu_cs_emit_qw(cs, offset | (base << 28)); 637ec681f3Smrg} 647ec681f3Smrg 657ec681f3Smrgstatic unsigned 667ec681f3Smrgtu6_load_state_size(struct tu_pipeline *pipeline, bool compute) 677ec681f3Smrg{ 687ec681f3Smrg const unsigned load_state_size = 4; 697ec681f3Smrg unsigned size = 0; 707ec681f3Smrg for (unsigned i = 0; i < pipeline->layout->num_sets; i++) { 717ec681f3Smrg if (!(pipeline->active_desc_sets & (1u << i))) 727ec681f3Smrg continue; 737ec681f3Smrg 747ec681f3Smrg struct tu_descriptor_set_layout *set_layout = pipeline->layout->set[i].layout; 757ec681f3Smrg for (unsigned j = 0; j < set_layout->binding_count; j++) { 767ec681f3Smrg struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j]; 777ec681f3Smrg unsigned count = 0; 787ec681f3Smrg /* Note: some users, like amber for example, pass in 797ec681f3Smrg * VK_SHADER_STAGE_ALL which includes a bunch of extra bits, so 807ec681f3Smrg * filter these out by using VK_SHADER_STAGE_ALL_GRAPHICS explicitly. 817ec681f3Smrg */ 827ec681f3Smrg VkShaderStageFlags stages = compute ? 
837ec681f3Smrg binding->shader_stages & VK_SHADER_STAGE_COMPUTE_BIT : 847ec681f3Smrg binding->shader_stages & VK_SHADER_STAGE_ALL_GRAPHICS; 857ec681f3Smrg unsigned stage_count = util_bitcount(stages); 867ec681f3Smrg 877ec681f3Smrg if (!binding->array_size) 887ec681f3Smrg continue; 897ec681f3Smrg 907ec681f3Smrg switch (binding->type) { 917ec681f3Smrg case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: 927ec681f3Smrg case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: 937ec681f3Smrg case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: 947ec681f3Smrg case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: 957ec681f3Smrg /* IBO-backed resources only need one packet for all graphics stages */ 967ec681f3Smrg if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT) 977ec681f3Smrg count += 1; 987ec681f3Smrg if (stages & VK_SHADER_STAGE_COMPUTE_BIT) 997ec681f3Smrg count += 1; 1007ec681f3Smrg break; 1017ec681f3Smrg case VK_DESCRIPTOR_TYPE_SAMPLER: 1027ec681f3Smrg case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: 1037ec681f3Smrg case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: 1047ec681f3Smrg case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: 1057ec681f3Smrg case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: 1067ec681f3Smrg /* Textures and UBO's needs a packet for each stage */ 1077ec681f3Smrg count = stage_count; 1087ec681f3Smrg break; 1097ec681f3Smrg case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: 1107ec681f3Smrg /* Because of how we pack combined images and samplers, we 1117ec681f3Smrg * currently can't use one packet for the whole array. 
1127ec681f3Smrg */ 1137ec681f3Smrg count = stage_count * binding->array_size * 2; 1147ec681f3Smrg break; 1157ec681f3Smrg case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: 1167ec681f3Smrg case VK_DESCRIPTOR_TYPE_MUTABLE_VALVE: 1177ec681f3Smrg break; 1187ec681f3Smrg default: 1197ec681f3Smrg unreachable("bad descriptor type"); 1207ec681f3Smrg } 1217ec681f3Smrg size += count * load_state_size; 1227ec681f3Smrg } 1237ec681f3Smrg } 1247ec681f3Smrg return size; 1257ec681f3Smrg} 1267ec681f3Smrg 1277ec681f3Smrgstatic void 1287ec681f3Smrgtu6_emit_load_state(struct tu_pipeline *pipeline, bool compute) 1297ec681f3Smrg{ 1307ec681f3Smrg unsigned size = tu6_load_state_size(pipeline, compute); 1317ec681f3Smrg if (size == 0) 1327ec681f3Smrg return; 1337ec681f3Smrg 1347ec681f3Smrg struct tu_cs cs; 1357ec681f3Smrg tu_cs_begin_sub_stream(&pipeline->cs, size, &cs); 1367ec681f3Smrg 1377ec681f3Smrg struct tu_pipeline_layout *layout = pipeline->layout; 1387ec681f3Smrg for (unsigned i = 0; i < layout->num_sets; i++) { 1397ec681f3Smrg /* From 13.2.7. Descriptor Set Binding: 1407ec681f3Smrg * 1417ec681f3Smrg * A compatible descriptor set must be bound for all set numbers that 1427ec681f3Smrg * any shaders in a pipeline access, at the time that a draw or 1437ec681f3Smrg * dispatch command is recorded to execute using that pipeline. 1447ec681f3Smrg * However, if none of the shaders in a pipeline statically use any 1457ec681f3Smrg * bindings with a particular set number, then no descriptor set need 1467ec681f3Smrg * be bound for that set number, even if the pipeline layout includes 1477ec681f3Smrg * a non-trivial descriptor set layout for that set number. 1487ec681f3Smrg * 1497ec681f3Smrg * This means that descriptor sets unused by the pipeline may have a 1507ec681f3Smrg * garbage or 0 BINDLESS_BASE register, which will cause context faults 1517ec681f3Smrg * when prefetching descriptors from these sets. Skip prefetching for 1527ec681f3Smrg * descriptors from them to avoid this. 
This is also an optimization, 1537ec681f3Smrg * since these prefetches would be useless. 1547ec681f3Smrg */ 1557ec681f3Smrg if (!(pipeline->active_desc_sets & (1u << i))) 1567ec681f3Smrg continue; 1577ec681f3Smrg 1587ec681f3Smrg struct tu_descriptor_set_layout *set_layout = layout->set[i].layout; 1597ec681f3Smrg for (unsigned j = 0; j < set_layout->binding_count; j++) { 1607ec681f3Smrg struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j]; 1617ec681f3Smrg unsigned base = i; 1627ec681f3Smrg unsigned offset = binding->offset / 4; 1637ec681f3Smrg /* Note: some users, like amber for example, pass in 1647ec681f3Smrg * VK_SHADER_STAGE_ALL which includes a bunch of extra bits, so 1657ec681f3Smrg * filter these out by using VK_SHADER_STAGE_ALL_GRAPHICS explicitly. 1667ec681f3Smrg */ 1677ec681f3Smrg VkShaderStageFlags stages = compute ? 1687ec681f3Smrg binding->shader_stages & VK_SHADER_STAGE_COMPUTE_BIT : 1697ec681f3Smrg binding->shader_stages & VK_SHADER_STAGE_ALL_GRAPHICS; 1707ec681f3Smrg unsigned count = binding->array_size; 1717ec681f3Smrg if (count == 0 || stages == 0) 1727ec681f3Smrg continue; 1737ec681f3Smrg switch (binding->type) { 1747ec681f3Smrg case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: 1757ec681f3Smrg base = MAX_SETS; 1767ec681f3Smrg offset = (layout->set[i].dynamic_offset_start + 1777ec681f3Smrg binding->dynamic_offset_offset) * A6XX_TEX_CONST_DWORDS; 1787ec681f3Smrg FALLTHROUGH; 1797ec681f3Smrg case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: 1807ec681f3Smrg case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: 1817ec681f3Smrg case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: 1827ec681f3Smrg /* IBO-backed resources only need one packet for all graphics stages */ 1837ec681f3Smrg if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT) { 1847ec681f3Smrg emit_load_state(&cs, CP_LOAD_STATE6, ST6_SHADER, SB6_IBO, 1857ec681f3Smrg base, offset, count); 1867ec681f3Smrg } 1877ec681f3Smrg if (stages & VK_SHADER_STAGE_COMPUTE_BIT) { 1887ec681f3Smrg emit_load_state(&cs, 
CP_LOAD_STATE6_FRAG, ST6_IBO, SB6_CS_SHADER, 1897ec681f3Smrg base, offset, count); 1907ec681f3Smrg } 1917ec681f3Smrg break; 1927ec681f3Smrg case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: 1937ec681f3Smrg case VK_DESCRIPTOR_TYPE_MUTABLE_VALVE: 1947ec681f3Smrg /* nothing - input attachment doesn't use bindless */ 1957ec681f3Smrg break; 1967ec681f3Smrg case VK_DESCRIPTOR_TYPE_SAMPLER: 1977ec681f3Smrg case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: 1987ec681f3Smrg case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: { 1997ec681f3Smrg tu_foreach_stage(stage, stages) { 2007ec681f3Smrg emit_load_state(&cs, tu6_stage2opcode(stage), 2017ec681f3Smrg binding->type == VK_DESCRIPTOR_TYPE_SAMPLER ? 2027ec681f3Smrg ST6_SHADER : ST6_CONSTANTS, 2037ec681f3Smrg tu6_stage2texsb(stage), base, offset, count); 2047ec681f3Smrg } 2057ec681f3Smrg break; 2067ec681f3Smrg } 2077ec681f3Smrg case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: 2087ec681f3Smrg base = MAX_SETS; 2097ec681f3Smrg offset = (layout->set[i].dynamic_offset_start + 2107ec681f3Smrg binding->dynamic_offset_offset) * A6XX_TEX_CONST_DWORDS; 2117ec681f3Smrg FALLTHROUGH; 2127ec681f3Smrg case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: { 2137ec681f3Smrg tu_foreach_stage(stage, stages) { 2147ec681f3Smrg emit_load_state(&cs, tu6_stage2opcode(stage), ST6_UBO, 2157ec681f3Smrg tu6_stage2shadersb(stage), base, offset, count); 2167ec681f3Smrg } 2177ec681f3Smrg break; 2187ec681f3Smrg } 2197ec681f3Smrg case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: { 2207ec681f3Smrg tu_foreach_stage(stage, stages) { 2217ec681f3Smrg /* TODO: We could emit less CP_LOAD_STATE6 if we used 2227ec681f3Smrg * struct-of-arrays instead of array-of-structs. 
2237ec681f3Smrg */ 2247ec681f3Smrg for (unsigned i = 0; i < count; i++) { 2257ec681f3Smrg unsigned tex_offset = offset + 2 * i * A6XX_TEX_CONST_DWORDS; 2267ec681f3Smrg unsigned sam_offset = offset + (2 * i + 1) * A6XX_TEX_CONST_DWORDS; 2277ec681f3Smrg emit_load_state(&cs, tu6_stage2opcode(stage), 2287ec681f3Smrg ST6_CONSTANTS, tu6_stage2texsb(stage), 2297ec681f3Smrg base, tex_offset, 1); 2307ec681f3Smrg emit_load_state(&cs, tu6_stage2opcode(stage), 2317ec681f3Smrg ST6_SHADER, tu6_stage2texsb(stage), 2327ec681f3Smrg base, sam_offset, 1); 2337ec681f3Smrg } 2347ec681f3Smrg } 2357ec681f3Smrg break; 2367ec681f3Smrg } 2377ec681f3Smrg default: 2387ec681f3Smrg unreachable("bad descriptor type"); 2397ec681f3Smrg } 2407ec681f3Smrg } 2417ec681f3Smrg } 2427ec681f3Smrg 2437ec681f3Smrg pipeline->load_state = tu_cs_end_draw_state(&pipeline->cs, &cs); 2447ec681f3Smrg} 2457ec681f3Smrg 246361fc4cbSmayastruct tu_pipeline_builder 247361fc4cbSmaya{ 248361fc4cbSmaya struct tu_device *device; 249361fc4cbSmaya struct tu_pipeline_cache *cache; 2507ec681f3Smrg struct tu_pipeline_layout *layout; 251361fc4cbSmaya const VkAllocationCallbacks *alloc; 252361fc4cbSmaya const VkGraphicsPipelineCreateInfo *create_info; 253361fc4cbSmaya 2547ec681f3Smrg struct tu_shader *shaders[MESA_SHADER_FRAGMENT + 1]; 2557ec681f3Smrg struct ir3_shader_variant *variants[MESA_SHADER_FRAGMENT + 1]; 2567ec681f3Smrg struct ir3_shader_variant *binning_variant; 2577ec681f3Smrg uint64_t shader_iova[MESA_SHADER_FRAGMENT + 1]; 2587ec681f3Smrg uint64_t binning_vs_iova; 2597ec681f3Smrg 2607ec681f3Smrg uint32_t additional_cs_reserve_size; 2617ec681f3Smrg 2627ec681f3Smrg struct tu_pvtmem_config pvtmem; 263361fc4cbSmaya 264361fc4cbSmaya bool rasterizer_discard; 265361fc4cbSmaya /* these states are affectd by rasterizer_discard */ 2667ec681f3Smrg bool emit_msaa_state; 267361fc4cbSmaya VkSampleCountFlagBits samples; 268361fc4cbSmaya bool use_color_attachments; 2697ec681f3Smrg bool use_dual_src_blend; 2707ec681f3Smrg bool 
alpha_to_coverage; 271361fc4cbSmaya uint32_t color_attachment_count; 272361fc4cbSmaya VkFormat color_attachment_formats[MAX_RTS]; 2737ec681f3Smrg VkFormat depth_attachment_format; 2747ec681f3Smrg uint32_t render_components; 2757ec681f3Smrg uint32_t multiview_mask; 276361fc4cbSmaya}; 277361fc4cbSmaya 278361fc4cbSmayastatic bool 279361fc4cbSmayatu_logic_op_reads_dst(VkLogicOp op) 280361fc4cbSmaya{ 281361fc4cbSmaya switch (op) { 282361fc4cbSmaya case VK_LOGIC_OP_CLEAR: 283361fc4cbSmaya case VK_LOGIC_OP_COPY: 284361fc4cbSmaya case VK_LOGIC_OP_COPY_INVERTED: 285361fc4cbSmaya case VK_LOGIC_OP_SET: 286361fc4cbSmaya return false; 287361fc4cbSmaya default: 288361fc4cbSmaya return true; 289361fc4cbSmaya } 290361fc4cbSmaya} 291361fc4cbSmaya 292361fc4cbSmayastatic VkBlendFactor 293361fc4cbSmayatu_blend_factor_no_dst_alpha(VkBlendFactor factor) 294361fc4cbSmaya{ 295361fc4cbSmaya /* treat dst alpha as 1.0 and avoid reading it */ 296361fc4cbSmaya switch (factor) { 297361fc4cbSmaya case VK_BLEND_FACTOR_DST_ALPHA: 298361fc4cbSmaya return VK_BLEND_FACTOR_ONE; 299361fc4cbSmaya case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA: 300361fc4cbSmaya return VK_BLEND_FACTOR_ZERO; 301361fc4cbSmaya default: 302361fc4cbSmaya return factor; 303361fc4cbSmaya } 304361fc4cbSmaya} 305361fc4cbSmaya 3067ec681f3Smrgstatic bool tu_blend_factor_is_dual_src(VkBlendFactor factor) 3077ec681f3Smrg{ 3087ec681f3Smrg switch (factor) { 3097ec681f3Smrg case VK_BLEND_FACTOR_SRC1_COLOR: 3107ec681f3Smrg case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR: 3117ec681f3Smrg case VK_BLEND_FACTOR_SRC1_ALPHA: 3127ec681f3Smrg case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA: 3137ec681f3Smrg return true; 314361fc4cbSmaya default: 3157ec681f3Smrg return false; 316361fc4cbSmaya } 317361fc4cbSmaya} 318361fc4cbSmaya 3197ec681f3Smrgstatic bool 3207ec681f3Smrgtu_blend_state_is_dual_src(const VkPipelineColorBlendStateCreateInfo *info) 321361fc4cbSmaya{ 3227ec681f3Smrg if (!info) 3237ec681f3Smrg return false; 3247ec681f3Smrg 3257ec681f3Smrg for (unsigned i 
= 0; i < info->attachmentCount; i++) { 3267ec681f3Smrg const VkPipelineColorBlendAttachmentState *blend = &info->pAttachments[i]; 3277ec681f3Smrg if (tu_blend_factor_is_dual_src(blend->srcColorBlendFactor) || 3287ec681f3Smrg tu_blend_factor_is_dual_src(blend->dstColorBlendFactor) || 3297ec681f3Smrg tu_blend_factor_is_dual_src(blend->srcAlphaBlendFactor) || 3307ec681f3Smrg tu_blend_factor_is_dual_src(blend->dstAlphaBlendFactor)) 3317ec681f3Smrg return true; 332361fc4cbSmaya } 333361fc4cbSmaya 3347ec681f3Smrg return false; 3357ec681f3Smrg} 3367ec681f3Smrg 3377ec681f3Smrgstatic const struct xs_config { 3387ec681f3Smrg uint16_t reg_sp_xs_ctrl; 3397ec681f3Smrg uint16_t reg_sp_xs_config; 3407ec681f3Smrg uint16_t reg_sp_xs_instrlen; 3417ec681f3Smrg uint16_t reg_hlsq_xs_ctrl; 3427ec681f3Smrg uint16_t reg_sp_xs_first_exec_offset; 3437ec681f3Smrg uint16_t reg_sp_xs_pvt_mem_hw_stack_offset; 3447ec681f3Smrg} xs_config[] = { 3457ec681f3Smrg [MESA_SHADER_VERTEX] = { 3467ec681f3Smrg REG_A6XX_SP_VS_CTRL_REG0, 3477ec681f3Smrg REG_A6XX_SP_VS_CONFIG, 3487ec681f3Smrg REG_A6XX_SP_VS_INSTRLEN, 3497ec681f3Smrg REG_A6XX_HLSQ_VS_CNTL, 3507ec681f3Smrg REG_A6XX_SP_VS_OBJ_FIRST_EXEC_OFFSET, 3517ec681f3Smrg REG_A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET, 3527ec681f3Smrg }, 3537ec681f3Smrg [MESA_SHADER_TESS_CTRL] = { 3547ec681f3Smrg REG_A6XX_SP_HS_CTRL_REG0, 3557ec681f3Smrg REG_A6XX_SP_HS_CONFIG, 3567ec681f3Smrg REG_A6XX_SP_HS_INSTRLEN, 3577ec681f3Smrg REG_A6XX_HLSQ_HS_CNTL, 3587ec681f3Smrg REG_A6XX_SP_HS_OBJ_FIRST_EXEC_OFFSET, 3597ec681f3Smrg REG_A6XX_SP_HS_PVT_MEM_HW_STACK_OFFSET, 3607ec681f3Smrg }, 3617ec681f3Smrg [MESA_SHADER_TESS_EVAL] = { 3627ec681f3Smrg REG_A6XX_SP_DS_CTRL_REG0, 3637ec681f3Smrg REG_A6XX_SP_DS_CONFIG, 3647ec681f3Smrg REG_A6XX_SP_DS_INSTRLEN, 3657ec681f3Smrg REG_A6XX_HLSQ_DS_CNTL, 3667ec681f3Smrg REG_A6XX_SP_DS_OBJ_FIRST_EXEC_OFFSET, 3677ec681f3Smrg REG_A6XX_SP_DS_PVT_MEM_HW_STACK_OFFSET, 3687ec681f3Smrg }, 3697ec681f3Smrg [MESA_SHADER_GEOMETRY] = { 3707ec681f3Smrg 
REG_A6XX_SP_GS_CTRL_REG0, 3717ec681f3Smrg REG_A6XX_SP_GS_CONFIG, 3727ec681f3Smrg REG_A6XX_SP_GS_INSTRLEN, 3737ec681f3Smrg REG_A6XX_HLSQ_GS_CNTL, 3747ec681f3Smrg REG_A6XX_SP_GS_OBJ_FIRST_EXEC_OFFSET, 3757ec681f3Smrg REG_A6XX_SP_GS_PVT_MEM_HW_STACK_OFFSET, 3767ec681f3Smrg }, 3777ec681f3Smrg [MESA_SHADER_FRAGMENT] = { 3787ec681f3Smrg REG_A6XX_SP_FS_CTRL_REG0, 3797ec681f3Smrg REG_A6XX_SP_FS_CONFIG, 3807ec681f3Smrg REG_A6XX_SP_FS_INSTRLEN, 3817ec681f3Smrg REG_A6XX_HLSQ_FS_CNTL, 3827ec681f3Smrg REG_A6XX_SP_FS_OBJ_FIRST_EXEC_OFFSET, 3837ec681f3Smrg REG_A6XX_SP_FS_PVT_MEM_HW_STACK_OFFSET, 3847ec681f3Smrg }, 3857ec681f3Smrg [MESA_SHADER_COMPUTE] = { 3867ec681f3Smrg REG_A6XX_SP_CS_CTRL_REG0, 3877ec681f3Smrg REG_A6XX_SP_CS_CONFIG, 3887ec681f3Smrg REG_A6XX_SP_CS_INSTRLEN, 3897ec681f3Smrg REG_A6XX_HLSQ_CS_CNTL, 3907ec681f3Smrg REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET, 3917ec681f3Smrg REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET, 3927ec681f3Smrg }, 3937ec681f3Smrg}; 3947ec681f3Smrg 3957ec681f3Smrgstatic uint32_t 3967ec681f3Smrgtu_xs_get_immediates_packet_size_dwords(const struct ir3_shader_variant *xs) 397361fc4cbSmaya{ 3987ec681f3Smrg const struct ir3_const_state *const_state = ir3_const_state(xs); 3997ec681f3Smrg uint32_t base = const_state->offsets.immediate; 4007ec681f3Smrg int32_t size = DIV_ROUND_UP(const_state->immediates_count, 4); 4017ec681f3Smrg 4027ec681f3Smrg /* truncate size to avoid writing constants that shader 4037ec681f3Smrg * does not use: 4047ec681f3Smrg */ 4057ec681f3Smrg size = MIN2(size + base, xs->constlen) - base; 4067ec681f3Smrg 4077ec681f3Smrg return MAX2(size, 0) * 4; 408361fc4cbSmaya} 409361fc4cbSmaya 4107ec681f3Smrg/* We allocate fixed-length substreams for shader state, however some 4117ec681f3Smrg * parts of the state may have unbound length. Their additional space 4127ec681f3Smrg * requirements should be calculated here. 
4137ec681f3Smrg */ 4147ec681f3Smrgstatic uint32_t 4157ec681f3Smrgtu_xs_get_additional_cs_size_dwords(const struct ir3_shader_variant *xs) 416361fc4cbSmaya{ 4177ec681f3Smrg uint32_t size = tu_xs_get_immediates_packet_size_dwords(xs); 4187ec681f3Smrg return size; 419361fc4cbSmaya} 420361fc4cbSmaya 4217ec681f3Smrgvoid 4227ec681f3Smrgtu6_emit_xs_config(struct tu_cs *cs, 4237ec681f3Smrg gl_shader_stage stage, /* xs->type, but xs may be NULL */ 4247ec681f3Smrg const struct ir3_shader_variant *xs) 425361fc4cbSmaya{ 4267ec681f3Smrg const struct xs_config *cfg = &xs_config[stage]; 4277ec681f3Smrg 4287ec681f3Smrg if (!xs) { 4297ec681f3Smrg /* shader stage disabled */ 4307ec681f3Smrg tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1); 4317ec681f3Smrg tu_cs_emit(cs, 0); 4327ec681f3Smrg 4337ec681f3Smrg tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1); 4347ec681f3Smrg tu_cs_emit(cs, 0); 4357ec681f3Smrg return; 436361fc4cbSmaya } 4377ec681f3Smrg 4387ec681f3Smrg tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1); 4397ec681f3Smrg tu_cs_emit(cs, A6XX_SP_VS_CONFIG_ENABLED | 4407ec681f3Smrg COND(xs->bindless_tex, A6XX_SP_VS_CONFIG_BINDLESS_TEX) | 4417ec681f3Smrg COND(xs->bindless_samp, A6XX_SP_VS_CONFIG_BINDLESS_SAMP) | 4427ec681f3Smrg COND(xs->bindless_ibo, A6XX_SP_VS_CONFIG_BINDLESS_IBO) | 4437ec681f3Smrg COND(xs->bindless_ubo, A6XX_SP_VS_CONFIG_BINDLESS_UBO) | 4447ec681f3Smrg A6XX_SP_VS_CONFIG_NTEX(xs->num_samp) | 4457ec681f3Smrg A6XX_SP_VS_CONFIG_NSAMP(xs->num_samp)); 4467ec681f3Smrg 4477ec681f3Smrg tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1); 4487ec681f3Smrg tu_cs_emit(cs, A6XX_HLSQ_VS_CNTL_CONSTLEN(xs->constlen) | 4497ec681f3Smrg A6XX_HLSQ_VS_CNTL_ENABLED); 450361fc4cbSmaya} 451361fc4cbSmaya 4527ec681f3Smrgvoid 4537ec681f3Smrgtu6_emit_xs(struct tu_cs *cs, 4547ec681f3Smrg gl_shader_stage stage, /* xs->type, but xs may be NULL */ 4557ec681f3Smrg const struct ir3_shader_variant *xs, 4567ec681f3Smrg const struct tu_pvtmem_config *pvtmem, 4577ec681f3Smrg uint64_t binary_iova) 458361fc4cbSmaya{ 
4597ec681f3Smrg const struct xs_config *cfg = &xs_config[stage]; 4607ec681f3Smrg 4617ec681f3Smrg if (!xs) { 4627ec681f3Smrg /* shader stage disabled */ 4637ec681f3Smrg return; 4647ec681f3Smrg } 4657ec681f3Smrg 4667ec681f3Smrg enum a6xx_threadsize thrsz = 4677ec681f3Smrg xs->info.double_threadsize ? THREAD128 : THREAD64; 4687ec681f3Smrg switch (stage) { 4697ec681f3Smrg case MESA_SHADER_VERTEX: 4707ec681f3Smrg tu_cs_emit_regs(cs, A6XX_SP_VS_CTRL_REG0( 4717ec681f3Smrg .fullregfootprint = xs->info.max_reg + 1, 4727ec681f3Smrg .halfregfootprint = xs->info.max_half_reg + 1, 4737ec681f3Smrg .branchstack = ir3_shader_branchstack_hw(xs), 4747ec681f3Smrg .mergedregs = xs->mergedregs, 4757ec681f3Smrg )); 4767ec681f3Smrg break; 4777ec681f3Smrg case MESA_SHADER_TESS_CTRL: 4787ec681f3Smrg tu_cs_emit_regs(cs, A6XX_SP_HS_CTRL_REG0( 4797ec681f3Smrg .fullregfootprint = xs->info.max_reg + 1, 4807ec681f3Smrg .halfregfootprint = xs->info.max_half_reg + 1, 4817ec681f3Smrg .branchstack = ir3_shader_branchstack_hw(xs), 4827ec681f3Smrg )); 4837ec681f3Smrg break; 4847ec681f3Smrg case MESA_SHADER_TESS_EVAL: 4857ec681f3Smrg tu_cs_emit_regs(cs, A6XX_SP_DS_CTRL_REG0( 4867ec681f3Smrg .fullregfootprint = xs->info.max_reg + 1, 4877ec681f3Smrg .halfregfootprint = xs->info.max_half_reg + 1, 4887ec681f3Smrg .branchstack = ir3_shader_branchstack_hw(xs), 4897ec681f3Smrg .mergedregs = xs->mergedregs, 4907ec681f3Smrg )); 4917ec681f3Smrg break; 4927ec681f3Smrg case MESA_SHADER_GEOMETRY: 4937ec681f3Smrg tu_cs_emit_regs(cs, A6XX_SP_GS_CTRL_REG0( 4947ec681f3Smrg .fullregfootprint = xs->info.max_reg + 1, 4957ec681f3Smrg .halfregfootprint = xs->info.max_half_reg + 1, 4967ec681f3Smrg .branchstack = ir3_shader_branchstack_hw(xs), 4977ec681f3Smrg )); 4987ec681f3Smrg break; 4997ec681f3Smrg case MESA_SHADER_FRAGMENT: 5007ec681f3Smrg tu_cs_emit_regs(cs, A6XX_SP_FS_CTRL_REG0( 5017ec681f3Smrg .fullregfootprint = xs->info.max_reg + 1, 5027ec681f3Smrg .halfregfootprint = xs->info.max_half_reg + 1, 5037ec681f3Smrg 
.branchstack = ir3_shader_branchstack_hw(xs), 5047ec681f3Smrg .mergedregs = xs->mergedregs, 5057ec681f3Smrg .threadsize = thrsz, 5067ec681f3Smrg .pixlodenable = xs->need_pixlod, 5077ec681f3Smrg .diff_fine = xs->need_fine_derivatives, 5087ec681f3Smrg .varying = xs->total_in != 0, 5097ec681f3Smrg /* unknown bit, seems unnecessary */ 5107ec681f3Smrg .unk24 = true, 5117ec681f3Smrg )); 5127ec681f3Smrg break; 5137ec681f3Smrg case MESA_SHADER_COMPUTE: 5147ec681f3Smrg tu_cs_emit_regs(cs, A6XX_SP_CS_CTRL_REG0( 5157ec681f3Smrg .fullregfootprint = xs->info.max_reg + 1, 5167ec681f3Smrg .halfregfootprint = xs->info.max_half_reg + 1, 5177ec681f3Smrg .branchstack = ir3_shader_branchstack_hw(xs), 5187ec681f3Smrg .mergedregs = xs->mergedregs, 5197ec681f3Smrg .threadsize = thrsz, 5207ec681f3Smrg )); 5217ec681f3Smrg break; 522361fc4cbSmaya default: 5237ec681f3Smrg unreachable("bad shader stage"); 524361fc4cbSmaya } 525361fc4cbSmaya 5267ec681f3Smrg tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_instrlen, 1); 5277ec681f3Smrg tu_cs_emit(cs, xs->instrlen); 5287ec681f3Smrg 5297ec681f3Smrg /* emit program binary & private memory layout 5307ec681f3Smrg * binary_iova should be aligned to 1 instrlen unit (128 bytes) 5317ec681f3Smrg */ 5327ec681f3Smrg 5337ec681f3Smrg assert((binary_iova & 0x7f) == 0); 5347ec681f3Smrg assert((pvtmem->iova & 0x1f) == 0); 5357ec681f3Smrg 5367ec681f3Smrg tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_first_exec_offset, 7); 5377ec681f3Smrg tu_cs_emit(cs, 0); 5387ec681f3Smrg tu_cs_emit_qw(cs, binary_iova); 5397ec681f3Smrg tu_cs_emit(cs, 5407ec681f3Smrg A6XX_SP_VS_PVT_MEM_PARAM_MEMSIZEPERITEM(pvtmem->per_fiber_size)); 5417ec681f3Smrg tu_cs_emit_qw(cs, pvtmem->iova); 5427ec681f3Smrg tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(pvtmem->per_sp_size) | 5437ec681f3Smrg COND(pvtmem->per_wave, A6XX_SP_VS_PVT_MEM_SIZE_PERWAVEMEMLAYOUT)); 5447ec681f3Smrg 5457ec681f3Smrg tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_pvt_mem_hw_stack_offset, 1); 5467ec681f3Smrg tu_cs_emit(cs, 
A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET_OFFSET(pvtmem->per_sp_size)); 5477ec681f3Smrg 5487ec681f3Smrg tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3); 5497ec681f3Smrg tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) | 5507ec681f3Smrg CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | 5517ec681f3Smrg CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | 5527ec681f3Smrg CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) | 5537ec681f3Smrg CP_LOAD_STATE6_0_NUM_UNIT(xs->instrlen)); 5547ec681f3Smrg tu_cs_emit_qw(cs, binary_iova); 5557ec681f3Smrg 5567ec681f3Smrg /* emit immediates */ 557361fc4cbSmaya 5587ec681f3Smrg const struct ir3_const_state *const_state = ir3_const_state(xs); 5597ec681f3Smrg uint32_t base = const_state->offsets.immediate; 5607ec681f3Smrg unsigned immediate_size = tu_xs_get_immediates_packet_size_dwords(xs); 561361fc4cbSmaya 5627ec681f3Smrg if (immediate_size > 0) { 5637ec681f3Smrg tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + immediate_size); 5647ec681f3Smrg tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) | 5657ec681f3Smrg CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | 5667ec681f3Smrg CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | 5677ec681f3Smrg CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) | 5687ec681f3Smrg CP_LOAD_STATE6_0_NUM_UNIT(immediate_size / 4)); 5697ec681f3Smrg tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); 5707ec681f3Smrg tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); 571361fc4cbSmaya 5727ec681f3Smrg tu_cs_emit_array(cs, const_state->immediates, immediate_size); 5737ec681f3Smrg } 5747ec681f3Smrg 5757ec681f3Smrg if (const_state->constant_data_ubo != -1) { 5767ec681f3Smrg uint64_t iova = binary_iova + xs->info.constant_data_offset; 5777ec681f3Smrg 5787ec681f3Smrg /* Upload UBO state for the constant data. 
*/ 5797ec681f3Smrg tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5); 5807ec681f3Smrg tu_cs_emit(cs, 5817ec681f3Smrg CP_LOAD_STATE6_0_DST_OFF(const_state->constant_data_ubo) | 5827ec681f3Smrg CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO)| 5837ec681f3Smrg CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | 5847ec681f3Smrg CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) | 5857ec681f3Smrg CP_LOAD_STATE6_0_NUM_UNIT(1)); 5867ec681f3Smrg tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); 5877ec681f3Smrg tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); 5887ec681f3Smrg int size_vec4s = DIV_ROUND_UP(xs->constant_data_size, 16); 5897ec681f3Smrg tu_cs_emit_qw(cs, 5907ec681f3Smrg iova | 5917ec681f3Smrg (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32); 5927ec681f3Smrg 5937ec681f3Smrg /* Upload the constant data to the const file if needed. */ 5947ec681f3Smrg const struct ir3_ubo_analysis_state *ubo_state = &const_state->ubo_state; 5957ec681f3Smrg 5967ec681f3Smrg for (int i = 0; i < ubo_state->num_enabled; i++) { 5977ec681f3Smrg if (ubo_state->range[i].ubo.block != const_state->constant_data_ubo || 5987ec681f3Smrg ubo_state->range[i].ubo.bindless) { 5997ec681f3Smrg continue; 6007ec681f3Smrg } 601361fc4cbSmaya 6027ec681f3Smrg uint32_t start = ubo_state->range[i].start; 6037ec681f3Smrg uint32_t end = ubo_state->range[i].end; 6047ec681f3Smrg uint32_t size = MIN2(end - start, 6057ec681f3Smrg (16 * xs->constlen) - ubo_state->range[i].offset); 6067ec681f3Smrg 6077ec681f3Smrg tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3); 6087ec681f3Smrg tu_cs_emit(cs, 6097ec681f3Smrg CP_LOAD_STATE6_0_DST_OFF(ubo_state->range[i].offset / 16) | 6107ec681f3Smrg CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | 6117ec681f3Smrg CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | 6127ec681f3Smrg CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) | 6137ec681f3Smrg CP_LOAD_STATE6_0_NUM_UNIT(size / 16)); 6147ec681f3Smrg tu_cs_emit_qw(cs, iova + start); 6157ec681f3Smrg } 6167ec681f3Smrg } 617361fc4cbSmaya} 618361fc4cbSmaya 
/* Emit per-variant compute shader state: invalidate stale HLSQ CS state,
 * program the shader core config and binary address, the shared-memory
 * size, and the regids the HW uses to deliver workgroup-related system
 * values to the shader.
 */
static void
tu6_emit_cs_config(struct tu_cs *cs, const struct tu_shader *shader,
                   const struct ir3_shader_variant *v,
                   const struct tu_pvtmem_config *pvtmem,
                   uint64_t binary_iova)
{
   tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
         .cs_state = true,
         .cs_ibo = true));

   tu6_emit_xs_config(cs, MESA_SHADER_COMPUTE, v);
   tu6_emit_xs(cs, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova);

   /* shared_size is encoded in 1 KiB units, minimum 1 -- NOTE(review):
    * exact encoding of the A9B1 field is undocumented; confirm against
    * the a6xx register database.
    */
   uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1);
   tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
   tu_cs_emit(cs, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) |
                  A6XX_SP_CS_UNKNOWN_A9B1_UNK6);

   /* LPAC (the standalone compute pipe) has a mirrored copy of this state */
   if (cs->device->physical_device->info->a6xx.has_lpac) {
      tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_UNKNOWN_B9D0, 1);
      tu_cs_emit(cs, A6XX_HLSQ_CS_UNKNOWN_B9D0_SHARED_SIZE(shared_size) |
                     A6XX_HLSQ_CS_UNKNOWN_B9D0_UNK6);
   }

   uint32_t local_invocation_id =
      ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID);
   uint32_t work_group_id =
      ir3_find_sysval_regid(v, SYSTEM_VALUE_WORKGROUP_ID);

   enum a6xx_threadsize thrsz = v->info.double_threadsize ?
      THREAD128 : THREAD64;
   tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL_0, 2);
   tu_cs_emit(cs,
              A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
              A6XX_HLSQ_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
              A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
              A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
   tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
                  A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz));

   if (cs->device->physical_device->info->a6xx.has_lpac) {
      tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 2);
      tu_cs_emit(cs,
                 A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) |
                 A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
                 A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
                 A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
      tu_cs_emit(cs, A6XX_SP_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
                     A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz));
   }
}

/* Program VFD_CONTROL_1..6 with the regids that receive the vertex-pipeline
 * system values (vertex/instance id, tess coords, rel-patch ids, primitive
 * id, GS header, view index) for whichever geometry stages are present.
 * Stages that are absent get regid(63, 0) (i.e. "no register").
 */
static void
tu6_emit_vs_system_values(struct tu_cs *cs,
                          const struct ir3_shader_variant *vs,
                          const struct ir3_shader_variant *hs,
                          const struct ir3_shader_variant *ds,
                          const struct ir3_shader_variant *gs,
                          bool primid_passthru)
{
   const uint32_t vertexid_regid =
      ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID);
   const uint32_t instanceid_regid =
      ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID);
   /* tess coord is looked up on the DS; y lives in the register after x */
   const uint32_t tess_coord_x_regid = hs ?
      ir3_find_sysval_regid(ds, SYSTEM_VALUE_TESS_COORD) :
      regid(63, 0);
   const uint32_t tess_coord_y_regid = VALIDREG(tess_coord_x_regid) ?
      tess_coord_x_regid + 1 :
      regid(63, 0);
   const uint32_t hs_rel_patch_regid = hs ?
      ir3_find_sysval_regid(hs, SYSTEM_VALUE_REL_PATCH_ID_IR3) :
      regid(63, 0);
   const uint32_t ds_rel_patch_regid = hs ?
      ir3_find_sysval_regid(ds, SYSTEM_VALUE_REL_PATCH_ID_IR3) :
      regid(63, 0);
   const uint32_t hs_invocation_regid = hs ?
      ir3_find_sysval_regid(hs, SYSTEM_VALUE_TCS_HEADER_IR3) :
      regid(63, 0);
   const uint32_t gs_primitiveid_regid = gs ?
      ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID) :
      regid(63, 0);
   /* with tess, the "VS" primitive id slot feeds the HS instead */
   const uint32_t vs_primitiveid_regid = hs ?
      ir3_find_sysval_regid(hs, SYSTEM_VALUE_PRIMITIVE_ID) :
      gs_primitiveid_regid;
   const uint32_t ds_primitiveid_regid = ds ?
      ir3_find_sysval_regid(ds, SYSTEM_VALUE_PRIMITIVE_ID) :
      regid(63, 0);
   const uint32_t gsheader_regid = gs ?
      ir3_find_sysval_regid(gs, SYSTEM_VALUE_GS_HEADER_IR3) :
      regid(63, 0);

   /* Note: we currently don't support multiview with tess or GS. If we did,
    * and the HW actually works, then we'd have to somehow share this across
    * stages. Note that the blob doesn't support this either.
    */
   const uint32_t viewid_regid =
      ir3_find_sysval_regid(vs, SYSTEM_VALUE_VIEW_INDEX);

   tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_1, 6);
   tu_cs_emit(cs, A6XX_VFD_CONTROL_1_REGID4VTX(vertexid_regid) |
                  A6XX_VFD_CONTROL_1_REGID4INST(instanceid_regid) |
                  A6XX_VFD_CONTROL_1_REGID4PRIMID(vs_primitiveid_regid) |
                  A6XX_VFD_CONTROL_1_REGID4VIEWID(viewid_regid));
   tu_cs_emit(cs, A6XX_VFD_CONTROL_2_REGID_HSRELPATCHID(hs_rel_patch_regid) |
                  A6XX_VFD_CONTROL_2_REGID_INVOCATIONID(hs_invocation_regid));
   tu_cs_emit(cs, A6XX_VFD_CONTROL_3_REGID_DSRELPATCHID(ds_rel_patch_regid) |
                  A6XX_VFD_CONTROL_3_REGID_TESSX(tess_coord_x_regid) |
                  A6XX_VFD_CONTROL_3_REGID_TESSY(tess_coord_y_regid) |
                  A6XX_VFD_CONTROL_3_REGID_DSPRIMID(ds_primitiveid_regid));
   tu_cs_emit(cs, 0x000000fc); /* VFD_CONTROL_4 */
   tu_cs_emit(cs, A6XX_VFD_CONTROL_5_REGID_GSHEADER(gsheader_regid) |
                  0xfc00); /* VFD_CONTROL_5 */
   tu_cs_emit(cs, COND(primid_passthru, A6XX_VFD_CONTROL_6_PRIMID_PASSTHRU)); /* VFD_CONTROL_6 */
}

/* Build and emit the transform-feedback (streamout) program from the
 * shader's stream_output info and the FS linkage map: per-buffer component
 * counts plus the per-dword SO "program" that routes VPC locations to
 * output buffer offsets.
 */
static void
tu6_setup_streamout(struct tu_cs *cs,
                    const struct ir3_shader_variant *v,
                    struct ir3_shader_linkage *l)
{
   const struct ir3_stream_output_info *info = &v->shader->stream_output;
   /* Note: 64 here comes from the HW layout of the program RAM. The program
    * for stream N is at DWORD 64 * N.
    */
#define A6XX_SO_PROG_DWORDS 64
   uint32_t prog[A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS] = {};
   BITSET_DECLARE(valid_dwords, A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) = {0};
   uint32_t ncomp[IR3_MAX_SO_BUFFERS] = {};

   /* TODO: streamout state should be in a non-GMEM draw state */

   /* no streamout: */
   if (info->num_outputs == 0) {
      tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 4);
      tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL);
      tu_cs_emit(cs, 0);
      return;
   }

   /* is there something to do with info->stride[i]? */

   for (unsigned i = 0; i < info->num_outputs; i++) {
      const struct ir3_stream_output *out = &info->output[i];
      unsigned k = out->register_index;
      unsigned idx;

      /* Skip it, if it's an output that was never assigned a register. */
      if (k >= v->outputs_count || v->outputs[k].regid == INVALID_REG)
         continue;

      ncomp[out->output_buffer] += out->num_components;

      /* linkage map sorted by order frag shader wants things, so
       * a bit less ideal here..
       */
      for (idx = 0; idx < l->cnt; idx++)
         if (l->var[idx].regid == v->outputs[k].regid)
            break;

      debug_assert(idx < l->cnt);

      for (unsigned j = 0; j < out->num_components; j++) {
         unsigned c = j + out->start_component;
         unsigned loc = l->var[idx].loc + c;
         unsigned off = j + out->dst_offset; /* in dwords */

         /* each SO program dword holds two entries (A = even loc, B = odd) */
         assert(loc < A6XX_SO_PROG_DWORDS * 2);
         unsigned dword = out->stream * A6XX_SO_PROG_DWORDS + loc/2;
         if (loc & 1) {
            prog[dword] |= A6XX_VPC_SO_PROG_B_EN |
                           A6XX_VPC_SO_PROG_B_BUF(out->output_buffer) |
                           A6XX_VPC_SO_PROG_B_OFF(off * 4);
         } else {
            prog[dword] |= A6XX_VPC_SO_PROG_A_EN |
                           A6XX_VPC_SO_PROG_A_BUF(out->output_buffer) |
                           A6XX_VPC_SO_PROG_A_OFF(off * 4);
         }
         BITSET_SET(valid_dwords, dword);
      }
   }

   /* Count packet payload: each range costs one VPC_SO_CNTL pair (the "+1")
    * plus one VPC_SO_PROG pair per valid dword (BITSET_FOREACH_RANGE's `end`
    * is exclusive).
    */
   unsigned prog_count = 0;
   unsigned start, end;
   BITSET_FOREACH_RANGE(start, end, valid_dwords,
                        A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
      prog_count += end - start + 1;
   }

   tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 10 + 2 * prog_count);
   tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL);
   tu_cs_emit(cs, A6XX_VPC_SO_STREAM_CNTL_STREAM_ENABLE(info->streams_written) |
                  COND(ncomp[0] > 0,
                       A6XX_VPC_SO_STREAM_CNTL_BUF0_STREAM(1 + info->buffer_to_stream[0])) |
                  COND(ncomp[1] > 0,
                       A6XX_VPC_SO_STREAM_CNTL_BUF1_STREAM(1 + info->buffer_to_stream[1])) |
                  COND(ncomp[2] > 0,
                       A6XX_VPC_SO_STREAM_CNTL_BUF2_STREAM(1 + info->buffer_to_stream[2])) |
                  COND(ncomp[3] > 0,
                       A6XX_VPC_SO_STREAM_CNTL_BUF3_STREAM(1 + info->buffer_to_stream[3])));
   for (uint32_t i = 0; i < 4; i++) {
      tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(i));
      tu_cs_emit(cs, ncomp[i]);
   }
   bool first = true;
   BITSET_FOREACH_RANGE(start, end, valid_dwords,
                        A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
      tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
      tu_cs_emit(cs, COND(first, A6XX_VPC_SO_CNTL_RESET) |
                     A6XX_VPC_SO_CNTL_ADDR(start));
      for (unsigned i = start; i < end; i++) {
         tu_cs_emit(cs, REG_A6XX_VPC_SO_PROG);
         tu_cs_emit(cs, prog[i]);
      }
      first = false;
   }
}

/* Emit a direct (inline-payload) CP_LOAD_STATE6 that uploads `size` dwords
 * of constants, starting `offset` bytes into `dwords`, to const base `base`
 * of state block `block`. `size` must be a multiple of 4 since NUM_UNIT is
 * in vec4 units.
 */
static void
tu6_emit_const(struct tu_cs *cs, uint32_t opcode, uint32_t base,
               enum a6xx_state_block block, uint32_t offset,
               uint32_t size, const uint32_t *dwords) {
   assert(size % 4 == 0);

   tu_cs_emit_pkt7(cs, opcode, 3 + size);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
              CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
              CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
              CP_LOAD_STATE6_0_STATE_BLOCK(block) |
              CP_LOAD_STATE6_0_NUM_UNIT(size / 4));

   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
   /* offset is in bytes, so step the pointer through a byte view */
   dwords = (uint32_t *)&((uint8_t *)dwords)[offset];

   tu_cs_emit_array(cs, dwords, size);
}

/* Upload the producer stage's output-location map into the consumer's
 * primitive-map const area, clamped to the consumer's constlen.
 */
static void
tu6_emit_link_map(struct tu_cs *cs,
                  const struct ir3_shader_variant *producer,
                  const struct ir3_shader_variant *consumer,
                  enum a6xx_state_block sb)
{
   const struct ir3_const_state *const_state = ir3_const_state(consumer);
   uint32_t base = const_state->offsets.primitive_map;
   int size = DIV_ROUND_UP(consumer->input_size, 4);

   /* clamp to what fits below the consumer's constlen; may leave nothing */
   size = (MIN2(size + base, consumer->constlen) - base) * 4;
   if (size <= 0)
      return;

   tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, base, sb, 0, size,
                  producer->output_loc);
}

/* Map the GL primitive enum used by NIR's GS info to the a6xx tess/GS
 * output-primitive encoding.
 */
static uint16_t
gl_primitive_to_tess(uint16_t primitive) {
   switch (primitive) {
   case GL_POINTS:
      return TESS_POINTS;
   case GL_LINE_STRIP:
      return TESS_LINES;
   case GL_TRIANGLE_STRIP:
      return TESS_CW_TRIS;
   default:
      unreachable("");
   }
}

/* Link the geometry-stage outputs to the FS inputs and emit all VPC/PC/
 * GRAS/SP state that depends on the linkage: varying maps, position/psize/
 * layer/view locations, clip/cull masks, streamout, and the extra tess and
 * GS state when those stages are present.
 */
void
tu6_emit_vpc(struct tu_cs *cs,
             const struct ir3_shader_variant *vs,
             const struct ir3_shader_variant *hs,
             const struct ir3_shader_variant *ds,
             const struct ir3_shader_variant *gs,
             const struct ir3_shader_variant *fs,
             uint32_t patch_control_points)
{
   /* note: doesn't compile as static because of the array regs.. */
   const struct reg_config {
      uint16_t reg_sp_xs_out_reg;
      uint16_t reg_sp_xs_vpc_dst_reg;
      uint16_t reg_vpc_xs_pack;
      uint16_t reg_vpc_xs_clip_cntl;
      uint16_t reg_gras_xs_cl_cntl;
      uint16_t reg_pc_xs_out_cntl;
      uint16_t reg_sp_xs_primitive_cntl;
      uint16_t reg_vpc_xs_layer_cntl;
      uint16_t reg_gras_xs_layer_cntl;
   } reg_config[] = {
      [MESA_SHADER_VERTEX] = {
         REG_A6XX_SP_VS_OUT_REG(0),
         REG_A6XX_SP_VS_VPC_DST_REG(0),
         REG_A6XX_VPC_VS_PACK,
         REG_A6XX_VPC_VS_CLIP_CNTL,
         REG_A6XX_GRAS_VS_CL_CNTL,
         REG_A6XX_PC_VS_OUT_CNTL,
         REG_A6XX_SP_VS_PRIMITIVE_CNTL,
         REG_A6XX_VPC_VS_LAYER_CNTL,
         REG_A6XX_GRAS_VS_LAYER_CNTL
      },
      [MESA_SHADER_TESS_CTRL] = {
         0,
         0,
         0,
         0,
         0,
         REG_A6XX_PC_HS_OUT_CNTL,
         0,
         0,
         0
      },
      [MESA_SHADER_TESS_EVAL] = {
         REG_A6XX_SP_DS_OUT_REG(0),
         REG_A6XX_SP_DS_VPC_DST_REG(0),
         REG_A6XX_VPC_DS_PACK,
         REG_A6XX_VPC_DS_CLIP_CNTL,
         REG_A6XX_GRAS_DS_CL_CNTL,
         REG_A6XX_PC_DS_OUT_CNTL,
         REG_A6XX_SP_DS_PRIMITIVE_CNTL,
         REG_A6XX_VPC_DS_LAYER_CNTL,
         REG_A6XX_GRAS_DS_LAYER_CNTL
      },
      [MESA_SHADER_GEOMETRY] = {
         REG_A6XX_SP_GS_OUT_REG(0),
         REG_A6XX_SP_GS_VPC_DST_REG(0),
         REG_A6XX_VPC_GS_PACK,
         REG_A6XX_VPC_GS_CLIP_CNTL,
         REG_A6XX_GRAS_GS_CL_CNTL,
         REG_A6XX_PC_GS_OUT_CNTL,
         REG_A6XX_SP_GS_PRIMITIVE_CNTL,
         REG_A6XX_VPC_GS_LAYER_CNTL,
         REG_A6XX_GRAS_GS_LAYER_CNTL
      },
   };

   /* the last geometry stage actually feeding the rasterizer */
   const struct ir3_shader_variant *last_shader;
   if (gs) {
      last_shader = gs;
   } else if (hs) {
      last_shader = ds;
   } else {
      last_shader = vs;
   }

   const struct reg_config *cfg = &reg_config[last_shader->type];

   struct ir3_shader_linkage linkage = {
      .primid_loc = 0xff,
      .clip0_loc = 0xff,
      .clip1_loc = 0xff,
   };
   if (fs)
      ir3_link_shaders(&linkage, last_shader, fs, true);

   if (last_shader->shader->stream_output.num_outputs)
      ir3_link_stream_out(&linkage, last_shader);

   /* We do this after linking shaders in order to know whether PrimID
    * passthrough needs to be enabled.
    */
   bool primid_passthru = linkage.primid_loc != 0xff;
   tu6_emit_vs_system_values(cs, vs, hs, ds, gs, primid_passthru);

   /* disable every VPC slot not written by the linkage */
   tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VAR_DISABLE(0), 4);
   tu_cs_emit(cs, ~linkage.varmask[0]);
   tu_cs_emit(cs, ~linkage.varmask[1]);
   tu_cs_emit(cs, ~linkage.varmask[2]);
   tu_cs_emit(cs, ~linkage.varmask[3]);

   /* a6xx finds position/pointsize at the end */
   const uint32_t pointsize_regid =
      ir3_find_output_regid(last_shader, VARYING_SLOT_PSIZ);
   const uint32_t layer_regid =
      ir3_find_output_regid(last_shader, VARYING_SLOT_LAYER);
   const uint32_t view_regid =
      ir3_find_output_regid(last_shader, VARYING_SLOT_VIEWPORT);
   const uint32_t clip0_regid =
      ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST0);
   const uint32_t clip1_regid =
      ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST1);
   uint32_t flags_regid = gs ?
      ir3_find_output_regid(gs, VARYING_SLOT_GS_VERTEX_FLAGS_IR3) : 0;

   uint32_t pointsize_loc = 0xff, position_loc = 0xff, layer_loc = 0xff, view_loc = 0xff;

   if (layer_regid != regid(63, 0)) {
      layer_loc = linkage.max_loc;
      ir3_link_add(&linkage, layer_regid, 0x1, linkage.max_loc);
   }

   if (view_regid != regid(63, 0)) {
      view_loc = linkage.max_loc;
      ir3_link_add(&linkage, view_regid, 0x1, linkage.max_loc);
   }

   unsigned extra_pos = 0;

   /* multi-position (per-view) outputs land at consecutive vec4 slots */
   for (unsigned i = 0; i < last_shader->outputs_count; i++) {
      if (last_shader->outputs[i].slot != VARYING_SLOT_POS)
         continue;

      if (position_loc == 0xff)
         position_loc = linkage.max_loc;

      ir3_link_add(&linkage, last_shader->outputs[i].regid,
                   0xf, position_loc + 4 * last_shader->outputs[i].view);
      extra_pos = MAX2(extra_pos, last_shader->outputs[i].view);
   }

   if (pointsize_regid != regid(63, 0)) {
      pointsize_loc = linkage.max_loc;
      ir3_link_add(&linkage, pointsize_regid, 0x1, linkage.max_loc);
   }

   uint8_t clip_cull_mask = last_shader->clip_mask | last_shader->cull_mask;

   /* Handle the case where clip/cull distances aren't read by the FS */
   uint32_t clip0_loc = linkage.clip0_loc, clip1_loc = linkage.clip1_loc;
   if (clip0_loc == 0xff && clip0_regid != regid(63, 0)) {
      clip0_loc = linkage.max_loc;
      ir3_link_add(&linkage, clip0_regid, clip_cull_mask & 0xf, linkage.max_loc);
   }
   if (clip1_loc == 0xff && clip1_regid != regid(63, 0)) {
      clip1_loc = linkage.max_loc;
      ir3_link_add(&linkage, clip1_regid, clip_cull_mask >> 4, linkage.max_loc);
   }

   tu6_setup_streamout(cs, last_shader, &linkage);

   /* The GPU hangs on some models when there are no outputs (xs_pack::CNT),
    * at least when a DS is the last stage, so add a dummy output to keep it
    * happy if there aren't any. We do this late in order to avoid emitting
    * any unused code and make sure that optimizations don't remove it.
    */
   if (linkage.cnt == 0)
      ir3_link_add(&linkage, 0, 0x1, linkage.max_loc);

   /* map outputs of the last shader to VPC */
   assert(linkage.cnt <= 32);
   const uint32_t sp_out_count = DIV_ROUND_UP(linkage.cnt, 2);
   const uint32_t sp_vpc_dst_count = DIV_ROUND_UP(linkage.cnt, 4);
   uint32_t sp_out[16] = {0};
   uint32_t sp_vpc_dst[8] = {0};
   /* OUT_REG packs two 16-bit entries per dword, VPC_DST four 8-bit locs */
   for (uint32_t i = 0; i < linkage.cnt; i++) {
      ((uint16_t *) sp_out)[i] =
         A6XX_SP_VS_OUT_REG_A_REGID(linkage.var[i].regid) |
         A6XX_SP_VS_OUT_REG_A_COMPMASK(linkage.var[i].compmask);
      ((uint8_t *) sp_vpc_dst)[i] =
         A6XX_SP_VS_VPC_DST_REG_OUTLOC0(linkage.var[i].loc);
   }

   tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_out_reg, sp_out_count);
   tu_cs_emit_array(cs, sp_out, sp_out_count);

   tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_vpc_dst_reg, sp_vpc_dst_count);
   tu_cs_emit_array(cs, sp_vpc_dst, sp_vpc_dst_count);

   tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_pack, 1);
   tu_cs_emit(cs, A6XX_VPC_VS_PACK_POSITIONLOC(position_loc) |
                  A6XX_VPC_VS_PACK_PSIZELOC(pointsize_loc) |
                  A6XX_VPC_VS_PACK_STRIDE_IN_VPC(linkage.max_loc) |
                  A6XX_VPC_VS_PACK_EXTRAPOS(extra_pos));

   tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_clip_cntl, 1);
   tu_cs_emit(cs, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
                  A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
                  A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));

   tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_cl_cntl, 1);
   tu_cs_emit(cs, A6XX_GRAS_VS_CL_CNTL_CLIP_MASK(last_shader->clip_mask) |
                  A6XX_GRAS_VS_CL_CNTL_CULL_MASK(last_shader->cull_mask));

   const struct ir3_shader_variant *geom_shaders[] = { vs, hs, ds, gs };

   /* every present geometry stage gets its PC_xS_OUT_CNTL programmed; only
    * the last stage carries the full stride/psize/layer/view/clip info
    */
   for (unsigned i = 0; i < ARRAY_SIZE(geom_shaders); i++) {
      const struct ir3_shader_variant *shader = geom_shaders[i];
      if (!shader)
         continue;

      bool primid = shader->type != MESA_SHADER_VERTEX &&
         VALIDREG(ir3_find_sysval_regid(shader, SYSTEM_VALUE_PRIMITIVE_ID));

      tu_cs_emit_pkt4(cs, reg_config[shader->type].reg_pc_xs_out_cntl, 1);
      if (shader == last_shader) {
         tu_cs_emit(cs, A6XX_PC_VS_OUT_CNTL_STRIDE_IN_VPC(linkage.max_loc) |
                        CONDREG(pointsize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE) |
                        CONDREG(layer_regid, A6XX_PC_VS_OUT_CNTL_LAYER) |
                        CONDREG(view_regid, A6XX_PC_VS_OUT_CNTL_VIEW) |
                        COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID) |
                        A6XX_PC_VS_OUT_CNTL_CLIP_MASK(clip_cull_mask));
      } else {
         tu_cs_emit(cs, COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID));
      }
   }

   tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_primitive_cntl, 1);
   tu_cs_emit(cs, A6XX_SP_VS_PRIMITIVE_CNTL_OUT(linkage.cnt) |
                  A6XX_SP_GS_PRIMITIVE_CNTL_FLAGS_REGID(flags_regid));

   tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_layer_cntl, 1);
   tu_cs_emit(cs, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) |
                  A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(view_loc));

   tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_layer_cntl, 1);
   tu_cs_emit(cs, CONDREG(layer_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_LAYER) |
                  CONDREG(view_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_VIEW));

   tu_cs_emit_regs(cs, A6XX_PC_PRIMID_PASSTHRU(primid_passthru));

   tu_cs_emit_pkt4(cs, REG_A6XX_VPC_CNTL_0, 1);
   tu_cs_emit(cs, A6XX_VPC_CNTL_0_NUMNONPOSVAR(fs ? fs->total_in : 0) |
                  COND(fs && fs->total_in, A6XX_VPC_CNTL_0_VARYING) |
                  A6XX_VPC_CNTL_0_PRIMIDLOC(linkage.primid_loc) |
                  A6XX_VPC_CNTL_0_VIEWIDLOC(linkage.viewid_loc));

   if (hs) {
      shader_info *hs_info = &hs->shader->nir->info;

      tu_cs_emit_pkt4(cs, REG_A6XX_PC_TESS_NUM_VERTEX, 1);
      tu_cs_emit(cs, hs_info->tess.tcs_vertices_out);

      /* Total attribute slots in HS incoming patch. */
      tu_cs_emit_pkt4(cs, REG_A6XX_PC_HS_INPUT_SIZE, 1);
      tu_cs_emit(cs, patch_control_points * vs->output_size / 4);

      const uint32_t wavesize = 64;
      const uint32_t max_wave_input_size = 64;

      /* note: if HS is really just the VS extended, then this
       * should be by MAX2(patch_control_points, hs_info->tess.tcs_vertices_out)
       * however that doesn't match the blob, and fails some dEQP tests.
       */
      uint32_t prims_per_wave = wavesize / hs_info->tess.tcs_vertices_out;
      uint32_t max_prims_per_wave =
         max_wave_input_size * wavesize / (vs->output_size * patch_control_points);
      prims_per_wave = MIN2(prims_per_wave, max_prims_per_wave);

      uint32_t total_size = vs->output_size * patch_control_points * prims_per_wave;
      uint32_t wave_input_size = DIV_ROUND_UP(total_size, wavesize);

      tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
      tu_cs_emit(cs, wave_input_size);

      /* In SPIR-V generated from GLSL, the tessellation primitive params are
       * are specified in the tess eval shader, but in SPIR-V generated from
       * HLSL, they are specified in the tess control shader. */
      shader_info *tess_info =
         ds->shader->nir->info.tess.spacing == TESS_SPACING_UNSPECIFIED ?
         &hs->shader->nir->info : &ds->shader->nir->info;
      tu_cs_emit_pkt4(cs, REG_A6XX_PC_TESS_CNTL, 1);
      uint32_t output;
      if (tess_info->tess.point_mode)
         output = TESS_POINTS;
      else if (tess_info->tess.primitive_mode == GL_ISOLINES)
         output = TESS_LINES;
      else if (tess_info->tess.ccw)
         output = TESS_CCW_TRIS;
      else
         output = TESS_CW_TRIS;

      enum a6xx_tess_spacing spacing;
      switch (tess_info->tess.spacing) {
      case TESS_SPACING_EQUAL:
         spacing = TESS_EQUAL;
         break;
      case TESS_SPACING_FRACTIONAL_ODD:
         spacing = TESS_FRACTIONAL_ODD;
         break;
      case TESS_SPACING_FRACTIONAL_EVEN:
         spacing = TESS_FRACTIONAL_EVEN;
         break;
      case TESS_SPACING_UNSPECIFIED:
      default:
         unreachable("invalid tess spacing");
      }
      tu_cs_emit(cs, A6XX_PC_TESS_CNTL_SPACING(spacing) |
                     A6XX_PC_TESS_CNTL_OUTPUT(output));

      tu6_emit_link_map(cs, vs, hs, SB6_HS_SHADER);
      tu6_emit_link_map(cs, hs, ds, SB6_DS_SHADER);
   }


   if (gs) {
      uint32_t vertices_out, invocations, output, vec4_size;
      uint32_t prev_stage_output_size = ds ? ds->output_size : vs->output_size;

      /* this detects the tu_clear_blit path, which doesn't set ->nir */
      if (gs->shader->nir) {
         if (hs) {
            tu6_emit_link_map(cs, ds, gs, SB6_GS_SHADER);
         } else {
            tu6_emit_link_map(cs, vs, gs, SB6_GS_SHADER);
         }
         vertices_out = gs->shader->nir->info.gs.vertices_out - 1;
         output = gl_primitive_to_tess(gs->shader->nir->info.gs.output_primitive);
         invocations = gs->shader->nir->info.gs.invocations - 1;
         /* Size of per-primitive alloction in ldlw memory in vec4s. */
         vec4_size = gs->shader->nir->info.gs.vertices_in *
                     DIV_ROUND_UP(prev_stage_output_size, 4);
      } else {
         vertices_out = 3;
         output = TESS_CW_TRIS;
         invocations = 0;
         vec4_size = 0;
      }

      tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1);
      tu_cs_emit(cs,
                 A6XX_PC_PRIMITIVE_CNTL_5_GS_VERTICES_OUT(vertices_out) |
                 A6XX_PC_PRIMITIVE_CNTL_5_GS_OUTPUT(output) |
                 A6XX_PC_PRIMITIVE_CNTL_5_GS_INVOCATIONS(invocations));

      tu_cs_emit_pkt4(cs, REG_A6XX_VPC_GS_PARAM, 1);
      tu_cs_emit(cs, 0xff);

      tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1);
      tu_cs_emit(cs, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size));

      /* clamp prim size; 64 itself must be encoded as 63 -- NOTE(review):
       * encoding quirk inherited from the blob, confirm against HW docs
       */
      uint32_t prim_size = prev_stage_output_size;
      if (prim_size > 64)
         prim_size = 64;
      else if (prim_size == 64)
         prim_size = 63;
      tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_PRIM_SIZE, 1);
      tu_cs_emit(cs, prim_size);
   }
}

/* Compute the 2-bit-per-component interpolation and point-sprite
 * replacement modes for FS input `index`; returns the number of mode bits
 * produced (2 per live component, since varyings are packed).
 */
static int
tu6_vpc_varying_mode(const struct ir3_shader_variant *fs,
                     uint32_t index,
                     uint8_t *interp_mode,
                     uint8_t *ps_repl_mode)
{
   enum
   {
      INTERP_SMOOTH = 0,
      INTERP_FLAT = 1,
      INTERP_ZERO = 2,
      INTERP_ONE = 3,
   };
   enum
   {
      PS_REPL_NONE = 0,
      PS_REPL_S = 1,
      PS_REPL_T = 2,
      PS_REPL_ONE_MINUS_T = 3,
   };

   const uint32_t compmask = fs->inputs[index].compmask;

   /* NOTE: varyings are packed, so if compmask is 0xb then first, second, and
    * fourth component occupy three consecutive varying slots
    */
   int shift = 0;
   *interp_mode = 0;
   *ps_repl_mode = 0;
   if (fs->inputs[index].slot == VARYING_SLOT_PNTC) {
      /* point coord: replace .xy with sprite S/T, force .z = 0, .w = 1 */
      if (compmask & 0x1) {
         *ps_repl_mode |= PS_REPL_S << shift;
         shift += 2;
      }
      if (compmask & 0x2) {
         *ps_repl_mode |= PS_REPL_T << shift;
         shift += 2;
      }
      if (compmask & 0x4) {
         *interp_mode |= INTERP_ZERO << shift;
         shift += 2;
      }
      if (compmask & 0x8) {
         /* NOTE(review): uses a fixed shift of 6 rather than `shift`
          * here -- presumably intentional for the .w slot, confirm
          * against upstream freedreno
          */
         *interp_mode |= INTERP_ONE << 6;
         shift += 2;
      }
   } else if (fs->inputs[index].flat) {
      for (int i = 0; i < 4; i++) {
         if (compmask & (1 << i)) {
            *interp_mode |= INTERP_FLAT << shift;
            shift += 2;
         }
      }
   }

   return shift;
}

/* Gather each FS input's interp/point-sprite-replacement mode (via
 * tu6_vpc_varying_mode) into the packed 2-bit-per-component
 * VPC_VARYING_INTERP_MODE / VPC_VARYING_PS_REPL_MODE register arrays and
 * emit them. With no FS, both arrays are emitted as all zeros.
 */
static void
tu6_emit_vpc_varying_modes(struct tu_cs *cs,
                           const struct ir3_shader_variant *fs)
{
   uint32_t interp_modes[8] = { 0 };
   uint32_t ps_repl_modes[8] = { 0 };

   if (fs) {
      for (int i = -1;
           (i = ir3_next_varying(fs, i)) < (int) fs->inputs_count;) {

         /* get the mode for input i */
         uint8_t interp_mode;
         uint8_t ps_repl_mode;
         const int bits =
            tu6_vpc_varying_mode(fs, i, &interp_mode, &ps_repl_mode);

         /* OR the mode into the array */
         const uint32_t inloc = fs->inputs[i].inloc * 2;
         uint32_t n = inloc / 32;
         uint32_t shift = inloc % 32;
         interp_modes[n] |= interp_mode << shift;
         ps_repl_modes[n] |= ps_repl_mode << shift;
         /* mode bits that straddle a dword boundary spill into the next */
         if (shift + bits > 32) {
            n++;
            shift = 32 - shift;

            interp_modes[n] |= interp_mode >> shift;
            ps_repl_modes[n] |= ps_repl_mode >> shift;
         }
      }
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8);
   tu_cs_emit_array(cs, interp_modes, 8);

   tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8);
   tu_cs_emit_array(cs, ps_repl_modes, 8);
}

/* Emit the FS input (HLSQ/GRAS/RB) state: sysval regids, barycentric
 * interpolation enables, and the texture prefetch commands.
 */
void
tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs)
{
   uint32_t face_regid, coord_regid,
zwcoord_regid, samp_id_regid; 13537ec681f3Smrg uint32_t ij_regid[IJ_COUNT]; 13547ec681f3Smrg uint32_t smask_in_regid; 13557ec681f3Smrg 13567ec681f3Smrg bool sample_shading = fs->per_samp | fs->key.sample_shading; 13577ec681f3Smrg bool enable_varyings = fs->total_in > 0; 13587ec681f3Smrg 13597ec681f3Smrg samp_id_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID); 13607ec681f3Smrg smask_in_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN); 13617ec681f3Smrg face_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE); 13627ec681f3Smrg coord_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD); 13637ec681f3Smrg zwcoord_regid = VALIDREG(coord_regid) ? coord_regid + 2 : regid(63, 0); 13647ec681f3Smrg for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++) 13657ec681f3Smrg ij_regid[i] = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i); 13667ec681f3Smrg 13677ec681f3Smrg if (fs->num_sampler_prefetch > 0) { 13687ec681f3Smrg assert(VALIDREG(ij_regid[IJ_PERSP_PIXEL])); 13697ec681f3Smrg /* also, it seems like ij_pix is *required* to be r0.x */ 13707ec681f3Smrg assert(ij_regid[IJ_PERSP_PIXEL] == regid(0, 0)); 13717ec681f3Smrg } 1372361fc4cbSmaya 13737ec681f3Smrg tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch); 13747ec681f3Smrg tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) | 13757ec681f3Smrg A6XX_SP_FS_PREFETCH_CNTL_UNK4(regid(63, 0)) | 13767ec681f3Smrg 0x7000); // XXX); 13777ec681f3Smrg for (int i = 0; i < fs->num_sampler_prefetch; i++) { 13787ec681f3Smrg const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i]; 13797ec681f3Smrg tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CMD_SRC(prefetch->src) | 13807ec681f3Smrg A6XX_SP_FS_PREFETCH_CMD_SAMP_ID(prefetch->samp_id) | 13817ec681f3Smrg A6XX_SP_FS_PREFETCH_CMD_TEX_ID(prefetch->tex_id) | 13827ec681f3Smrg A6XX_SP_FS_PREFETCH_CMD_DST(prefetch->dst) | 13837ec681f3Smrg A6XX_SP_FS_PREFETCH_CMD_WRMASK(prefetch->wrmask) | 
13847ec681f3Smrg COND(prefetch->half_precision, A6XX_SP_FS_PREFETCH_CMD_HALF) | 13857ec681f3Smrg A6XX_SP_FS_PREFETCH_CMD_CMD(prefetch->cmd)); 13867ec681f3Smrg } 1387361fc4cbSmaya 13887ec681f3Smrg if (fs->num_sampler_prefetch > 0) { 13897ec681f3Smrg tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_BINDLESS_PREFETCH_CMD(0), fs->num_sampler_prefetch); 13907ec681f3Smrg for (int i = 0; i < fs->num_sampler_prefetch; i++) { 13917ec681f3Smrg const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i]; 13927ec681f3Smrg tu_cs_emit(cs, 13937ec681f3Smrg A6XX_SP_FS_BINDLESS_PREFETCH_CMD_SAMP_ID(prefetch->samp_bindless_id) | 13947ec681f3Smrg A6XX_SP_FS_BINDLESS_PREFETCH_CMD_TEX_ID(prefetch->tex_bindless_id)); 13957ec681f3Smrg } 13967ec681f3Smrg } 1397361fc4cbSmaya 13987ec681f3Smrg tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CONTROL_1_REG, 5); 13997ec681f3Smrg tu_cs_emit(cs, 0x7); 14007ec681f3Smrg tu_cs_emit(cs, A6XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) | 14017ec681f3Smrg A6XX_HLSQ_CONTROL_2_REG_SAMPLEID(samp_id_regid) | 14027ec681f3Smrg A6XX_HLSQ_CONTROL_2_REG_SAMPLEMASK(smask_in_regid) | 14037ec681f3Smrg A6XX_HLSQ_CONTROL_2_REG_SIZE(ij_regid[IJ_PERSP_SIZE])); 14047ec681f3Smrg tu_cs_emit(cs, A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_PIXEL(ij_regid[IJ_PERSP_PIXEL]) | 14057ec681f3Smrg A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_PIXEL(ij_regid[IJ_LINEAR_PIXEL]) | 14067ec681f3Smrg A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_CENTROID(ij_regid[IJ_PERSP_CENTROID]) | 14077ec681f3Smrg A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_CENTROID(ij_regid[IJ_LINEAR_CENTROID])); 14087ec681f3Smrg tu_cs_emit(cs, A6XX_HLSQ_CONTROL_4_REG_XYCOORDREGID(coord_regid) | 14097ec681f3Smrg A6XX_HLSQ_CONTROL_4_REG_ZWCOORDREGID(zwcoord_regid) | 14107ec681f3Smrg A6XX_HLSQ_CONTROL_4_REG_IJ_PERSP_SAMPLE(ij_regid[IJ_PERSP_SAMPLE]) | 14117ec681f3Smrg A6XX_HLSQ_CONTROL_4_REG_IJ_LINEAR_SAMPLE(ij_regid[IJ_LINEAR_SAMPLE])); 14127ec681f3Smrg tu_cs_emit(cs, 0xfcfc); 14137ec681f3Smrg 14147ec681f3Smrg enum a6xx_threadsize thrsz = fs->info.double_threadsize ? 
THREAD128 : THREAD64; 14157ec681f3Smrg tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL_0, 1); 14167ec681f3Smrg tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(thrsz) | 14177ec681f3Smrg COND(enable_varyings, A6XX_HLSQ_FS_CNTL_0_VARYINGS)); 14187ec681f3Smrg 14197ec681f3Smrg bool need_size = fs->frag_face || fs->fragcoord_compmask != 0; 14207ec681f3Smrg bool need_size_persamp = false; 14217ec681f3Smrg if (VALIDREG(ij_regid[IJ_PERSP_SIZE])) { 14227ec681f3Smrg if (sample_shading) 14237ec681f3Smrg need_size_persamp = true; 14247ec681f3Smrg else 14257ec681f3Smrg need_size = true; 1426361fc4cbSmaya } 1427361fc4cbSmaya 1428361fc4cbSmaya tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CNTL, 1); 14297ec681f3Smrg tu_cs_emit(cs, 14307ec681f3Smrg CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_GRAS_CNTL_IJ_PERSP_PIXEL) | 14317ec681f3Smrg CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_GRAS_CNTL_IJ_PERSP_CENTROID) | 14327ec681f3Smrg CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_GRAS_CNTL_IJ_PERSP_SAMPLE) | 14337ec681f3Smrg CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) | 14347ec681f3Smrg CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_GRAS_CNTL_IJ_LINEAR_CENTROID) | 14357ec681f3Smrg CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) | 14367ec681f3Smrg COND(need_size, A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) | 14377ec681f3Smrg COND(need_size_persamp, A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) | 14387ec681f3Smrg COND(fs->fragcoord_compmask != 0, A6XX_GRAS_CNTL_COORD_MASK(fs->fragcoord_compmask))); 1439361fc4cbSmaya 1440361fc4cbSmaya tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_CONTROL0, 2); 14417ec681f3Smrg tu_cs_emit(cs, 14427ec681f3Smrg CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) | 14437ec681f3Smrg CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) | 14447ec681f3Smrg CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) | 14457ec681f3Smrg CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) | 14467ec681f3Smrg 
CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_CENTROID) | 14477ec681f3Smrg CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) | 14487ec681f3Smrg COND(need_size, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) | 14497ec681f3Smrg COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) | 14507ec681f3Smrg COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) | 14517ec681f3Smrg COND(fs->fragcoord_compmask != 0, 14527ec681f3Smrg A6XX_RB_RENDER_CONTROL0_COORD_MASK(fs->fragcoord_compmask))); 14537ec681f3Smrg tu_cs_emit(cs, 14547ec681f3Smrg A6XX_RB_RENDER_CONTROL1_FRAGCOORDSAMPLEMODE( 14557ec681f3Smrg sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER) | 14567ec681f3Smrg CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) | 14577ec681f3Smrg CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) | 14587ec681f3Smrg CONDREG(ij_regid[IJ_PERSP_SIZE], A6XX_RB_RENDER_CONTROL1_SIZE) | 14597ec681f3Smrg COND(fs->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS)); 14607ec681f3Smrg 14617ec681f3Smrg tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CNTL, 1); 14627ec681f3Smrg tu_cs_emit(cs, COND(sample_shading, A6XX_RB_SAMPLE_CNTL_PER_SAMP_MODE)); 14637ec681f3Smrg 14647ec681f3Smrg tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 1); 14657ec681f3Smrg tu_cs_emit(cs, CONDREG(samp_id_regid, A6XX_GRAS_LRZ_PS_INPUT_CNTL_SAMPLEID) | 14667ec681f3Smrg A6XX_GRAS_LRZ_PS_INPUT_CNTL_FRAGCOORDSAMPLEMODE( 14677ec681f3Smrg sample_shading ? 
FRAGCOORD_SAMPLE : FRAGCOORD_CENTER)); 14687ec681f3Smrg 14697ec681f3Smrg tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CNTL, 1); 14707ec681f3Smrg tu_cs_emit(cs, COND(sample_shading, A6XX_GRAS_SAMPLE_CNTL_PER_SAMP_MODE)); 14717ec681f3Smrg} 1472361fc4cbSmaya 1473361fc4cbSmayastatic void 1474361fc4cbSmayatu6_emit_fs_outputs(struct tu_cs *cs, 1475361fc4cbSmaya const struct ir3_shader_variant *fs, 14767ec681f3Smrg uint32_t mrt_count, bool dual_src_blend, 14777ec681f3Smrg uint32_t render_components, 14787ec681f3Smrg bool no_earlyz, 14797ec681f3Smrg struct tu_pipeline *pipeline) 1480361fc4cbSmaya{ 14817ec681f3Smrg uint32_t smask_regid, posz_regid, stencilref_regid; 14827ec681f3Smrg 14837ec681f3Smrg posz_regid = ir3_find_output_regid(fs, FRAG_RESULT_DEPTH); 14847ec681f3Smrg smask_regid = ir3_find_output_regid(fs, FRAG_RESULT_SAMPLE_MASK); 14857ec681f3Smrg stencilref_regid = ir3_find_output_regid(fs, FRAG_RESULT_STENCIL); 14867ec681f3Smrg 1487361fc4cbSmaya uint32_t fragdata_regid[8]; 1488361fc4cbSmaya if (fs->color0_mrt) { 1489361fc4cbSmaya fragdata_regid[0] = ir3_find_output_regid(fs, FRAG_RESULT_COLOR); 1490361fc4cbSmaya for (uint32_t i = 1; i < ARRAY_SIZE(fragdata_regid); i++) 1491361fc4cbSmaya fragdata_regid[i] = fragdata_regid[0]; 1492361fc4cbSmaya } else { 1493361fc4cbSmaya for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++) 1494361fc4cbSmaya fragdata_regid[i] = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + i); 1495361fc4cbSmaya } 1496361fc4cbSmaya 1497361fc4cbSmaya tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2); 14987ec681f3Smrg tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) | 14997ec681f3Smrg A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) | 15007ec681f3Smrg A6XX_SP_FS_OUTPUT_CNTL0_STENCILREF_REGID(stencilref_regid) | 15017ec681f3Smrg COND(dual_src_blend, A6XX_SP_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE)); 1502361fc4cbSmaya tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count)); 1503361fc4cbSmaya 15047ec681f3Smrg uint32_t fs_render_components = 
      0;

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 8);
   for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++) {
      tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(fragdata_regid[i]) |
                     (COND(fragdata_regid[i] & HALF_REG_ID,
                           A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION)));

      /* enable all four components of every MRT the shader writes */
      if (VALIDREG(fragdata_regid[i])) {
         fs_render_components |= 0xf << (i * 4);
      }
   }

   /* dual source blending has an extra fs output in the 2nd slot */
   if (dual_src_blend) {
      fs_render_components |= 0xf << 4;
   }

   /* There is no point in having component enabled which is not written
    * by the shader. Per VK spec it is an UB, however a few apps depend on
    * attachment not being changed if FS doesn't have corresponding output.
    */
   fs_render_components &= render_components;

   tu_cs_emit_regs(cs,
                   A6XX_SP_FS_RENDER_COMPONENTS(.dword = fs_render_components));

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 2);
   tu_cs_emit(cs, COND(fs->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) |
                  COND(fs->writes_smask, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_SAMPMASK) |
                  COND(fs->writes_stencilref, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_STENCILREF) |
                  COND(dual_src_blend, A6XX_RB_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
   tu_cs_emit(cs, A6XX_RB_FS_OUTPUT_CNTL1_MRT(mrt_count));

   tu_cs_emit_regs(cs,
                   A6XX_RB_RENDER_COMPONENTS(.dword = fs_render_components));

   if (pipeline) {
      pipeline->lrz.fs_has_kill = fs->has_kill;
      pipeline->lrz.early_fragment_tests = fs->shader->nir->info.fs.early_fragment_tests;

      /* Anything that may discard fragments or change depth/stencil forces
       * late-Z unless the shader explicitly requested early fragment
       * tests. */
      if ((fs->shader && !fs->shader->nir->info.fs.early_fragment_tests) &&
          (fs->no_earlyz || fs->has_kill || fs->writes_pos || fs->writes_stencilref || no_earlyz || fs->writes_smask)) {
         pipeline->lrz.force_late_z = true;
      }
   }
}

/* Upload the driver-param constants (primitive/vertex strides, vertices per
 * primitive or patch) consumed by the VS/HS/DS/GS stages.  Caller only
 * invokes this when hs or gs is non-NULL. */
static void
tu6_emit_geom_tess_consts(struct tu_cs *cs,
                          const struct ir3_shader_variant *vs,
                          const struct ir3_shader_variant *hs,
                          const struct ir3_shader_variant *ds,
                          const struct ir3_shader_variant *gs,
                          uint32_t cps_per_patch)
{
   uint32_t num_vertices =
      hs ? cps_per_patch : gs->shader->nir->info.gs.vertices_in;

   uint32_t vs_params[4] = {
      vs->output_size * num_vertices * 4,  /* vs primitive stride */
      vs->output_size * 4,                 /* vs vertex stride */
      0,
      0,
   };
   uint32_t vs_base = ir3_const_state(vs)->offsets.primitive_param;
   tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, vs_base, SB6_VS_SHADER, 0,
                  ARRAY_SIZE(vs_params), vs_params);

   if (hs) {
      assert(ds->type != MESA_SHADER_NONE);
      uint32_t hs_params[4] = {
         vs->output_size * num_vertices * 4,  /* hs primitive stride */
         vs->output_size * 4,                 /* hs vertex stride */
         hs->output_size,
         cps_per_patch,
      };

      uint32_t hs_base = hs->const_state->offsets.primitive_param;
      tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, hs_base, SB6_HS_SHADER, 0,
                     ARRAY_SIZE(hs_params), hs_params);
      /* with tess + GS, the GS consumes tessellated primitives instead of
       * the patch control points */
      if (gs)
         num_vertices = gs->shader->nir->info.gs.vertices_in;

      uint32_t ds_params[4] = {
         ds->output_size * num_vertices * 4,  /* ds primitive stride */
         ds->output_size * 4,                 /* ds vertex stride */
         hs->output_size,                     /* hs vertex stride (dwords) */
         hs->shader->nir->info.tess.tcs_vertices_out
      };

      uint32_t ds_base = ds->const_state->offsets.primitive_param;
      tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, ds_base, SB6_DS_SHADER, 0,
                     ARRAY_SIZE(ds_params), ds_params);
   }

   if (gs) {
      /* the GS reads the output of the stage directly before it */
      const struct ir3_shader_variant *prev = ds ? ds : vs;
      uint32_t gs_params[4] = {
         prev->output_size * num_vertices * 4,  /* gs primitive stride */
         prev->output_size * 4,                 /* gs vertex stride */
         0,
         0,
      };
      uint32_t gs_base = gs->const_state->offsets.primitive_param;
      tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, gs_base, SB6_GS_SHADER, 0,
                     ARRAY_SIZE(gs_params), gs_params);
   }
}

/* Invalidate cached per-stage shader state, then emit the HLSQ config for
 * every graphics stage via tu6_emit_xs_config(). */
static void
tu6_emit_program_config(struct tu_cs *cs,
                        struct tu_pipeline_builder *builder)
{
   gl_shader_stage stage = MESA_SHADER_VERTEX;

   STATIC_ASSERT(MESA_SHADER_VERTEX == 0);

   tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
         .vs_state = true,
         .hs_state = true,
         .ds_state = true,
         .gs_state = true,
         .fs_state = true,
         .gfx_ibo = true));
   for (; stage < ARRAY_SIZE(builder->shaders); stage++) {
      tu6_emit_xs_config(cs, stage, builder->variants[stage]);
   }
}

/* Emit the full shader-program state for the binning or rendering pass:
 * per-stage shader setup, multiview controls, VPC linkage, FS input/output
 * registers and, when tess/geometry is active, the driver-param consts. */
static void
tu6_emit_program(struct tu_cs *cs,
                 struct tu_pipeline_builder *builder,
                 bool binning_pass,
                 struct tu_pipeline *pipeline)
{
   const struct ir3_shader_variant *vs = builder->variants[MESA_SHADER_VERTEX];
   const struct ir3_shader_variant *bs = builder->binning_variant;
   const struct ir3_shader_variant *hs = builder->variants[MESA_SHADER_TESS_CTRL];
   const struct ir3_shader_variant *ds = builder->variants[MESA_SHADER_TESS_EVAL];
   const struct ir3_shader_variant *gs =
      builder->variants[MESA_SHADER_GEOMETRY];
   const struct ir3_shader_variant *fs = builder->variants[MESA_SHADER_FRAGMENT];
   gl_shader_stage stage = MESA_SHADER_VERTEX;
   uint32_t cps_per_patch = builder->create_info->pTessellationState ?
      builder->create_info->pTessellationState->patchControlPoints : 0;
   bool multi_pos_output = builder->shaders[MESA_SHADER_VERTEX]->multi_pos_output;

   /* Don't use the binning pass variant when GS is present because we don't
    * support compiling correct binning pass variants with GS.
    */
   if (binning_pass && !gs) {
      vs = bs;
      tu6_emit_xs(cs, stage, bs, &builder->pvtmem, builder->binning_vs_iova);
      stage++;
   }

   for (; stage < ARRAY_SIZE(builder->shaders); stage++) {
      const struct ir3_shader_variant *xs = builder->variants[stage];

      /* the binning pass rasterizes nothing, so the FS is dropped */
      if (stage == MESA_SHADER_FRAGMENT && binning_pass)
         fs = xs = NULL;

      tu6_emit_xs(cs, stage, xs, &builder->pvtmem, builder->shader_iova[stage]);
   }

   uint32_t multiview_views = util_logbase2(builder->multiview_mask) + 1;
   uint32_t multiview_cntl = builder->multiview_mask ?
      A6XX_PC_MULTIVIEW_CNTL_ENABLE |
      A6XX_PC_MULTIVIEW_CNTL_VIEWS(multiview_views) |
      COND(!multi_pos_output, A6XX_PC_MULTIVIEW_CNTL_DISABLEMULTIPOS)
      : 0;

   /* Copy what the blob does here. This will emit an extra 0x3f
    * CP_EVENT_WRITE when multiview is disabled. I'm not exactly sure what
    * this is working around yet.
    */
   if (builder->device->physical_device->info->a6xx.has_cp_reg_write) {
      tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
      tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE));
      tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL);
   } else {
      tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_CNTL, 1);
   }
   tu_cs_emit(cs, multiview_cntl);

   tu_cs_emit_pkt4(cs, REG_A6XX_VFD_MULTIVIEW_CNTL, 1);
   tu_cs_emit(cs, multiview_cntl);

   if (multiview_cntl &&
       builder->device->physical_device->info->a6xx.supports_multiview_mask) {
      tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_MASK, 1);
      tu_cs_emit(cs, builder->multiview_mask);
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
   tu_cs_emit(cs, 0);

   tu6_emit_vpc(cs, vs, hs, ds, gs, fs, cps_per_patch);
   tu6_emit_vpc_varying_modes(cs, fs);

   /* S8-only depth attachment cannot use early-Z (depth test is a stencil
    * test here) */
   bool no_earlyz = builder->depth_attachment_format == VK_FORMAT_S8_UINT;
   uint32_t mrt_count = builder->color_attachment_count;
   uint32_t render_components = builder->render_components;

   if (builder->alpha_to_coverage) {
      /* alpha to coverage can behave like a discard */
      no_earlyz = true;
      /* alpha value comes from first mrt */
      render_components |= 0xf;
      if (!mrt_count) {
         mrt_count = 1;
         /* Disable memory write for dummy mrt because it doesn't get set otherwise */
         tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0, .component_enable = 0));
      }
   }

   if (fs) {
      tu6_emit_fs_inputs(cs, fs);
      tu6_emit_fs_outputs(cs, fs, mrt_count,
                          builder->use_dual_src_blend,
                          render_components,
                          no_earlyz,
                          pipeline);
   } else {
      /* TODO: check if these can be skipped if fs is disabled */
      struct ir3_shader_variant dummy_variant = {};
      tu6_emit_fs_inputs(cs, &dummy_variant);
      tu6_emit_fs_outputs(cs, &dummy_variant, mrt_count,
                          builder->use_dual_src_blend,
                          render_components,
                          no_earlyz,
                          NULL);
   }

   if (gs || hs) {
      tu6_emit_geom_tess_consts(cs, vs, hs, ds, gs, cps_per_patch);
   }
}

/* Translate VkPipelineVertexInputStateCreateInfo into VFD_DECODE /
 * VFD_DEST_CNTL register writes, skipping attributes the VS never reads.
 * Binding strides are emitted here only when the VB-stride dynamic state is
 * not enabled (otherwise they are emitted at draw time). */
static void
tu6_emit_vertex_input(struct tu_pipeline *pipeline,
                      struct tu_cs *cs,
                      const struct ir3_shader_variant *vs,
                      const VkPipelineVertexInputStateCreateInfo *info)
{
   uint32_t vfd_decode_idx = 0;
   uint32_t binding_instanced = 0; /* bitmask of instanced bindings */
   uint32_t step_rate[MAX_VBS];

   for (uint32_t i = 0; i < info->vertexBindingDescriptionCount; i++) {
      const VkVertexInputBindingDescription *binding =
         &info->pVertexBindingDescriptions[i];

      if (!(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_VB_STRIDE))) {
         tu_cs_emit_regs(cs,
                         A6XX_VFD_FETCH_STRIDE(binding->binding, binding->stride));
      }

      if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE)
         binding_instanced |= 1 << binding->binding;

      /* divisor defaults to 1, possibly overridden by the EXT below */
      step_rate[binding->binding] = 1;
   }

   const VkPipelineVertexInputDivisorStateCreateInfoEXT *div_state =
      vk_find_struct_const(info->pNext, PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT);
   if (div_state) {
      for (uint32_t i = 0; i < div_state->vertexBindingDivisorCount; i++) {
         const VkVertexInputBindingDivisorDescriptionEXT *desc =
            &div_state->pVertexBindingDivisors[i];
         step_rate[desc->binding] = desc->divisor;
      }
   }

   /* TODO: emit all VFD_DECODE/VFD_DEST_CNTL in same (two) pkt4 */

   for (uint32_t i = 0; i < info->vertexAttributeDescriptionCount; i++) {
      const VkVertexInputAttributeDescription *attr =
         &info->pVertexAttributeDescriptions[i];
      uint32_t input_idx;

      /* find the VS input matching this attribute location */
      for (input_idx = 0; input_idx < vs->inputs_count; input_idx++) {
         if ((vs->inputs[input_idx].slot - VERT_ATTRIB_GENERIC0) == attr->location)
            break;
      }

      /* attribute not used, skip it */
      if (input_idx == vs->inputs_count)
         continue;

      const struct tu_native_format format = tu6_format_vtx(attr->format);
      tu_cs_emit_regs(cs,
                      A6XX_VFD_DECODE_INSTR(vfd_decode_idx,
                        .idx = attr->binding,
                        .offset = attr->offset,
                        .instanced = binding_instanced & (1 << attr->binding),
                        .format = format.fmt,
                        .swap = format.swap,
                        .unk30 = 1,
                        ._float = !vk_format_is_int(attr->format)),
                      A6XX_VFD_DECODE_STEP_RATE(vfd_decode_idx, step_rate[attr->binding]));

      tu_cs_emit_regs(cs,
                      A6XX_VFD_DEST_CNTL_INSTR(vfd_decode_idx,
                        .writemask = vs->inputs[input_idx].compmask,
                        .regid = vs->inputs[input_idx].regid));

      vfd_decode_idx++;
   }

   tu_cs_emit_regs(cs,
                   A6XX_VFD_CONTROL_0(
                     .fetch_cnt = vfd_decode_idx, /* decode_cnt for binning pass ? */
                     .decode_cnt = vfd_decode_idx));
}

/* Emit the viewport transform, per-viewport scissor and depth-clamp
 * registers, plus the guardband clip adjust computed across all
 * viewports. */
void
tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewports, uint32_t num_viewport)
{
   VkExtent2D guardband = {511, 511};

   tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_VPORT_XOFFSET(0), num_viewport * 6);
   for (uint32_t i = 0; i < num_viewport; i++) {
      const VkViewport *viewport = &viewports[i];
      float offsets[3];
      float scales[3];
      scales[0] = viewport->width / 2.0f;
      scales[1] = viewport->height / 2.0f;
      scales[2] = viewport->maxDepth - viewport->minDepth;
      offsets[0] = viewport->x + scales[0];
      offsets[1] = viewport->y + scales[1];
      offsets[2] = viewport->minDepth;
      for (uint32_t j = 0; j < 3; j++) {
         tu_cs_emit(cs, fui(offsets[j]));
         tu_cs_emit(cs, fui(scales[j]));
      }

      /* shrink the guardband to the tightest one over all viewports */
      guardband.width =
         MIN2(guardband.width, fd_calc_guardband(offsets[0], scales[0], false));
      guardband.height =
         MIN2(guardband.height, fd_calc_guardband(offsets[1], scales[1], false));
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0), num_viewport * 2);
   for (uint32_t i = 0; i < num_viewport; i++) {
18497ec681f3Smrg const VkViewport *viewport = &viewports[i]; 18507ec681f3Smrg VkOffset2D min; 18517ec681f3Smrg VkOffset2D max; 18527ec681f3Smrg min.x = (int32_t) viewport->x; 18537ec681f3Smrg max.x = (int32_t) ceilf(viewport->x + viewport->width); 18547ec681f3Smrg if (viewport->height >= 0.0f) { 18557ec681f3Smrg min.y = (int32_t) viewport->y; 18567ec681f3Smrg max.y = (int32_t) ceilf(viewport->y + viewport->height); 18577ec681f3Smrg } else { 18587ec681f3Smrg min.y = (int32_t)(viewport->y + viewport->height); 18597ec681f3Smrg max.y = (int32_t) ceilf(viewport->y); 18607ec681f3Smrg } 18617ec681f3Smrg /* the spec allows viewport->height to be 0.0f */ 18627ec681f3Smrg if (min.y == max.y) 18637ec681f3Smrg max.y++; 18647ec681f3Smrg /* allow viewport->width = 0.0f for un-initialized viewports: */ 18657ec681f3Smrg if (min.x == max.x) 18667ec681f3Smrg max.x++; 18677ec681f3Smrg 18687ec681f3Smrg min.x = MAX2(min.x, 0); 18697ec681f3Smrg min.y = MAX2(min.y, 0); 18707ec681f3Smrg 18717ec681f3Smrg assert(min.x < max.x); 18727ec681f3Smrg assert(min.y < max.y); 18737ec681f3Smrg tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_X(min.x) | 18747ec681f3Smrg A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_Y(min.y)); 18757ec681f3Smrg tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_X(max.x - 1) | 18767ec681f3Smrg A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_Y(max.y - 1)); 18777ec681f3Smrg } 18787ec681f3Smrg 18797ec681f3Smrg tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_Z_CLAMP(0), num_viewport * 2); 18807ec681f3Smrg for (uint32_t i = 0; i < num_viewport; i++) { 18817ec681f3Smrg const VkViewport *viewport = &viewports[i]; 18827ec681f3Smrg tu_cs_emit(cs, fui(MIN2(viewport->minDepth, viewport->maxDepth))); 18837ec681f3Smrg tu_cs_emit(cs, fui(MAX2(viewport->minDepth, viewport->maxDepth))); 18847ec681f3Smrg } 1885361fc4cbSmaya tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ, 1); 18867ec681f3Smrg tu_cs_emit(cs, A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_HORZ(guardband.width) | 18877ec681f3Smrg 
A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_VERT(guardband.height)); 18887ec681f3Smrg 18897ec681f3Smrg /* TODO: what to do about this and multi viewport ? */ 18907ec681f3Smrg float z_clamp_min = num_viewport ? MIN2(viewports[0].minDepth, viewports[0].maxDepth) : 0; 18917ec681f3Smrg float z_clamp_max = num_viewport ? MAX2(viewports[0].minDepth, viewports[0].maxDepth) : 0; 18927ec681f3Smrg 18937ec681f3Smrg tu_cs_emit_regs(cs, 18947ec681f3Smrg A6XX_RB_Z_CLAMP_MIN(z_clamp_min), 18957ec681f3Smrg A6XX_RB_Z_CLAMP_MAX(z_clamp_max)); 1896361fc4cbSmaya} 1897361fc4cbSmaya 1898361fc4cbSmayavoid 18997ec681f3Smrgtu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scissors, uint32_t scissor_count) 19007ec681f3Smrg{ 19017ec681f3Smrg tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0), scissor_count * 2); 19027ec681f3Smrg 19037ec681f3Smrg for (uint32_t i = 0; i < scissor_count; i++) { 19047ec681f3Smrg const VkRect2D *scissor = &scissors[i]; 19057ec681f3Smrg 19067ec681f3Smrg uint32_t min_x = scissor->offset.x; 19077ec681f3Smrg uint32_t min_y = scissor->offset.y; 19087ec681f3Smrg uint32_t max_x = min_x + scissor->extent.width - 1; 19097ec681f3Smrg uint32_t max_y = min_y + scissor->extent.height - 1; 19107ec681f3Smrg 19117ec681f3Smrg if (!scissor->extent.width || !scissor->extent.height) { 19127ec681f3Smrg min_x = min_y = 1; 19137ec681f3Smrg max_x = max_y = 0; 19147ec681f3Smrg } else { 19157ec681f3Smrg /* avoid overflow */ 19167ec681f3Smrg uint32_t scissor_max = BITFIELD_MASK(15); 19177ec681f3Smrg min_x = MIN2(scissor_max, min_x); 19187ec681f3Smrg min_y = MIN2(scissor_max, min_y); 19197ec681f3Smrg max_x = MIN2(scissor_max, max_x); 19207ec681f3Smrg max_y = MIN2(scissor_max, max_y); 19217ec681f3Smrg } 1922361fc4cbSmaya 19237ec681f3Smrg tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_TL_X(min_x) | 19247ec681f3Smrg A6XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(min_y)); 19257ec681f3Smrg tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_BR_X(max_x) | 19267ec681f3Smrg A6XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(max_y)); 
19277ec681f3Smrg } 1928361fc4cbSmaya} 1929361fc4cbSmaya 19307ec681f3Smrgvoid 19317ec681f3Smrgtu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT *samp_loc) 1932361fc4cbSmaya{ 19337ec681f3Smrg if (!samp_loc) { 19347ec681f3Smrg tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CONFIG, 1); 19357ec681f3Smrg tu_cs_emit(cs, 0); 1936361fc4cbSmaya 19377ec681f3Smrg tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CONFIG, 1); 19387ec681f3Smrg tu_cs_emit(cs, 0); 19397ec681f3Smrg 19407ec681f3Smrg tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_CONFIG, 1); 19417ec681f3Smrg tu_cs_emit(cs, 0); 19427ec681f3Smrg return; 19437ec681f3Smrg } 19447ec681f3Smrg 19457ec681f3Smrg assert(samp_loc->sampleLocationsPerPixel == samp_loc->sampleLocationsCount); 19467ec681f3Smrg assert(samp_loc->sampleLocationGridSize.width == 1); 19477ec681f3Smrg assert(samp_loc->sampleLocationGridSize.height == 1); 19487ec681f3Smrg 19497ec681f3Smrg uint32_t sample_config = 19507ec681f3Smrg A6XX_RB_SAMPLE_CONFIG_LOCATION_ENABLE; 19517ec681f3Smrg uint32_t sample_locations = 0; 19527ec681f3Smrg for (uint32_t i = 0; i < samp_loc->sampleLocationsCount; i++) { 19537ec681f3Smrg sample_locations |= 19547ec681f3Smrg (A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_X(samp_loc->pSampleLocations[i].x) | 19557ec681f3Smrg A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_Y(samp_loc->pSampleLocations[i].y)) << i*8; 19567ec681f3Smrg } 19577ec681f3Smrg 19587ec681f3Smrg tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CONFIG, 2); 19597ec681f3Smrg tu_cs_emit(cs, sample_config); 19607ec681f3Smrg tu_cs_emit(cs, sample_locations); 19617ec681f3Smrg 19627ec681f3Smrg tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CONFIG, 2); 19637ec681f3Smrg tu_cs_emit(cs, sample_config); 19647ec681f3Smrg tu_cs_emit(cs, sample_locations); 19657ec681f3Smrg 19667ec681f3Smrg tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_CONFIG, 2); 19677ec681f3Smrg tu_cs_emit(cs, sample_config); 19687ec681f3Smrg tu_cs_emit(cs, sample_locations); 1969361fc4cbSmaya} 1970361fc4cbSmaya 1971361fc4cbSmayastatic uint32_t 
1972361fc4cbSmayatu6_gras_su_cntl(const VkPipelineRasterizationStateCreateInfo *rast_info, 19737ec681f3Smrg enum a5xx_line_mode line_mode, 19747ec681f3Smrg bool multiview) 1975361fc4cbSmaya{ 1976361fc4cbSmaya uint32_t gras_su_cntl = 0; 1977361fc4cbSmaya 1978361fc4cbSmaya if (rast_info->cullMode & VK_CULL_MODE_FRONT_BIT) 1979361fc4cbSmaya gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_FRONT; 1980361fc4cbSmaya if (rast_info->cullMode & VK_CULL_MODE_BACK_BIT) 1981361fc4cbSmaya gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_BACK; 1982361fc4cbSmaya 1983361fc4cbSmaya if (rast_info->frontFace == VK_FRONT_FACE_CLOCKWISE) 1984361fc4cbSmaya gras_su_cntl |= A6XX_GRAS_SU_CNTL_FRONT_CW; 1985361fc4cbSmaya 19867ec681f3Smrg gras_su_cntl |= 19877ec681f3Smrg A6XX_GRAS_SU_CNTL_LINEHALFWIDTH(rast_info->lineWidth / 2.0f); 1988361fc4cbSmaya 1989361fc4cbSmaya if (rast_info->depthBiasEnable) 1990361fc4cbSmaya gras_su_cntl |= A6XX_GRAS_SU_CNTL_POLY_OFFSET; 1991361fc4cbSmaya 19927ec681f3Smrg gras_su_cntl |= A6XX_GRAS_SU_CNTL_LINE_MODE(line_mode); 1993361fc4cbSmaya 19947ec681f3Smrg if (multiview) { 19957ec681f3Smrg gras_su_cntl |= 19967ec681f3Smrg A6XX_GRAS_SU_CNTL_UNK17 | 19977ec681f3Smrg A6XX_GRAS_SU_CNTL_MULTIVIEW_ENABLE; 19987ec681f3Smrg } 1999361fc4cbSmaya 20007ec681f3Smrg return gras_su_cntl; 2001361fc4cbSmaya} 2002361fc4cbSmaya 2003361fc4cbSmayavoid 2004361fc4cbSmayatu6_emit_depth_bias(struct tu_cs *cs, 2005361fc4cbSmaya float constant_factor, 2006361fc4cbSmaya float clamp, 2007361fc4cbSmaya float slope_factor) 2008361fc4cbSmaya{ 2009361fc4cbSmaya tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_POLY_OFFSET_SCALE, 3); 20107ec681f3Smrg tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_SCALE(slope_factor).value); 20117ec681f3Smrg tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET(constant_factor).value); 20127ec681f3Smrg tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET_CLAMP(clamp).value); 2013361fc4cbSmaya} 2014361fc4cbSmaya 2015361fc4cbSmayastatic uint32_t 2016361fc4cbSmayatu6_rb_mrt_blend_control(const 
VkPipelineColorBlendAttachmentState *att,
                         bool has_alpha)
{
   /* Builds the RB_MRT_BLEND_CONTROL value for one color attachment.  When
    * the format has no alpha channel, color blend factors that would read
    * destination alpha are remapped by tu_blend_factor_no_dst_alpha().
    */
   const enum a3xx_rb_blend_opcode color_op = tu6_blend_op(att->colorBlendOp);
   const enum adreno_rb_blend_factor src_color_factor = tu6_blend_factor(
      has_alpha ? att->srcColorBlendFactor
                : tu_blend_factor_no_dst_alpha(att->srcColorBlendFactor));
   const enum adreno_rb_blend_factor dst_color_factor = tu6_blend_factor(
      has_alpha ? att->dstColorBlendFactor
                : tu_blend_factor_no_dst_alpha(att->dstColorBlendFactor));
   const enum a3xx_rb_blend_opcode alpha_op = tu6_blend_op(att->alphaBlendOp);
   const enum adreno_rb_blend_factor src_alpha_factor =
      tu6_blend_factor(att->srcAlphaBlendFactor);
   const enum adreno_rb_blend_factor dst_alpha_factor =
      tu6_blend_factor(att->dstAlphaBlendFactor);

   return A6XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(src_color_factor) |
          A6XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(color_op) |
          A6XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(dst_color_factor) |
          A6XX_RB_MRT_BLEND_CONTROL_ALPHA_SRC_FACTOR(src_alpha_factor) |
          A6XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE(alpha_op) |
          A6XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(dst_alpha_factor);
}

/* Build RB_MRT_CONTROL for one attachment: component write mask, the
 * caller-precomputed logic-op bits, and the blend enable bits.
 */
static uint32_t
tu6_rb_mrt_control(const VkPipelineColorBlendAttachmentState *att,
                   uint32_t rb_mrt_control_rop,
                   bool has_alpha)
{
   uint32_t rb_mrt_control =
      A6XX_RB_MRT_CONTROL_COMPONENT_ENABLE(att->colorWriteMask);

   rb_mrt_control |= rb_mrt_control_rop;

   if (att->blendEnable) {
      rb_mrt_control |= A6XX_RB_MRT_CONTROL_BLEND;

      if (has_alpha)
         rb_mrt_control |= A6XX_RB_MRT_CONTROL_BLEND2;
   }

   return rb_mrt_control;
}

/* Emit RB_MRT_CONTROL / RB_MRT_BLEND_CONTROL for every color attachment.
 * On return, *blend_enable_mask has a bit set for each attachment that
 * reads the destination (blending enabled or a dst-reading logic op).
 */
static void
tu6_emit_rb_mrt_controls(struct tu_cs *cs,
                         const VkPipelineColorBlendStateCreateInfo *blend_info,
                         const VkFormat attachment_formats[MAX_RTS],
                         uint32_t *blend_enable_mask)
{
   *blend_enable_mask = 0;

   bool rop_reads_dst = false;
   uint32_t rb_mrt_control_rop = 0;
   if (blend_info->logicOpEnable) {
      rop_reads_dst = tu_logic_op_reads_dst(blend_info->logicOp);
      rb_mrt_control_rop =
         A6XX_RB_MRT_CONTROL_ROP_ENABLE |
         A6XX_RB_MRT_CONTROL_ROP_CODE(tu6_rop(blend_info->logicOp));
   }

   for (uint32_t i = 0; i < blend_info->attachmentCount; i++) {
      const VkPipelineColorBlendAttachmentState *att =
         &blend_info->pAttachments[i];
      const VkFormat format = attachment_formats[i];

      /* Unused attachments get zeroed (fully disabled) registers. */
      uint32_t rb_mrt_control = 0;
      uint32_t rb_mrt_blend_control = 0;
      if (format != VK_FORMAT_UNDEFINED) {
         const bool has_alpha = vk_format_has_alpha(format);

         rb_mrt_control =
            tu6_rb_mrt_control(att, rb_mrt_control_rop, has_alpha);
         rb_mrt_blend_control = tu6_rb_mrt_blend_control(att, has_alpha);

         if (att->blendEnable || rop_reads_dst)
            *blend_enable_mask |= 1 << i;
      }

      tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_CONTROL(i), 2);
      tu_cs_emit(cs, rb_mrt_control);
      tu_cs_emit(cs, rb_mrt_blend_control);
   }
}

/* Emit SP_BLEND_CNTL and RB_BLEND_CNTL: per-attachment blend enables,
 * dual-source blend, alpha-to-coverage/alpha-to-one and the MSAA sample
 * mask (defaulting to all rasterization samples when no mask is given).
 */
static void
tu6_emit_blend_control(struct tu_cs *cs,
                       uint32_t blend_enable_mask,
                       bool dual_src_blend,
                       const VkPipelineMultisampleStateCreateInfo *msaa_info)
{
   const uint32_t sample_mask =
      msaa_info->pSampleMask ? (*msaa_info->pSampleMask & 0xffff)
                             : ((1 << msaa_info->rasterizationSamples) - 1);

   tu_cs_emit_regs(cs,
                   A6XX_SP_BLEND_CNTL(.enable_blend = blend_enable_mask,
                                      .dual_color_in_enable = dual_src_blend,
                                      .alpha_to_coverage = msaa_info->alphaToCoverageEnable,
                                      .unk8 = true));

   /* set A6XX_RB_BLEND_CNTL_INDEPENDENT_BLEND only when enabled? */
   tu_cs_emit_regs(cs,
                   A6XX_RB_BLEND_CNTL(.enable_blend = blend_enable_mask,
                                      .independent_blend = true,
                                      .sample_mask = sample_mask,
                                      .dual_color_in_enable = dual_src_blend,
                                      .alpha_to_coverage = msaa_info->alphaToCoverageEnable,
                                      .alpha_to_one = msaa_info->alphaToOneEnable));
}

/* Compute the total private-memory (shader scratch/spill) BO size needed
 * for pvtmem_bytes of per-fiber space, optionally recording the per-fiber
 * and per-SP sizes in config.  Per-fiber size is aligned to 512 bytes and
 * per-SP size to 4KB, matching the ALIGN() granularities used below.
 */
static uint32_t
calc_pvtmem_size(struct tu_device *dev, struct tu_pvtmem_config *config,
                 uint32_t pvtmem_bytes)
{
   uint32_t per_fiber_size = ALIGN(pvtmem_bytes, 512);
   uint32_t per_sp_size =
      ALIGN(per_fiber_size * dev->physical_device->info->a6xx.fibers_per_sp, 1 << 12);

   if (config) {
      config->per_fiber_size = per_fiber_size;
      config->per_sp_size = per_sp_size;
   }

   return dev->physical_device->info->num_sp_cores * per_sp_size;
}

/* Allocate the pipeline's private-memory BO and fill *config.  When
 * pvtmem_bytes is 0, no BO is allocated and config is zeroed.  On success
 * config->iova points at the new BO.
 */
static VkResult
tu_setup_pvtmem(struct tu_device *dev,
                struct tu_pipeline *pipeline,
                struct tu_pvtmem_config *config,
                uint32_t pvtmem_bytes, bool per_wave)
{
   if (!pvtmem_bytes) {
      memset(config, 0, sizeof(*config));
      return VK_SUCCESS;
   }

   uint32_t total_size = calc_pvtmem_size(dev, config, pvtmem_bytes);
   config->per_wave = per_wave;

   VkResult result =
      tu_bo_init_new(dev, &pipeline->pvtmem_bo, total_size,
                     TU_BO_ALLOC_NO_FLAGS);
   if (result != VK_SUCCESS)
      return result;

   config->iova = pipeline->pvtmem_bo.iova;

   return result;
}


/* Create and size the pipeline's command stream.  The size accounts for the
 * shader binaries, the LOAD_STATE packets, private-memory setup and a fixed
 * 2048-dword baseline, then the whole amount is reserved up front so later
 * sub-stream allocation cannot fail.  `builder` is non-NULL for graphics
 * pipelines; `compute` is the variant for compute pipelines.
 */
static VkResult
tu_pipeline_allocate_cs(struct tu_device *dev,
                        struct tu_pipeline *pipeline,
                        struct tu_pipeline_builder *builder,
                        struct ir3_shader_variant *compute)
{
   uint32_t size = 2048 + tu6_load_state_size(pipeline, compute);

   /* graphics case: */
   if (builder) {
      uint32_t pvtmem_bytes = 0;
      for (uint32_t i = 0; i < ARRAY_SIZE(builder->variants); i++) {
         if (builder->variants[i]) {
            size += builder->variants[i]->info.size / 4;
            pvtmem_bytes = MAX2(pvtmem_bytes, builder->variants[i]->pvtmem_size);
         }
      }

      size += builder->binning_variant->info.size / 4;
      pvtmem_bytes = MAX2(pvtmem_bytes, builder->binning_variant->pvtmem_size);

      size += calc_pvtmem_size(dev, NULL, pvtmem_bytes) / 4;

      builder->additional_cs_reserve_size = 0;
      for (unsigned i = 0; i < ARRAY_SIZE(builder->variants); i++) {
         struct ir3_shader_variant *variant = builder->variants[i];
         if (variant) {
            builder->additional_cs_reserve_size +=
               tu_xs_get_additional_cs_size_dwords(variant);

            if (variant->binning) {
               builder->additional_cs_reserve_size +=
                  tu_xs_get_additional_cs_size_dwords(variant->binning);
            }
         }
      }

      size += builder->additional_cs_reserve_size;
   } else {
      size += compute->info.size / 4;
      size += calc_pvtmem_size(dev, NULL, compute->pvtmem_size) / 4;

      size += tu_xs_get_additional_cs_size_dwords(compute);
   }

   tu_cs_init(&pipeline->cs, dev, TU_CS_MODE_SUB_STREAM, size);

   /* Reserve the space now such that tu_cs_begin_sub_stream never fails. Note
    * that LOAD_STATE can potentially take up a large amount of space so we
    * calculate its size explicitly.
    */
   return tu_cs_reserve_space(&pipeline->cs, size);
}

/* Initialize the ir3 shader key from the pipeline create info: GS presence
 * and the MSAA / sample-shading related bits.  key->tessellation is left as
 * IR3_TESS_NONE here and filled in later, once NIR is available.
 */
static void
tu_pipeline_shader_key_init(struct ir3_shader_key *key,
                            const struct tu_pipeline *pipeline,
                            const VkGraphicsPipelineCreateInfo *pipeline_info)
{
   for (uint32_t i = 0; i < pipeline_info->stageCount; i++) {
      if (pipeline_info->pStages[i].stage == VK_SHADER_STAGE_GEOMETRY_BIT) {
         key->has_gs = true;
         break;
      }
   }

   /* With rasterizer discard statically enabled (and not dynamic), return
    * before touching pMultisampleState.
    */
   if (pipeline_info->pRasterizationState->rasterizerDiscardEnable &&
       !(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_RASTERIZER_DISCARD)))
      return;

   const VkPipelineMultisampleStateCreateInfo *msaa_info = pipeline_info->pMultisampleState;
   const struct VkPipelineSampleLocationsStateCreateInfoEXT *sample_locations =
      vk_find_struct_const(msaa_info->pNext, PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT);
   if (msaa_info->rasterizationSamples > 1 ||
       /* also set msaa key when sample location is not the default
        * since this affects varying interpolation */
       (sample_locations && sample_locations->sampleLocationsEnable)) {
      key->msaa = true;
   }

   /* note: not actually used by ir3, just checked in tu6_emit_fs_inputs */
   if (msaa_info->sampleShadingEnable)
      key->sample_shading = true;

   /* We set this after we compile to NIR because we need the prim mode */
   key->tessellation = IR3_TESS_NONE;
}

/* Map the NIR tessellation primitive mode (GL enum) to the ir3 tess mode. */
static uint32_t
tu6_get_tessmode(struct tu_shader* shader)
{
   uint32_t primitive_mode = shader->ir3_shader->nir->info.tess.primitive_mode;
   switch (primitive_mode) {
   case GL_ISOLINES:
      return IR3_TESS_ISOLINES;
   case GL_TRIANGLES:
      return IR3_TESS_TRIANGLES;
   case GL_QUADS:
      return IR3_TESS_QUADS;
   case GL_NONE:
      return IR3_TESS_NONE;
   default:
      unreachable("bad tessmode");
   }
}

/* Copy a compiled shader binary into the pipeline's CS and return its iova,
 * or 0 when there is no variant.
 */
static uint64_t
tu_upload_variant(struct tu_pipeline *pipeline,
                  const struct ir3_shader_variant *variant)
{
   struct tu_cs_memory memory;

   if (!variant)
      return 0;

   /* this expects to get enough alignment because shaders are allocated first
    * and total size is always aligned correctly
    * note: an assert in tu6_emit_xs_config validates the alignment
    */
   tu_cs_alloc(&pipeline->cs, variant->info.size / 4, 1, &memory);

   memcpy(memory.map, variant->bin, variant->info.size);
   return memory.iova;
}

static void
tu_append_executable(struct tu_pipeline *pipeline, struct ir3_shader_variant
*variant,
                     char *nir_from_spirv)
{
   /* Record a variant (plus optional SPIR-V-derived NIR text) in the
    * pipeline's executables list for VK_KHR_pipeline_executable_properties.
    * The disasm strings are re-parented to the pipeline's ralloc context so
    * they outlive the variant.
    */
   ralloc_steal(pipeline->executables_mem_ctx, variant->disasm_info.nir);
   ralloc_steal(pipeline->executables_mem_ctx, variant->disasm_info.disasm);

   struct tu_pipeline_executable exe = {
      .stage = variant->shader->type,
      .nir_from_spirv = nir_from_spirv,
      .nir_final = variant->disasm_info.nir,
      .disasm = variant->disasm_info.disasm,
      .stats = variant->info,
      .is_binning = variant->binning_pass,
   };

   util_dynarray_append(&pipeline->executables, struct tu_pipeline_executable, exe);
}

/* Translate every stage from SPIR-V to NIR, create tu_shaders, build the
 * ir3 shader key, compile all variants (including the safe-constlen retry
 * pass and the binning VS), and record executables for introspection.
 * Returns VK_ERROR_OUT_OF_HOST_MEMORY if any stage fails.
 */
static VkResult
tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder,
                                    struct tu_pipeline *pipeline)
{
   const struct ir3_compiler *compiler = builder->device->compiler;
   const VkPipelineShaderStageCreateInfo *stage_infos[MESA_SHADER_STAGES] = {
      NULL
   };
   for (uint32_t i = 0; i < builder->create_info->stageCount; i++) {
      gl_shader_stage stage =
         vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage);
      stage_infos[stage] = &builder->create_info->pStages[i];
   }

   struct ir3_shader_key key = {};
   tu_pipeline_shader_key_init(&key, pipeline, builder->create_info);

   nir_shader *nir[ARRAY_SIZE(builder->shaders)] = { NULL };

   for (gl_shader_stage stage = MESA_SHADER_VERTEX;
        stage < ARRAY_SIZE(nir); stage++) {
      const VkPipelineShaderStageCreateInfo *stage_info = stage_infos[stage];
      if (!stage_info)
         continue;

      nir[stage] = tu_spirv_to_nir(builder->device, stage_info, stage);
      if (!nir[stage])
         return VK_ERROR_OUT_OF_HOST_MEMORY;
   }

   /* A fragment stage is always required by the hardware setup below, so
    * substitute a no-op FS when the pipeline omits one.
    */
   if (!nir[MESA_SHADER_FRAGMENT]) {
      const nir_shader_compiler_options *nir_options =
         ir3_get_compiler_options(builder->device->compiler);
      nir_builder fs_b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT,
                                                        nir_options,
                                                        "noop_fs");
      nir[MESA_SHADER_FRAGMENT] = fs_b.shader;
   }

   const bool executable_info = builder->create_info->flags &
      VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;

   char *nir_initial_disasm[ARRAY_SIZE(builder->shaders)] = { NULL };

   if (executable_info) {
      for (gl_shader_stage stage = MESA_SHADER_VERTEX;
           stage < ARRAY_SIZE(nir); stage++) {
         if (!nir[stage])
            continue;

         nir_initial_disasm[stage] =
            nir_shader_as_str(nir[stage], pipeline->executables_mem_ctx);
      }
   }

   /* TODO do intra-stage linking here */

   uint32_t desc_sets = 0;
   for (gl_shader_stage stage = MESA_SHADER_VERTEX;
        stage < ARRAY_SIZE(nir); stage++) {
      if (!nir[stage])
         continue;

      struct tu_shader *shader =
         tu_shader_create(builder->device, nir[stage],
                          builder->multiview_mask, builder->layout,
                          builder->alloc);
      if (!shader)
         return VK_ERROR_OUT_OF_HOST_MEMORY;

      /* In SPIR-V generated from GLSL, the primitive mode is specified in the
       * tessellation evaluation shader, but in SPIR-V generated from HLSL,
       * the mode is specified in the tessellation control shader. */
      if ((stage == MESA_SHADER_TESS_EVAL || stage == MESA_SHADER_TESS_CTRL) &&
          key.tessellation == IR3_TESS_NONE) {
         key.tessellation = tu6_get_tessmode(shader);
      }

      /* Stages after TCS reading gl_PrimitiveID require the TCS to store it;
       * FS reads it as an input, GS/DS as a system value.
       */
      if (stage > MESA_SHADER_TESS_CTRL) {
         if (stage == MESA_SHADER_FRAGMENT) {
            key.tcs_store_primid = key.tcs_store_primid ||
               (nir[stage]->info.inputs_read & (1ull << VARYING_SLOT_PRIMITIVE_ID));
         } else {
            key.tcs_store_primid = key.tcs_store_primid ||
               BITSET_TEST(nir[stage]->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
         }
      }

      /* Keep track of the status of each shader's active descriptor sets,
       * which is set in tu_lower_io. */
      desc_sets |= shader->active_desc_sets;

      builder->shaders[stage] = shader;
   }
   pipeline->active_desc_sets = desc_sets;

   /* The last geometry-pipeline stage decides whether Layer/Viewport are
    * written (GS, else DS, else VS).
    */
   struct tu_shader *last_shader = builder->shaders[MESA_SHADER_GEOMETRY];
   if (!last_shader)
      last_shader = builder->shaders[MESA_SHADER_TESS_EVAL];
   if (!last_shader)
      last_shader = builder->shaders[MESA_SHADER_VERTEX];

   uint64_t outputs_written = last_shader->ir3_shader->nir->info.outputs_written;

   key.layer_zero = !(outputs_written & VARYING_BIT_LAYER);
   key.view_zero = !(outputs_written & VARYING_BIT_VIEWPORT);

   pipeline->tess.patch_type = key.tessellation;

   for (gl_shader_stage stage = MESA_SHADER_VERTEX;
        stage < ARRAY_SIZE(builder->shaders); stage++) {
      if (!builder->shaders[stage])
         continue;

      bool created;
      builder->variants[stage] =
         ir3_shader_get_variant(builder->shaders[stage]->ir3_shader,
                                &key, false, executable_info, &created);
      if (!builder->variants[stage])
         return VK_ERROR_OUT_OF_HOST_MEMORY;
   }

   /* Recompile with safe_constlen any stage whose const space must be
    * trimmed to fit the combined limit.
    */
   uint32_t safe_constlens = ir3_trim_constlen(builder->variants, compiler);

   key.safe_constlen = true;

   for (gl_shader_stage stage = MESA_SHADER_VERTEX;
        stage < ARRAY_SIZE(builder->shaders); stage++) {
      if (!builder->shaders[stage])
         continue;

      if (safe_constlens & (1 << stage)) {
         bool created;
         builder->variants[stage] =
            ir3_shader_get_variant(builder->shaders[stage]->ir3_shader,
                                   &key, false, executable_info, &created);
         if (!builder->variants[stage])
            return VK_ERROR_OUT_OF_HOST_MEMORY;
      }
   }

   /* Binning VS: reuse the full VS when it has stream-output or no binning
    * variant is possible; otherwise compile a dedicated binning variant.
    */
   const struct tu_shader *vs = builder->shaders[MESA_SHADER_VERTEX];
   struct ir3_shader_variant *variant;

   if (vs->ir3_shader->stream_output.num_outputs ||
       !ir3_has_binning_vs(&key)) {
      variant = builder->variants[MESA_SHADER_VERTEX];
   } else {
      bool created;
      key.safe_constlen = !!(safe_constlens & (1 << MESA_SHADER_VERTEX));
      variant = ir3_shader_get_variant(vs->ir3_shader, &key,
                                       true, executable_info, &created);
      if (!variant)
         return VK_ERROR_OUT_OF_HOST_MEMORY;
   }

   builder->binning_variant = variant;

   for (gl_shader_stage stage = MESA_SHADER_VERTEX;
        stage < ARRAY_SIZE(nir); stage++) {
      if (builder->variants[stage]) {
         tu_append_executable(pipeline, builder->variants[stage],
                              nir_initial_disasm[stage]);
      }
   }

   if (builder->binning_variant != builder->variants[MESA_SHADER_VERTEX]) {
      tu_append_executable(pipeline, builder->binning_variant, NULL);
   }

   return VK_SUCCESS;
}

/* Record which states are dynamic and clear the corresponding bits from the
 * per-register pipeline masks so draw-time code knows which fields the
 * pipeline owns.
 */
static void
tu_pipeline_builder_parse_dynamic(struct tu_pipeline_builder *builder,
                                  struct tu_pipeline *pipeline)
{
   const VkPipelineDynamicStateCreateInfo *dynamic_info =
builder->create_info->pDynamicState;

   /* All-ones masks mean the pipeline supplies every bit of the register;
    * each dynamic state below clears the bits it owns.
    */
   pipeline->gras_su_cntl_mask = ~0u;
   pipeline->rb_depth_cntl_mask = ~0u;
   pipeline->rb_stencil_cntl_mask = ~0u;
   pipeline->pc_raster_cntl_mask = ~0u;
   pipeline->vpc_unknown_9107_mask = ~0u;

   if (!dynamic_info)
      return;

   for (uint32_t i = 0; i < dynamic_info->dynamicStateCount; i++) {
      VkDynamicState state = dynamic_info->pDynamicStates[i];
      switch (state) {
      case VK_DYNAMIC_STATE_VIEWPORT ... VK_DYNAMIC_STATE_STENCIL_REFERENCE:
         /* Core dynamic states map 1:1 onto dynamic_state_mask bits. */
         if (state == VK_DYNAMIC_STATE_LINE_WIDTH)
            pipeline->gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_LINEHALFWIDTH__MASK;
         pipeline->dynamic_state_mask |= BIT(state);
         break;
      case VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT:
         pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_SAMPLE_LOCATIONS);
         break;
      case VK_DYNAMIC_STATE_CULL_MODE_EXT:
         pipeline->gras_su_cntl_mask &=
            ~(A6XX_GRAS_SU_CNTL_CULL_BACK | A6XX_GRAS_SU_CNTL_CULL_FRONT);
         pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL);
         break;
      case VK_DYNAMIC_STATE_FRONT_FACE_EXT:
         pipeline->gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_FRONT_CW;
         pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL);
         break;
      case VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY_EXT:
         pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY);
         break;
      case VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT:
         pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_VB_STRIDE);
         break;
      case VK_DYNAMIC_STATE_VIEWPORT_WITH_COUNT_EXT:
         /* The with-count variants share the plain viewport/scissor slots. */
         pipeline->dynamic_state_mask |= BIT(VK_DYNAMIC_STATE_VIEWPORT);
         break;
      case VK_DYNAMIC_STATE_SCISSOR_WITH_COUNT_EXT:
         pipeline->dynamic_state_mask |= BIT(VK_DYNAMIC_STATE_SCISSOR);
         break;
      case VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE_EXT:
         pipeline->rb_depth_cntl_mask &=
            ~(A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE);
         pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
         break;
      case VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE_EXT:
         pipeline->rb_depth_cntl_mask &= ~A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
         pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
         break;
      case VK_DYNAMIC_STATE_DEPTH_COMPARE_OP_EXT:
         pipeline->rb_depth_cntl_mask &= ~A6XX_RB_DEPTH_CNTL_ZFUNC__MASK;
         pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
         break;
      case VK_DYNAMIC_STATE_DEPTH_BOUNDS_TEST_ENABLE_EXT:
         pipeline->rb_depth_cntl_mask &=
            ~(A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE);
         pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
         break;
      case VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE_EXT:
         pipeline->rb_stencil_cntl_mask &= ~(A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE |
                                             A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
                                             A6XX_RB_STENCIL_CONTROL_STENCIL_READ);
         pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_STENCIL_CNTL);
         break;
      case VK_DYNAMIC_STATE_STENCIL_OP_EXT:
         pipeline->rb_stencil_cntl_mask &= ~(A6XX_RB_STENCIL_CONTROL_FUNC__MASK |
                                             A6XX_RB_STENCIL_CONTROL_FAIL__MASK |
                                             A6XX_RB_STENCIL_CONTROL_ZPASS__MASK |
                                             A6XX_RB_STENCIL_CONTROL_ZFAIL__MASK |
                                             A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK |
                                             A6XX_RB_STENCIL_CONTROL_FAIL_BF__MASK |
                                             A6XX_RB_STENCIL_CONTROL_ZPASS_BF__MASK |
                                             A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__MASK);
         pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_STENCIL_CNTL);
         break;
      case VK_DYNAMIC_STATE_DEPTH_BIAS_ENABLE_EXT:
         pipeline->gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_POLY_OFFSET;
         pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL);
         break;
      case VK_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE_EXT:
         pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE);
         break;
      case VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT:
         pipeline->pc_raster_cntl_mask &= ~A6XX_PC_RASTER_CNTL_DISCARD;
         pipeline->vpc_unknown_9107_mask &= ~A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD;
         pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RASTERIZER_DISCARD);
         break;
      default:
         assert(!"unsupported dynamic state");
         break;
      }
   }
}

/* Record a stage's const state, constlen and push-constant layout for
 * draw-time descriptor/const emission.
 */
static void
tu_pipeline_set_linkage(struct tu_program_descriptor_linkage *link,
                        struct tu_shader *shader,
                        struct ir3_shader_variant *v)
{
   link->const_state = *ir3_const_state(v);
   link->constlen = v->constlen;
   link->push_consts = shader->push_consts;
}

static void
tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder,
                                        struct tu_pipeline *pipeline)
{
   /* Emit the program config and the draw/binning program draw states into
    * the pipeline CS, record the active stage mask, and capture per-stage
    * descriptor linkage.
    */
   struct tu_cs prog_cs;

   /* Emit HLSQ_xS_CNTL/HLSQ_SP_xS_CONFIG *first*, before emitting anything
    * else that could depend on that state (like push constants)
    *
    * Note also that this always uses the full VS even in binning pass. The
    * binning pass variant has the same const layout as the full VS, and
    * the constlen for the VS will be the same or greater than the constlen
    * for the binning pass variant. It is required that the constlen state
    * matches between binning and draw passes, as some parts of the push
    * consts are emitted in state groups that are shared between the binning
    * and draw passes.
    */
   tu_cs_begin_sub_stream(&pipeline->cs, 512, &prog_cs);
   tu6_emit_program_config(&prog_cs, builder);
   pipeline->program.config_state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);

   tu_cs_begin_sub_stream(&pipeline->cs, 512 + builder->additional_cs_reserve_size, &prog_cs);
   tu6_emit_program(&prog_cs, builder, false, pipeline);
   pipeline->program.state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);

   tu_cs_begin_sub_stream(&pipeline->cs, 512 + builder->additional_cs_reserve_size, &prog_cs);
   tu6_emit_program(&prog_cs, builder, true, pipeline);
   pipeline->program.binning_state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);

   VkShaderStageFlags stages = 0;
   for (unsigned i = 0; i < builder->create_info->stageCount; i++) {
      stages |= builder->create_info->pStages[i].stage;
   }
   pipeline->active_stages = stages;

   for (unsigned i = 0; i < ARRAY_SIZE(builder->shaders); i++) {
      if (!builder->shaders[i])
         continue;

      tu_pipeline_set_linkage(&pipeline->program.link[i],
                              builder->shaders[i],
                              builder->variants[i]);
   }
}

/* Emit the vertex-input draw states (and the binning variant's, if it is a
 * separate shader) and compute the number of vertex buffer bindings.
 */
static void
tu_pipeline_builder_parse_vertex_input(struct tu_pipeline_builder *builder,
                                       struct tu_pipeline *pipeline)
{
   const VkPipelineVertexInputStateCreateInfo *vi_info =
      builder->create_info->pVertexInputState;
   const struct ir3_shader_variant *vs = builder->variants[MESA_SHADER_VERTEX];
   const struct ir3_shader_variant *bs = builder->binning_variant;

   /* Bindings may contain holes */
   for (unsigned i = 0; i < vi_info->vertexBindingDescriptionCount; i++) {
      pipeline->num_vbs =
         MAX2(pipeline->num_vbs, vi_info->pVertexBindingDescriptions[i].binding + 1);
   }

   struct tu_cs vi_cs;
   tu_cs_begin_sub_stream(&pipeline->cs,
                          MAX_VERTEX_ATTRIBS * 7 + 2, &vi_cs);
   tu6_emit_vertex_input(pipeline, &vi_cs, vs, vi_info);
   pipeline->vi.state = tu_cs_end_draw_state(&pipeline->cs, &vi_cs);

   if (bs) {
      tu_cs_begin_sub_stream(&pipeline->cs,
                             MAX_VERTEX_ATTRIBS * 7 + 2, &vi_cs);
      tu6_emit_vertex_input(pipeline, &vi_cs, bs, vi_info);
      pipeline->vi.binning_state =
         tu_cs_end_draw_state(&pipeline->cs, &vi_cs);
   }
}

/* Record the hardware primitive type and primitive-restart flag from the
 * input-assembly state.
 */
static void
tu_pipeline_builder_parse_input_assembly(struct tu_pipeline_builder *builder,
                                         struct tu_pipeline *pipeline)
{
   const VkPipelineInputAssemblyStateCreateInfo *ia_info =
      builder->create_info->pInputAssemblyState;

   pipeline->ia.primtype = tu6_primtype(ia_info->topology);
   pipeline->ia.primitive_restart = ia_info->primitiveRestartEnable;
}

/* Begin a `size`-dword draw state for dynamic-state slot `id` unless that
 * state is dynamic for this pipeline.  Returns true when the caller should
 * emit the static state into *cs.
 */
static bool
tu_pipeline_static_state(struct tu_pipeline *pipeline, struct tu_cs *cs,
                         uint32_t id, uint32_t size)
{
   assert(id < ARRAY_SIZE(pipeline->dynamic_state));

   if (pipeline->dynamic_state_mask & BIT(id))
      return false;

   pipeline->dynamic_state[id] = tu_cs_draw_state(&pipeline->cs, cs, size);
   return true;
}

/* Fill pipeline->tess (patch type, control points, domain origin, param
 * stride and HS/DS buffer regids) when both tessellation stages are active.
 */
static void
tu_pipeline_builder_parse_tessellation(struct tu_pipeline_builder *builder,
                                       struct tu_pipeline *pipeline)
{
   if (!(pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) ||
       !(pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT))
      return;

   const VkPipelineTessellationStateCreateInfo *tess_info =
      builder->create_info->pTessellationState;

   /* The patch primtype encodes the control-point count:
    * DI_PT_PATCHES0 + patchControlPoints.
    */
   assert(pipeline->ia.primtype == DI_PT_PATCHES0);
   assert(tess_info->patchControlPoints <= 32);
   pipeline->ia.primtype += tess_info->patchControlPoints;
   const VkPipelineTessellationDomainOriginStateCreateInfo *domain_info =
      vk_find_struct_const(tess_info->pNext,
PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO); 27137ec681f3Smrg pipeline->tess.upper_left_domain_origin = !domain_info || 27147ec681f3Smrg domain_info->domainOrigin == VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT; 27157ec681f3Smrg const struct ir3_shader_variant *hs = builder->variants[MESA_SHADER_TESS_CTRL]; 27167ec681f3Smrg const struct ir3_shader_variant *ds = builder->variants[MESA_SHADER_TESS_EVAL]; 27177ec681f3Smrg pipeline->tess.param_stride = hs->output_size * 4; 27187ec681f3Smrg pipeline->tess.hs_bo_regid = hs->const_state->offsets.primitive_param + 1; 27197ec681f3Smrg pipeline->tess.ds_bo_regid = ds->const_state->offsets.primitive_param + 1; 27207ec681f3Smrg} 27217ec681f3Smrg 2722361fc4cbSmayastatic void 2723361fc4cbSmayatu_pipeline_builder_parse_viewport(struct tu_pipeline_builder *builder, 2724361fc4cbSmaya struct tu_pipeline *pipeline) 2725361fc4cbSmaya{ 2726361fc4cbSmaya /* The spec says: 2727361fc4cbSmaya * 2728361fc4cbSmaya * pViewportState is a pointer to an instance of the 2729361fc4cbSmaya * VkPipelineViewportStateCreateInfo structure, and is ignored if the 2730361fc4cbSmaya * pipeline has rasterization disabled." 2731361fc4cbSmaya * 2732361fc4cbSmaya * We leave the relevant registers stale in that case. 
    */
   if (builder->rasterizer_discard)
      return;

   const VkPipelineViewportStateCreateInfo *vp_info =
      builder->create_info->pViewportState;

   struct tu_cs cs;

   /* Sizes below are worst-case dword counts per viewport/scissor. */
   if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_VIEWPORT, 8 + 10 * vp_info->viewportCount))
      tu6_emit_viewport(&cs, vp_info->pViewports, vp_info->viewportCount);

   if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_SCISSOR, 1 + 2 * vp_info->scissorCount))
      tu6_emit_scissor(&cs, vp_info->pScissors, vp_info->scissorCount);
}

/* Translate VkPipelineRasterizationStateCreateInfo (plus its pNext
 * extensions: depth-clip, line-rasterization, xfb stream, provoking vertex)
 * into GRAS/PC/VPC register state.
 */
static void
tu_pipeline_builder_parse_rasterization(struct tu_pipeline_builder *builder,
                                        struct tu_pipeline *pipeline)
{
   const VkPipelineRasterizationStateCreateInfo *rast_info =
      builder->create_info->pRasterizationState;

   enum a6xx_polygon_mode mode = tu6_polygon_mode(rast_info->polygonMode);

   /* Without VK_EXT_depth_clip_enable, depth clamping implies clip disable. */
   bool depth_clip_disable = rast_info->depthClampEnable;

   const VkPipelineRasterizationDepthClipStateCreateInfoEXT *depth_clip_state =
      vk_find_struct_const(rast_info, PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT);
   if (depth_clip_state)
      depth_clip_disable = !depth_clip_state->depthClipEnable;

   pipeline->line_mode = RECTANGULAR;

   if (tu6_primtype_line(pipeline->ia.primtype)) {
      const VkPipelineRasterizationLineStateCreateInfoEXT *rast_line_state =
         vk_find_struct_const(rast_info->pNext,
                              PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);

      if (rast_line_state && rast_line_state->lineRasterizationMode ==
             VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT) {
         pipeline->line_mode = BRESENHAM;
      }
   }

   struct tu_cs cs;
   /* Size must match exactly what is emitted below (see the optional
    * shading-rate and MSAA sections).
    */
   uint32_t cs_size = 9 +
      (builder->device->physical_device->info->a6xx.has_shading_rate ? 8 : 0) +
      (builder->emit_msaa_state ? 11 : 0);
   pipeline->rast_state = tu_cs_draw_state(&pipeline->cs, &cs, cs_size);

   tu_cs_emit_regs(&cs,
                   A6XX_GRAS_CL_CNTL(
                     .znear_clip_disable = depth_clip_disable,
                     .zfar_clip_disable = depth_clip_disable,
                     /* TODO should this be depth_clip_disable instead? */
                     .unk5 = rast_info->depthClampEnable,
                     .zero_gb_scale_z = 1,
                     .vp_clip_code_ignore = 1));

   tu_cs_emit_regs(&cs,
                   A6XX_VPC_POLYGON_MODE(mode));

   tu_cs_emit_regs(&cs,
                   A6XX_PC_POLYGON_MODE(mode));

   /* move to hw ctx init? */
   tu_cs_emit_regs(&cs,
                   A6XX_GRAS_SU_POINT_MINMAX(.min = 1.0f / 16.0f, .max = 4092.0f),
                   A6XX_GRAS_SU_POINT_SIZE(1.0f));

   /* Zero out the (still-reverse-engineered) shading-rate registers. */
   if (builder->device->physical_device->info->a6xx.has_shading_rate) {
      tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A00());
      tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A10());
      tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A20());
      tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A30());
   }

   /* If samples count couldn't be devised from the subpass, we should emit it here.
    * It happens when subpass doesn't use any color/depth attachment.
    */
   if (builder->emit_msaa_state)
      tu6_emit_msaa(&cs, builder->samples, pipeline->line_mode);

   const VkPipelineRasterizationStateStreamCreateInfoEXT *stream_info =
      vk_find_struct_const(rast_info->pNext,
                           PIPELINE_RASTERIZATION_STATE_STREAM_CREATE_INFO_EXT);
   unsigned stream = stream_info ? stream_info->rasterizationStream : 0;

   /* Cached so draw-time code can re-emit these when the discard state is
    * dynamic.
    */
   pipeline->pc_raster_cntl = A6XX_PC_RASTER_CNTL_STREAM(stream);
   pipeline->vpc_unknown_9107 = 0;
   if (rast_info->rasterizerDiscardEnable) {
      pipeline->pc_raster_cntl |= A6XX_PC_RASTER_CNTL_DISCARD;
      pipeline->vpc_unknown_9107 |= A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD;
   }

   if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RASTERIZER_DISCARD, 4)) {
      tu_cs_emit_regs(&cs, A6XX_PC_RASTER_CNTL(.dword = pipeline->pc_raster_cntl));
      tu_cs_emit_regs(&cs, A6XX_VPC_UNKNOWN_9107(.dword = pipeline->vpc_unknown_9107));
   }

   pipeline->gras_su_cntl =
      tu6_gras_su_cntl(rast_info, pipeline->line_mode, builder->multiview_mask != 0);

   if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_GRAS_SU_CNTL, 2))
      tu_cs_emit_regs(&cs, A6XX_GRAS_SU_CNTL(.dword = pipeline->gras_su_cntl));

   if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_DEPTH_BIAS, 4)) {
      tu6_emit_depth_bias(&cs, rast_info->depthBiasConstantFactor,
                          rast_info->depthBiasClamp,
                          rast_info->depthBiasSlopeFactor);
   }

   const struct VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *provoking_vtx_state =
      vk_find_struct_const(rast_info->pNext, PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT);
   /* Default provoking vertex is FIRST when the extension struct is absent. */
   pipeline->provoking_vertex_last = provoking_vtx_state &&
      provoking_vtx_state->provokingVertexMode == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT;
}

/* Translate depth/stencil state into RB_DEPTH_CNTL / RB_STENCIL_CONTROL and
 * the related static draw states, and derive LRZ force-disable flags from
 * the fragment shader.
 */
static void
tu_pipeline_builder_parse_depth_stencil(struct tu_pipeline_builder *builder,
                                        struct tu_pipeline *pipeline)
{
   /* The spec says:
    *
    *    pDepthStencilState is a pointer to an instance of the
    *    VkPipelineDepthStencilStateCreateInfo structure, and is ignored if
    *    the pipeline has rasterization disabled or if the subpass of the
    *    render pass the pipeline is created against does not use a
    *    depth/stencil attachment.
    */
   const VkPipelineDepthStencilStateCreateInfo *ds_info =
      builder->create_info->pDepthStencilState;
   const VkPipelineRasterizationStateCreateInfo *rast_info =
      builder->create_info->pRasterizationState;
   uint32_t rb_depth_cntl = 0, rb_stencil_cntl = 0;
   struct tu_cs cs;

   /* Depth test only applies when there is a depth aspect (not S8-only). */
   if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED &&
       builder->depth_attachment_format != VK_FORMAT_S8_UINT) {
      if (ds_info->depthTestEnable) {
         rb_depth_cntl |=
            A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE |
            A6XX_RB_DEPTH_CNTL_ZFUNC(tu6_compare_func(ds_info->depthCompareOp)) |
            A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE; /* TODO: don't set for ALWAYS/NEVER */

         if (rast_info->depthClampEnable)
            rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_CLAMP_ENABLE;

         if (ds_info->depthWriteEnable)
            rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
      }

      if (ds_info->depthBoundsTestEnable)
         rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE;

      if (ds_info->depthBoundsTestEnable && !ds_info->depthTestEnable)
         tu6_apply_depth_bounds_workaround(builder->device, &rb_depth_cntl);
   } else {
      /* if RB_DEPTH_CNTL is set dynamically, we need to make sure it is set
       * to 0 when this pipeline is used, as enabling depth test when there
       * is no depth attachment is a problem (at least for the S8_UINT case)
       */
      if (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL))
         pipeline->rb_depth_cntl_disable = true;
   }

   if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED) {
      const VkStencilOpState *front = &ds_info->front;
      const VkStencilOpState *back = &ds_info->back;

      /* Op/func fields are programmed unconditionally; the enable bits are
       * added below only when stencil testing is on.
       */
      rb_stencil_cntl |=
         A6XX_RB_STENCIL_CONTROL_FUNC(tu6_compare_func(front->compareOp)) |
         A6XX_RB_STENCIL_CONTROL_FAIL(tu6_stencil_op(front->failOp)) |
         A6XX_RB_STENCIL_CONTROL_ZPASS(tu6_stencil_op(front->passOp)) |
         A6XX_RB_STENCIL_CONTROL_ZFAIL(tu6_stencil_op(front->depthFailOp)) |
         A6XX_RB_STENCIL_CONTROL_FUNC_BF(tu6_compare_func(back->compareOp)) |
         A6XX_RB_STENCIL_CONTROL_FAIL_BF(tu6_stencil_op(back->failOp)) |
         A6XX_RB_STENCIL_CONTROL_ZPASS_BF(tu6_stencil_op(back->passOp)) |
         A6XX_RB_STENCIL_CONTROL_ZFAIL_BF(tu6_stencil_op(back->depthFailOp));

      if (ds_info->stencilTestEnable) {
         rb_stencil_cntl |=
            A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE |
            A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
            A6XX_RB_STENCIL_CONTROL_STENCIL_READ;
      }
   }

   if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RB_DEPTH_CNTL, 2)) {
      tu_cs_emit_pkt4(&cs, REG_A6XX_RB_DEPTH_CNTL, 1);
      tu_cs_emit(&cs, rb_depth_cntl);
   }
   pipeline->rb_depth_cntl = rb_depth_cntl;

   if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RB_STENCIL_CNTL, 2)) {
      tu_cs_emit_pkt4(&cs, REG_A6XX_RB_STENCIL_CONTROL, 1);
      tu_cs_emit(&cs, rb_stencil_cntl);
   }
   pipeline->rb_stencil_cntl = rb_stencil_cntl;

   /* the remaining draw states arent used if there is no d/s, leave them empty */
   if (builder->depth_attachment_format == VK_FORMAT_UNDEFINED)
      return;

   if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_DEPTH_BOUNDS, 3)) {
      tu_cs_emit_regs(&cs,
                      A6XX_RB_Z_BOUNDS_MIN(ds_info->minDepthBounds),
                      A6XX_RB_Z_BOUNDS_MAX(ds_info->maxDepthBounds));
   }

   if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, 2)) {
      tu_cs_emit_regs(&cs, A6XX_RB_STENCILMASK(.mask = ds_info->front.compareMask & 0xff,
                                               .bfmask = ds_info->back.compareMask & 0xff));
   }

   if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, 2)) {
      update_stencil_mask(&pipeline->stencil_wrmask, VK_STENCIL_FACE_FRONT_BIT, ds_info->front.writeMask);
      update_stencil_mask(&pipeline->stencil_wrmask, VK_STENCIL_FACE_BACK_BIT, ds_info->back.writeMask);
      tu_cs_emit_regs(&cs, A6XX_RB_STENCILWRMASK(.dword = pipeline->stencil_wrmask));
   }

   if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_REFERENCE, 2)) {
      tu_cs_emit_regs(&cs, A6XX_RB_STENCILREF(.ref = ds_info->front.reference & 0xff,
                                              .bfref = ds_info->back.reference & 0xff));
   }

   if (builder->shaders[MESA_SHADER_FRAGMENT]) {
      const struct ir3_shader_variant *fs = &builder->shaders[MESA_SHADER_FRAGMENT]->ir3_shader->variants[0];
      if (fs->has_kill || fs->no_earlyz || fs->writes_pos) {
         pipeline->lrz.force_disable_mask |= TU_LRZ_FORCE_DISABLE_WRITE;
      }
      if (fs->no_earlyz || fs->writes_pos) {
         /* NOTE(review): plain assignment (not |=) overwrites the WRITE bit
          * set just above -- presumably DISABLE_LRZ supersedes DISABLE_WRITE;
          * confirm against the TU_LRZ_FORCE_DISABLE_* definitions.
          */
         pipeline->lrz.force_disable_mask = TU_LRZ_FORCE_DISABLE_LRZ;
      }
   }
}

/* Emit MRT blend controls, blend constants and sample locations; also
 * force-disables LRZ writes when any attachment blends or masks channels.
 */
static void
tu_pipeline_builder_parse_multisample_and_color_blend(
   struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
{
   /* The spec says:
    *
    *    pMultisampleState is a pointer to an instance of the
    *    VkPipelineMultisampleStateCreateInfo, and is ignored if the pipeline
    *    has rasterization disabled.
    *
    * Also,
    *
    *    pColorBlendState is a pointer to an instance of the
    *    VkPipelineColorBlendStateCreateInfo structure, and is ignored if the
    *    pipeline has rasterization disabled or if the subpass of the render
    *    pass the pipeline is created against does not use any color
    *    attachments.
    *
    * We leave the relevant registers stale when rasterization is disabled.
    */
   if (builder->rasterizer_discard)
      return;

   /* Zero-initialized fallback when the subpass has no color attachments. */
   static const VkPipelineColorBlendStateCreateInfo dummy_blend_info;
   const VkPipelineMultisampleStateCreateInfo *msaa_info =
      builder->create_info->pMultisampleState;
   const VkPipelineColorBlendStateCreateInfo *blend_info =
      builder->use_color_attachments ? builder->create_info->pColorBlendState
                                     : &dummy_blend_info;

   struct tu_cs cs;
   /* 3 dwords per MRT + 4 dwords of blend control. */
   pipeline->blend_state =
      tu_cs_draw_state(&pipeline->cs, &cs, blend_info->attachmentCount * 3 + 4);

   uint32_t blend_enable_mask;
   tu6_emit_rb_mrt_controls(&cs, blend_info,
                            builder->color_attachment_formats,
                            &blend_enable_mask);

   tu6_emit_blend_control(&cs, blend_enable_mask,
                          builder->use_dual_src_blend, msaa_info);

   assert(cs.cur == cs.end); /* validate draw state size */

   if (blend_enable_mask) {
      for (int i = 0; i < blend_info->attachmentCount; i++) {
         VkPipelineColorBlendAttachmentState blendAttachment = blend_info->pAttachments[i];
         /* Disable LRZ writes when blend is enabled, since the
          * resulting pixel value from the blend-draw
          * depends on an earlier draw, which LRZ in the draw pass
          * could early-reject if the previous blend-enabled draw wrote LRZ.
          *
          * From the PoV of LRZ, having masked color channels is
          * the same as having blend enabled, in that the draw will
          * care about the fragments from an earlier draw.
          *
          * TODO: We need to disable LRZ writes only for the binning pass.
          * Therefore, we need to emit it in a separate draw state. We keep
          * it disabled for sysmem path as well for the moment.
          */
         if (blendAttachment.blendEnable || blendAttachment.colorWriteMask != 0xf) {
            pipeline->lrz.force_disable_mask |= TU_LRZ_FORCE_DISABLE_WRITE;
         }
      }
   }

   if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_BLEND_CONSTANTS, 5)) {
      tu_cs_emit_pkt4(&cs, REG_A6XX_RB_BLEND_RED_F32, 4);
      tu_cs_emit_array(&cs, (const uint32_t *) blend_info->blendConstants, 4);
   }

   const struct VkPipelineSampleLocationsStateCreateInfoEXT *sample_locations =
      vk_find_struct_const(msaa_info->pNext, PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT);
   const VkSampleLocationsInfoEXT *samp_loc = NULL;

   if (sample_locations && sample_locations->sampleLocationsEnable)
      samp_loc = &sample_locations->sampleLocationsInfo;

   /* NULL samp_loc emits the "disabled" form (6 dwords vs 9). */
   if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_SAMPLE_LOCATIONS,
                                samp_loc ? 9 : 6)) {
      tu6_emit_sample_locations(&cs, samp_loc);
   }
}

/* Release everything a pipeline owns; does not free the pipeline object
 * itself (callers use vk_object_free for that).
 */
static void
tu_pipeline_finish(struct tu_pipeline *pipeline,
                   struct tu_device *dev,
                   const VkAllocationCallbacks *alloc)
{
   tu_cs_finish(&pipeline->cs);

   if (pipeline->pvtmem_bo.size)
      tu_bo_finish(dev, &pipeline->pvtmem_bo);

   ralloc_free(pipeline->executables_mem_ctx);
}

/* Drive the whole graphics-pipeline build: allocate the object, compile
 * shaders, upload variants, set up private memory, then run every parse_*
 * stage to bake the static draw states.
 */
static VkResult
tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
                          struct tu_pipeline **pipeline)
{
   VkResult result;

   *pipeline = vk_object_zalloc(&builder->device->vk, builder->alloc,
                                sizeof(**pipeline), VK_OBJECT_TYPE_PIPELINE);
   if (!*pipeline)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   (*pipeline)->layout = builder->layout;
   (*pipeline)->executables_mem_ctx = ralloc_context(NULL);
   util_dynarray_init(&(*pipeline)->executables, (*pipeline)->executables_mem_ctx);

   /* compile and upload shaders */
   result = tu_pipeline_builder_compile_shaders(builder, *pipeline);
   if (result != VK_SUCCESS) {
      vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
      return result;
   }

   result = tu_pipeline_allocate_cs(builder->device, *pipeline, builder, NULL);
   if (result != VK_SUCCESS) {
      vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
      return result;
   }

   for (uint32_t i =
 0; i < ARRAY_SIZE(builder->variants); i++)
      builder->shader_iova[i] = tu_upload_variant(*pipeline, builder->variants[i]);

   builder->binning_vs_iova =
      tu_upload_variant(*pipeline, builder->binning_variant);

   /* Setup private memory. Note that because we're sharing the same private
    * memory for all stages, all stages must use the same config, or else
    * fibers from one stage might overwrite fibers in another.
    */

   uint32_t pvtmem_size = 0;
   bool per_wave = true;
   for (uint32_t i = 0; i < ARRAY_SIZE(builder->variants); i++) {
      if (builder->variants[i]) {
         pvtmem_size = MAX2(pvtmem_size, builder->variants[i]->pvtmem_size);
         if (!builder->variants[i]->pvtmem_per_wave)
            per_wave = false;
      }
   }

   if (builder->binning_variant) {
      pvtmem_size = MAX2(pvtmem_size, builder->binning_variant->pvtmem_size);
      if (!builder->binning_variant->pvtmem_per_wave)
         per_wave = false;
   }

   result = tu_setup_pvtmem(builder->device, *pipeline, &builder->pvtmem,
                            pvtmem_size, per_wave);
   if (result != VK_SUCCESS) {
      vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
      return result;
   }

   /* Bake all static state into the pipeline's command-stream BO. */
   tu_pipeline_builder_parse_dynamic(builder, *pipeline);
   tu_pipeline_builder_parse_shader_stages(builder, *pipeline);
   tu_pipeline_builder_parse_vertex_input(builder, *pipeline);
   tu_pipeline_builder_parse_input_assembly(builder, *pipeline);
   tu_pipeline_builder_parse_tessellation(builder, *pipeline);
   tu_pipeline_builder_parse_viewport(builder, *pipeline);
   tu_pipeline_builder_parse_rasterization(builder, *pipeline);
   tu_pipeline_builder_parse_depth_stencil(builder, *pipeline);
   tu_pipeline_builder_parse_multisample_and_color_blend(builder, *pipeline);
   tu6_emit_load_state(*pipeline, false);

   /* we should have reserved enough space upfront such that the CS never
    * grows
    */
   assert((*pipeline)->cs.bo_count == 1);

   return VK_SUCCESS;
}

/* Destroy the temporary per-build shader objects (variants already uploaded
 * into the pipeline survive this).
 */
static void
tu_pipeline_builder_finish(struct tu_pipeline_builder *builder)
{
   for (uint32_t i = 0; i < ARRAY_SIZE(builder->shaders); i++) {
      if (!builder->shaders[i])
         continue;
      tu_shader_destroy(builder->device, builder->shaders[i], builder->alloc);
   }
}

/* Initialize the builder from the create info: resolve the render pass /
 * subpass, and precompute attachment formats, sample counts and
 * rasterizer-discard state used by the parse_* stages.
 */
static void
tu_pipeline_builder_init_graphics(
   struct tu_pipeline_builder *builder,
   struct tu_device *dev,
   struct tu_pipeline_cache *cache,
   const VkGraphicsPipelineCreateInfo *create_info,
   const VkAllocationCallbacks *alloc)
{
   TU_FROM_HANDLE(tu_pipeline_layout, layout, create_info->layout);

   *builder = (struct tu_pipeline_builder) {
      .device = dev,
      .cache = cache,
      .create_info = create_info,
      .alloc = alloc,
      .layout = layout,
   };

   /* When rasterizer discard is dynamic, the pipeline must be built as if
    * rasterization were enabled.
    */
   bool rasterizer_discard_dynamic = false;
   if (create_info->pDynamicState) {
      for (uint32_t i = 0; i < create_info->pDynamicState->dynamicStateCount; i++) {
         if (create_info->pDynamicState->pDynamicStates[i] ==
               VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT) {
            rasterizer_discard_dynamic = true;
            break;
         }
      }
   }

   const struct tu_render_pass *pass =
      tu_render_pass_from_handle(create_info->renderPass);
   const struct tu_subpass *subpass =
      &pass->subpasses[create_info->subpass];

   builder->multiview_mask = subpass->multiview_mask;

   builder->rasterizer_discard =
      builder->create_info->pRasterizationState->rasterizerDiscardEnable &&
      !rasterizer_discard_dynamic;

   /* variableMultisampleRate support */
   builder->emit_msaa_state = (subpass->samples == 0) && !builder->rasterizer_discard;

   if (builder->rasterizer_discard) {
      builder->samples = VK_SAMPLE_COUNT_1_BIT;
   } else {
      builder->samples = create_info->pMultisampleState->rasterizationSamples;
      builder->alpha_to_coverage = create_info->pMultisampleState->alphaToCoverageEnable;

      const uint32_t a = subpass->depth_stencil_attachment.attachment;
      builder->depth_attachment_format = (a != VK_ATTACHMENT_UNUSED) ?
         pass->attachments[a].format : VK_FORMAT_UNDEFINED;

      assert(subpass->color_count == 0 ||
             !create_info->pColorBlendState ||
             subpass->color_count == create_info->pColorBlendState->attachmentCount);
      builder->color_attachment_count = subpass->color_count;
      for (uint32_t i = 0; i < subpass->color_count; i++) {
         const uint32_t a = subpass->color_attachments[i].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;

         builder->color_attachment_formats[i] = pass->attachments[a].format;
         builder->use_color_attachments = true;
         /* 4 component-enable bits per render target. */
         builder->render_components |= 0xf << (i * 4);
      }

      if (tu_blend_state_is_dual_src(create_info->pColorBlendState)) {
         builder->color_attachment_count++;
         builder->use_dual_src_blend = true;
         /* dual source blending has an extra fs output in the 2nd slot */
         if (subpass->color_attachments[0].attachment != VK_ATTACHMENT_UNUSED)
            builder->render_components |= 0xf << 4;
      }
   }
}

/* Create a single graphics pipeline: init the builder, build, then tear the
 * builder down regardless of the build result.
 */
static VkResult
tu_graphics_pipeline_create(VkDevice device,
                            VkPipelineCache pipelineCache,
                            const VkGraphicsPipelineCreateInfo *pCreateInfo,
                            const VkAllocationCallbacks *pAllocator,
                            VkPipeline *pPipeline)
{
   TU_FROM_HANDLE(tu_device, dev, device);
   TU_FROM_HANDLE(tu_pipeline_cache, cache, pipelineCache);

   struct tu_pipeline_builder builder;
   tu_pipeline_builder_init_graphics(&builder, dev, cache,
                                     pCreateInfo, pAllocator);
   struct tu_pipeline *pipeline = NULL;
   VkResult result = tu_pipeline_builder_build(&builder, &pipeline);
   tu_pipeline_builder_finish(&builder);

   if (result == VK_SUCCESS)
      *pPipeline = tu_pipeline_to_handle(pipeline);
   else
      *pPipeline = VK_NULL_HANDLE;

   return result;
}

/* vkCreateGraphicsPipelines: create each pipeline independently; per spec,
 * a failure fills that slot with VK_NULL_HANDLE (done in
 * tu_graphics_pipeline_create) and the overall result is the last failure.
 */
VKAPI_ATTR VkResult VKAPI_CALL
tu_CreateGraphicsPipelines(VkDevice device,
                           VkPipelineCache pipelineCache,
                           uint32_t count,
                           const VkGraphicsPipelineCreateInfo *pCreateInfos,
                           const VkAllocationCallbacks *pAllocator,
                           VkPipeline *pPipelines)
{
   VkResult final_result = VK_SUCCESS;

   for (uint32_t i = 0; i < count; i++) {
      VkResult result = tu_graphics_pipeline_create(device, pipelineCache,
                                                    &pCreateInfos[i], pAllocator,
                                                    &pPipelines[i]);

      if (result != VK_SUCCESS)
         final_result = result;
   }

   return final_result;
}

/* Create a compute pipeline: SPIR-V -> NIR -> IR3 variant, then emit the
 * CS program config and descriptor load state.  The temporary tu_shader is
 * destroyed on both the success and failure paths.
 */
static VkResult
tu_compute_pipeline_create(VkDevice device,
                           VkPipelineCache _cache,
                           const VkComputePipelineCreateInfo *pCreateInfo,
                           const VkAllocationCallbacks *pAllocator,
                           VkPipeline *pPipeline)
{
   TU_FROM_HANDLE(tu_device, dev, device);
   TU_FROM_HANDLE(tu_pipeline_layout, layout, pCreateInfo->layout);
   const VkPipelineShaderStageCreateInfo *stage_info = &pCreateInfo->stage;
   VkResult result;

   struct tu_pipeline *pipeline;

   *pPipeline = VK_NULL_HANDLE;

   pipeline = vk_object_zalloc(&dev->vk, pAllocator, sizeof(*pipeline),
                               VK_OBJECT_TYPE_PIPELINE);
   if (!pipeline)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   pipeline->layout = layout;

   pipeline->executables_mem_ctx = ralloc_context(NULL);
   util_dynarray_init(&pipeline->executables, pipeline->executables_mem_ctx);

   struct ir3_shader_key key = {};

   /* NOTE(review): nir is not checked for NULL and not explicitly freed on
    * the fail path -- presumably tu_shader_create takes ownership; confirm.
    */
   nir_shader *nir = tu_spirv_to_nir(dev, stage_info, MESA_SHADER_COMPUTE);

   const bool executable_info = pCreateInfo->flags &
      VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;

   char *nir_initial_disasm = executable_info ?
      nir_shader_as_str(nir, pipeline->executables_mem_ctx) : NULL;

   struct tu_shader *shader =
      tu_shader_create(dev, nir, 0, layout, pAllocator);
   if (!shader) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto fail;
   }

   pipeline->active_desc_sets = shader->active_desc_sets;

   bool created;
   struct ir3_shader_variant *v =
      ir3_shader_get_variant(shader->ir3_shader, &key, false, executable_info, &created);
   if (!v) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto fail;
   }

   tu_pipeline_set_linkage(&pipeline->program.link[MESA_SHADER_COMPUTE],
                           shader, v);

   result = tu_pipeline_allocate_cs(dev, pipeline, NULL, v);
   if (result != VK_SUCCESS)
      goto fail;

   uint64_t shader_iova = tu_upload_variant(pipeline, v);

   struct tu_pvtmem_config pvtmem;
   tu_setup_pvtmem(dev, pipeline, &pvtmem, v->pvtmem_size, v->pvtmem_per_wave);

   for (int i = 0; i < 3; i++)
      pipeline->compute.local_size[i] = v->local_size[i];

   pipeline->compute.subgroup_size = v->info.double_threadsize ? 128 : 64;

   struct tu_cs prog_cs;
   uint32_t additional_reserve_size = tu_xs_get_additional_cs_size_dwords(v);
   tu_cs_begin_sub_stream(&pipeline->cs, 64 + additional_reserve_size, &prog_cs);
   tu6_emit_cs_config(&prog_cs, shader, v, &pvtmem, shader_iova);
   pipeline->program.state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);

   tu6_emit_load_state(pipeline, true);

   tu_append_executable(pipeline, v, nir_initial_disasm);

   tu_shader_destroy(dev, shader, pAllocator);

   *pPipeline = tu_pipeline_to_handle(pipeline);

   return VK_SUCCESS;

fail:
   if (shader)
      tu_shader_destroy(dev, shader, pAllocator);

   vk_object_free(&dev->vk, pAllocator, pipeline);

   return result;
}

VKAPI_ATTR VkResult VKAPI_CALL
tu_CreateComputePipelines(VkDevice device,
                          VkPipelineCache pipelineCache,
                          uint32_t count,
                          const VkComputePipelineCreateInfo *pCreateInfos,
                          const VkAllocationCallbacks *pAllocator,
                          VkPipeline *pPipelines)
{
   VkResult
final_result = VK_SUCCESS; 3388361fc4cbSmaya 33897ec681f3Smrg for (uint32_t i = 0; i < count; i++) { 33907ec681f3Smrg VkResult result = tu_compute_pipeline_create(device, pipelineCache, 33917ec681f3Smrg &pCreateInfos[i], 33927ec681f3Smrg pAllocator, &pPipelines[i]); 33937ec681f3Smrg if (result != VK_SUCCESS) 33947ec681f3Smrg final_result = result; 3395361fc4cbSmaya } 3396361fc4cbSmaya 33977ec681f3Smrg return final_result; 3398361fc4cbSmaya} 3399361fc4cbSmaya 34007ec681f3SmrgVKAPI_ATTR void VKAPI_CALL 3401361fc4cbSmayatu_DestroyPipeline(VkDevice _device, 3402361fc4cbSmaya VkPipeline _pipeline, 3403361fc4cbSmaya const VkAllocationCallbacks *pAllocator) 3404361fc4cbSmaya{ 3405361fc4cbSmaya TU_FROM_HANDLE(tu_device, dev, _device); 3406361fc4cbSmaya TU_FROM_HANDLE(tu_pipeline, pipeline, _pipeline); 3407361fc4cbSmaya 3408361fc4cbSmaya if (!_pipeline) 3409361fc4cbSmaya return; 3410361fc4cbSmaya 3411361fc4cbSmaya tu_pipeline_finish(pipeline, dev, pAllocator); 34127ec681f3Smrg vk_object_free(&dev->vk, pAllocator, pipeline); 34137ec681f3Smrg} 34147ec681f3Smrg 34157ec681f3Smrg#define WRITE_STR(field, ...) 
({ \ 34167ec681f3Smrg memset(field, 0, sizeof(field)); \ 34177ec681f3Smrg UNUSED int _i = snprintf(field, sizeof(field), __VA_ARGS__); \ 34187ec681f3Smrg assert(_i > 0 && _i < sizeof(field)); \ 34197ec681f3Smrg}) 34207ec681f3Smrg 34217ec681f3Smrgstatic const struct tu_pipeline_executable * 34227ec681f3Smrgtu_pipeline_get_executable(struct tu_pipeline *pipeline, uint32_t index) 34237ec681f3Smrg{ 34247ec681f3Smrg assert(index < util_dynarray_num_elements(&pipeline->executables, 34257ec681f3Smrg struct tu_pipeline_executable)); 34267ec681f3Smrg return util_dynarray_element( 34277ec681f3Smrg &pipeline->executables, struct tu_pipeline_executable, index); 34287ec681f3Smrg} 34297ec681f3Smrg 34307ec681f3SmrgVKAPI_ATTR VkResult VKAPI_CALL 34317ec681f3Smrgtu_GetPipelineExecutablePropertiesKHR( 34327ec681f3Smrg VkDevice _device, 34337ec681f3Smrg const VkPipelineInfoKHR* pPipelineInfo, 34347ec681f3Smrg uint32_t* pExecutableCount, 34357ec681f3Smrg VkPipelineExecutablePropertiesKHR* pProperties) 34367ec681f3Smrg{ 34377ec681f3Smrg TU_FROM_HANDLE(tu_device, dev, _device); 34387ec681f3Smrg TU_FROM_HANDLE(tu_pipeline, pipeline, pPipelineInfo->pipeline); 34397ec681f3Smrg VK_OUTARRAY_MAKE(out, pProperties, pExecutableCount); 34407ec681f3Smrg 34417ec681f3Smrg util_dynarray_foreach (&pipeline->executables, struct tu_pipeline_executable, exe) { 34427ec681f3Smrg vk_outarray_append(&out, props) { 34437ec681f3Smrg gl_shader_stage stage = exe->stage; 34447ec681f3Smrg props->stages = mesa_to_vk_shader_stage(stage); 34457ec681f3Smrg 34467ec681f3Smrg if (!exe->is_binning) 34477ec681f3Smrg WRITE_STR(props->name, "%s", _mesa_shader_stage_to_abbrev(stage)); 34487ec681f3Smrg else 34497ec681f3Smrg WRITE_STR(props->name, "Binning VS"); 34507ec681f3Smrg 34517ec681f3Smrg WRITE_STR(props->description, "%s", _mesa_shader_stage_to_string(stage)); 34527ec681f3Smrg 34537ec681f3Smrg props->subgroupSize = 34547ec681f3Smrg dev->compiler->threadsize_base * (exe->stats.double_threadsize ? 
2 : 1); 34557ec681f3Smrg } 34567ec681f3Smrg } 34577ec681f3Smrg 34587ec681f3Smrg return vk_outarray_status(&out); 34597ec681f3Smrg} 34607ec681f3Smrg 34617ec681f3SmrgVKAPI_ATTR VkResult VKAPI_CALL 34627ec681f3Smrgtu_GetPipelineExecutableStatisticsKHR( 34637ec681f3Smrg VkDevice _device, 34647ec681f3Smrg const VkPipelineExecutableInfoKHR* pExecutableInfo, 34657ec681f3Smrg uint32_t* pStatisticCount, 34667ec681f3Smrg VkPipelineExecutableStatisticKHR* pStatistics) 34677ec681f3Smrg{ 34687ec681f3Smrg TU_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline); 34697ec681f3Smrg VK_OUTARRAY_MAKE(out, pStatistics, pStatisticCount); 34707ec681f3Smrg 34717ec681f3Smrg const struct tu_pipeline_executable *exe = 34727ec681f3Smrg tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex); 34737ec681f3Smrg 34747ec681f3Smrg vk_outarray_append(&out, stat) { 34757ec681f3Smrg WRITE_STR(stat->name, "Max Waves Per Core"); 34767ec681f3Smrg WRITE_STR(stat->description, 34777ec681f3Smrg "Maximum number of simultaneous waves per core."); 34787ec681f3Smrg stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 34797ec681f3Smrg stat->value.u64 = exe->stats.max_waves; 34807ec681f3Smrg } 34817ec681f3Smrg 34827ec681f3Smrg vk_outarray_append(&out, stat) { 34837ec681f3Smrg WRITE_STR(stat->name, "Instruction Count"); 34847ec681f3Smrg WRITE_STR(stat->description, 34857ec681f3Smrg "Total number of IR3 instructions in the final generated " 34867ec681f3Smrg "shader executable."); 34877ec681f3Smrg stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 34887ec681f3Smrg stat->value.u64 = exe->stats.instrs_count; 34897ec681f3Smrg } 34907ec681f3Smrg 34917ec681f3Smrg vk_outarray_append(&out, stat) { 34927ec681f3Smrg WRITE_STR(stat->name, "NOPs Count"); 34937ec681f3Smrg WRITE_STR(stat->description, 34947ec681f3Smrg "Number of NOP instructions in the final generated " 34957ec681f3Smrg "shader executable."); 34967ec681f3Smrg stat->format = 
VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 34977ec681f3Smrg stat->value.u64 = exe->stats.nops_count; 34987ec681f3Smrg } 34997ec681f3Smrg 35007ec681f3Smrg vk_outarray_append(&out, stat) { 35017ec681f3Smrg WRITE_STR(stat->name, "MOV Count"); 35027ec681f3Smrg WRITE_STR(stat->description, 35037ec681f3Smrg "Number of MOV instructions in the final generated " 35047ec681f3Smrg "shader executable."); 35057ec681f3Smrg stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 35067ec681f3Smrg stat->value.u64 = exe->stats.mov_count; 35077ec681f3Smrg } 35087ec681f3Smrg 35097ec681f3Smrg vk_outarray_append(&out, stat) { 35107ec681f3Smrg WRITE_STR(stat->name, "COV Count"); 35117ec681f3Smrg WRITE_STR(stat->description, 35127ec681f3Smrg "Number of COV instructions in the final generated " 35137ec681f3Smrg "shader executable."); 35147ec681f3Smrg stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 35157ec681f3Smrg stat->value.u64 = exe->stats.cov_count; 35167ec681f3Smrg } 35177ec681f3Smrg 35187ec681f3Smrg vk_outarray_append(&out, stat) { 35197ec681f3Smrg WRITE_STR(stat->name, "Registers used"); 35207ec681f3Smrg WRITE_STR(stat->description, 35217ec681f3Smrg "Number of registers used in the final generated " 35227ec681f3Smrg "shader executable."); 35237ec681f3Smrg stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 35247ec681f3Smrg stat->value.u64 = exe->stats.max_reg + 1; 35257ec681f3Smrg } 35267ec681f3Smrg 35277ec681f3Smrg vk_outarray_append(&out, stat) { 35287ec681f3Smrg WRITE_STR(stat->name, "Half-registers used"); 35297ec681f3Smrg WRITE_STR(stat->description, 35307ec681f3Smrg "Number of half-registers used in the final generated " 35317ec681f3Smrg "shader executable."); 35327ec681f3Smrg stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 35337ec681f3Smrg stat->value.u64 = exe->stats.max_half_reg + 1; 35347ec681f3Smrg } 35357ec681f3Smrg 35367ec681f3Smrg vk_outarray_append(&out, stat) { 35377ec681f3Smrg 
WRITE_STR(stat->name, "Instructions with SS sync bit"); 35387ec681f3Smrg WRITE_STR(stat->description, 35397ec681f3Smrg "SS bit is set for instructions which depend on a result " 35407ec681f3Smrg "of \"long\" instructions to prevent RAW hazard."); 35417ec681f3Smrg stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 35427ec681f3Smrg stat->value.u64 = exe->stats.ss; 35437ec681f3Smrg } 35447ec681f3Smrg 35457ec681f3Smrg vk_outarray_append(&out, stat) { 35467ec681f3Smrg WRITE_STR(stat->name, "Instructions with SY sync bit"); 35477ec681f3Smrg WRITE_STR(stat->description, 35487ec681f3Smrg "SY bit is set for instructions which depend on a result " 35497ec681f3Smrg "of loads from global memory to prevent RAW hazard."); 35507ec681f3Smrg stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 35517ec681f3Smrg stat->value.u64 = exe->stats.sy; 35527ec681f3Smrg } 35537ec681f3Smrg 35547ec681f3Smrg vk_outarray_append(&out, stat) { 35557ec681f3Smrg WRITE_STR(stat->name, "Estimated cycles stalled on SS"); 35567ec681f3Smrg WRITE_STR(stat->description, 35577ec681f3Smrg "A better metric to estimate the impact of SS syncs."); 35587ec681f3Smrg stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 35597ec681f3Smrg stat->value.u64 = exe->stats.sstall; 35607ec681f3Smrg } 35617ec681f3Smrg 35627ec681f3Smrg for (int i = 0; i < ARRAY_SIZE(exe->stats.instrs_per_cat); i++) { 35637ec681f3Smrg vk_outarray_append(&out, stat) { 35647ec681f3Smrg WRITE_STR(stat->name, "cat%d instructions", i); 35657ec681f3Smrg WRITE_STR(stat->description, 35667ec681f3Smrg "Number of cat%d instructions.", i); 35677ec681f3Smrg stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 35687ec681f3Smrg stat->value.u64 = exe->stats.instrs_per_cat[i]; 35697ec681f3Smrg } 35707ec681f3Smrg } 35717ec681f3Smrg 35727ec681f3Smrg vk_outarray_append(&out, stat) { 35737ec681f3Smrg WRITE_STR(stat->name, "STP Count"); 35747ec681f3Smrg WRITE_STR(stat->description, 35757ec681f3Smrg "Number of 
STore Private instructions in the final generated " 35767ec681f3Smrg "shader executable."); 35777ec681f3Smrg stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 35787ec681f3Smrg stat->value.u64 = exe->stats.stp_count; 35797ec681f3Smrg } 35807ec681f3Smrg 35817ec681f3Smrg vk_outarray_append(&out, stat) { 35827ec681f3Smrg WRITE_STR(stat->name, "LDP Count"); 35837ec681f3Smrg WRITE_STR(stat->description, 35847ec681f3Smrg "Number of LoaD Private instructions in the final generated " 35857ec681f3Smrg "shader executable."); 35867ec681f3Smrg stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 35877ec681f3Smrg stat->value.u64 = exe->stats.ldp_count; 35887ec681f3Smrg } 35897ec681f3Smrg 35907ec681f3Smrg return vk_outarray_status(&out); 35917ec681f3Smrg} 35927ec681f3Smrg 35937ec681f3Smrgstatic bool 35947ec681f3Smrgwrite_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir, 35957ec681f3Smrg const char *data) 35967ec681f3Smrg{ 35977ec681f3Smrg ir->isText = VK_TRUE; 35987ec681f3Smrg 35997ec681f3Smrg size_t data_len = strlen(data) + 1; 36007ec681f3Smrg 36017ec681f3Smrg if (ir->pData == NULL) { 36027ec681f3Smrg ir->dataSize = data_len; 36037ec681f3Smrg return true; 36047ec681f3Smrg } 36057ec681f3Smrg 36067ec681f3Smrg strncpy(ir->pData, data, ir->dataSize); 36077ec681f3Smrg if (ir->dataSize < data_len) 36087ec681f3Smrg return false; 36097ec681f3Smrg 36107ec681f3Smrg ir->dataSize = data_len; 36117ec681f3Smrg return true; 36127ec681f3Smrg} 36137ec681f3Smrg 36147ec681f3SmrgVKAPI_ATTR VkResult VKAPI_CALL 36157ec681f3Smrgtu_GetPipelineExecutableInternalRepresentationsKHR( 36167ec681f3Smrg VkDevice _device, 36177ec681f3Smrg const VkPipelineExecutableInfoKHR* pExecutableInfo, 36187ec681f3Smrg uint32_t* pInternalRepresentationCount, 36197ec681f3Smrg VkPipelineExecutableInternalRepresentationKHR* pInternalRepresentations) 36207ec681f3Smrg{ 36217ec681f3Smrg TU_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline); 36227ec681f3Smrg 
VK_OUTARRAY_MAKE(out, pInternalRepresentations, pInternalRepresentationCount); 36237ec681f3Smrg bool incomplete_text = false; 36247ec681f3Smrg 36257ec681f3Smrg const struct tu_pipeline_executable *exe = 36267ec681f3Smrg tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex); 36277ec681f3Smrg 36287ec681f3Smrg if (exe->nir_from_spirv) { 36297ec681f3Smrg vk_outarray_append(&out, ir) { 36307ec681f3Smrg WRITE_STR(ir->name, "NIR from SPIRV"); 36317ec681f3Smrg WRITE_STR(ir->description, 36327ec681f3Smrg "Initial NIR before any optimizations"); 36337ec681f3Smrg 36347ec681f3Smrg if (!write_ir_text(ir, exe->nir_from_spirv)) 36357ec681f3Smrg incomplete_text = true; 36367ec681f3Smrg } 36377ec681f3Smrg } 36387ec681f3Smrg 36397ec681f3Smrg if (exe->nir_final) { 36407ec681f3Smrg vk_outarray_append(&out, ir) { 36417ec681f3Smrg WRITE_STR(ir->name, "Final NIR"); 36427ec681f3Smrg WRITE_STR(ir->description, 36437ec681f3Smrg "Final NIR before going into the back-end compiler"); 36447ec681f3Smrg 36457ec681f3Smrg if (!write_ir_text(ir, exe->nir_final)) 36467ec681f3Smrg incomplete_text = true; 36477ec681f3Smrg } 36487ec681f3Smrg } 36497ec681f3Smrg 36507ec681f3Smrg if (exe->disasm) { 36517ec681f3Smrg vk_outarray_append(&out, ir) { 36527ec681f3Smrg WRITE_STR(ir->name, "IR3 Assembly"); 36537ec681f3Smrg WRITE_STR(ir->description, 36547ec681f3Smrg "Final IR3 assembly for the generated shader binary"); 36557ec681f3Smrg 36567ec681f3Smrg if (!write_ir_text(ir, exe->disasm)) 36577ec681f3Smrg incomplete_text = true; 36587ec681f3Smrg } 36597ec681f3Smrg } 36607ec681f3Smrg 36617ec681f3Smrg return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out); 3662361fc4cbSmaya} 3663