radv_pipeline.c revision 7ec681f3
/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir/nir.h"
#include "nir/nir_builder.h"
#include "nir/nir_xfb_info.h"
#include "spirv/nir_spirv.h"
#include "util/disk_cache.h"
#include "util/mesa-sha1.h"
#include "util/u_atomic.h"
#include "radv_cs.h"
#include "radv_debug.h"
#include "radv_private.h"
#include "radv_shader.h"
#include "vk_util.h"

#include "util/debug.h"
#include "ac_binary.h"
#include "ac_exp_param.h"
#include "ac_nir.h"
#include "ac_shader_util.h"
#include "aco_interface.h"
#include "sid.h"
#include "vk_format.h"

struct radv_blend_state {
   uint32_t blend_enable_4bit;
   uint32_t need_src_alpha;

   uint32_t cb_target_mask;
   uint32_t cb_target_enabled_4bit;
   uint32_t sx_mrt_blend_opt[8];
   uint32_t cb_blend_control[8];

   uint32_t spi_shader_col_format;
   uint32_t col_format_is_int8;
   uint32_t col_format_is_int10;
   uint32_t cb_shader_mask;
   uint32_t db_alpha_to_mask;

   uint32_t commutative_4bit;

   bool single_cb_enable;
   bool mrt0_is_dual_src;
};
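/* Illustrative note (not in the original source): the *_4bit masks above use
 * one nibble per color target, i.e. bits [4*i, 4*i+3] describe MRT i. For
 * example, enabling blending only on target 2 corresponds to
 * blend_enable_4bit = 0xf << (2 * 4) = 0xf00, which is how the init code
 * below builds these masks.
 */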
struct radv_dsa_order_invariance {
   /* Whether the final result in Z/S buffers is guaranteed to be
    * invariant under changes to the order in which fragments arrive.
    */
   bool zs;

   /* Whether the set of fragments that pass the combined Z/S test is
    * guaranteed to be invariant under changes to the order in which
    * fragments arrive.
    */
   bool pass_set;
};

static bool
radv_is_state_dynamic(const VkGraphicsPipelineCreateInfo *pCreateInfo, VkDynamicState state)
{
   if (pCreateInfo->pDynamicState) {
      uint32_t count = pCreateInfo->pDynamicState->dynamicStateCount;
      for (uint32_t i = 0; i < count; i++) {
         if (pCreateInfo->pDynamicState->pDynamicStates[i] == state)
            return true;
      }
   }

   return false;
}

static const VkPipelineMultisampleStateCreateInfo *
radv_pipeline_get_multisample_state(const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
   if (!pCreateInfo->pRasterizationState->rasterizerDiscardEnable ||
       radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT))
      return pCreateInfo->pMultisampleState;
   return NULL;
}

static const VkPipelineTessellationStateCreateInfo *
radv_pipeline_get_tessellation_state(const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
   for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
      if (pCreateInfo->pStages[i].stage == VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT ||
          pCreateInfo->pStages[i].stage == VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT) {
         return pCreateInfo->pTessellationState;
      }
   }
   return NULL;
}

static const VkPipelineDepthStencilStateCreateInfo *
radv_pipeline_get_depth_stencil_state(const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
   RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
   struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;

   if ((!pCreateInfo->pRasterizationState->rasterizerDiscardEnable &&
        subpass->depth_stencil_attachment) ||
       radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT))
      return pCreateInfo->pDepthStencilState;
   return NULL;
}

static const VkPipelineColorBlendStateCreateInfo *
radv_pipeline_get_color_blend_state(const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
   RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
   struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;

   if ((!pCreateInfo->pRasterizationState->rasterizerDiscardEnable && subpass->has_color_att) ||
       radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT))
      return pCreateInfo->pColorBlendState;
   return NULL;
}

static bool
radv_pipeline_has_ngg(const struct radv_pipeline *pipeline)
{
   if (pipeline->graphics.last_vgt_api_stage == MESA_SHADER_NONE)
      return false;

   struct radv_shader_variant *variant =
      pipeline->shaders[pipeline->graphics.last_vgt_api_stage];

   return variant->info.is_ngg;
}

bool
radv_pipeline_has_ngg_passthrough(const struct radv_pipeline *pipeline)
{
   if (pipeline->graphics.last_vgt_api_stage == MESA_SHADER_NONE)
      return false;

   assert(radv_pipeline_has_ngg(pipeline));

   struct radv_shader_variant *variant =
      pipeline->shaders[pipeline->graphics.last_vgt_api_stage];

   return variant->info.is_ngg_passthrough;
}

bool
radv_pipeline_has_gs_copy_shader(const struct radv_pipeline *pipeline)
{
   return !!pipeline->gs_copy_shader;
}

void
radv_pipeline_destroy(struct radv_device *device, struct radv_pipeline *pipeline,
                      const VkAllocationCallbacks *allocator)
{
   if (pipeline->type == RADV_PIPELINE_COMPUTE) {
      free(pipeline->compute.rt_group_handles);
      free(pipeline->compute.rt_stack_sizes);
   } else if (pipeline->type == RADV_PIPELINE_LIBRARY) {
      free(pipeline->library.groups);
      free(pipeline->library.stages);
   }

   for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i)
      if (pipeline->shaders[i])
         radv_shader_variant_destroy(device, pipeline->shaders[i]);

   if (pipeline->gs_copy_shader)
      radv_shader_variant_destroy(device, pipeline->gs_copy_shader);

   if (pipeline->cs.buf)
      free(pipeline->cs.buf);

   vk_object_base_finish(&pipeline->base);
   vk_free2(&device->vk.alloc, allocator, pipeline);
}

void
radv_DestroyPipeline(VkDevice _device, VkPipeline _pipeline,
                     const VkAllocationCallbacks *pAllocator)
{
   RADV_FROM_HANDLE(radv_device, device, _device);
   RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);

   if (!_pipeline)
      return;

   radv_pipeline_destroy(device, pipeline, pAllocator);
}

uint32_t
radv_get_hash_flags(const struct radv_device *device, bool stats)
{
   uint32_t hash_flags = 0;

   if (device->physical_device->use_ngg_culling)
      hash_flags |= RADV_HASH_SHADER_USE_NGG_CULLING;
   if (device->instance->perftest_flags & RADV_PERFTEST_FORCE_EMULATE_RT)
      hash_flags |= RADV_HASH_SHADER_FORCE_EMULATE_RT;
   if (device->physical_device->cs_wave_size == 32)
      hash_flags |= RADV_HASH_SHADER_CS_WAVE32;
   if (device->physical_device->ps_wave_size == 32)
      hash_flags |= RADV_HASH_SHADER_PS_WAVE32;
   if (device->physical_device->ge_wave_size == 32)
      hash_flags |= RADV_HASH_SHADER_GE_WAVE32;
   if (device->physical_device->use_llvm)
      hash_flags |= RADV_HASH_SHADER_LLVM;
   if (stats)
      hash_flags |= RADV_HASH_SHADER_KEEP_STATISTICS;
   if (device->robust_buffer_access) /* forces per-attribute vertex descriptors */
      hash_flags |= RADV_HASH_SHADER_ROBUST_BUFFER_ACCESS;
   if (device->robust_buffer_access2) /* affects load/store vectorizer */
      hash_flags |= RADV_HASH_SHADER_ROBUST_BUFFER_ACCESS2;
   return hash_flags;
}

static void
radv_pipeline_init_scratch(const struct radv_device *device, struct radv_pipeline *pipeline)
{
   unsigned scratch_bytes_per_wave = 0;
   unsigned max_waves = 0;

   for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
      if (pipeline->shaders[i] && pipeline->shaders[i]->config.scratch_bytes_per_wave) {
         unsigned max_stage_waves = device->scratch_waves;

         scratch_bytes_per_wave =
            MAX2(scratch_bytes_per_wave, pipeline->shaders[i]->config.scratch_bytes_per_wave);

         max_stage_waves =
            MIN2(max_stage_waves, 4 * device->physical_device->rad_info.num_good_compute_units *
                                     radv_get_max_waves(device, pipeline->shaders[i], i));
         max_waves = MAX2(max_waves, max_stage_waves);
      }
   }

   pipeline->scratch_bytes_per_wave = scratch_bytes_per_wave;
   pipeline->max_waves = max_waves;
}
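/* Illustrative note (not in the original source): the pipeline-wide scratch
 * size is the maximum scratch_bytes_per_wave over all stages, and max_waves
 * is the largest per-stage wave count after capping by device->scratch_waves
 * and by a limit derived from the number of compute units and
 * radv_get_max_waves(). E.g. a VS needing 256 bytes/wave and a PS needing
 * 1024 would make the pipeline report 1024 bytes per wave.
 */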
static uint32_t
si_translate_blend_function(VkBlendOp op)
{
   switch (op) {
   case VK_BLEND_OP_ADD:
      return V_028780_COMB_DST_PLUS_SRC;
   case VK_BLEND_OP_SUBTRACT:
      return V_028780_COMB_SRC_MINUS_DST;
   case VK_BLEND_OP_REVERSE_SUBTRACT:
      return V_028780_COMB_DST_MINUS_SRC;
   case VK_BLEND_OP_MIN:
      return V_028780_COMB_MIN_DST_SRC;
   case VK_BLEND_OP_MAX:
      return V_028780_COMB_MAX_DST_SRC;
   default:
      return 0;
   }
}

static uint32_t
si_translate_blend_factor(VkBlendFactor factor)
{
   switch (factor) {
   case VK_BLEND_FACTOR_ZERO:
      return V_028780_BLEND_ZERO;
   case VK_BLEND_FACTOR_ONE:
      return V_028780_BLEND_ONE;
   case VK_BLEND_FACTOR_SRC_COLOR:
      return V_028780_BLEND_SRC_COLOR;
   case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
      return V_028780_BLEND_ONE_MINUS_SRC_COLOR;
   case VK_BLEND_FACTOR_DST_COLOR:
      return V_028780_BLEND_DST_COLOR;
   case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
      return V_028780_BLEND_ONE_MINUS_DST_COLOR;
   case VK_BLEND_FACTOR_SRC_ALPHA:
      return V_028780_BLEND_SRC_ALPHA;
   case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
      return V_028780_BLEND_ONE_MINUS_SRC_ALPHA;
   case VK_BLEND_FACTOR_DST_ALPHA:
      return V_028780_BLEND_DST_ALPHA;
   case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
      return V_028780_BLEND_ONE_MINUS_DST_ALPHA;
   case VK_BLEND_FACTOR_CONSTANT_COLOR:
      return V_028780_BLEND_CONSTANT_COLOR;
   case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
      return V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR;
   case VK_BLEND_FACTOR_CONSTANT_ALPHA:
      return V_028780_BLEND_CONSTANT_ALPHA;
   case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
      return V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA;
   case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
      return V_028780_BLEND_SRC_ALPHA_SATURATE;
   case VK_BLEND_FACTOR_SRC1_COLOR:
      return V_028780_BLEND_SRC1_COLOR;
   case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR:
      return V_028780_BLEND_INV_SRC1_COLOR;
   case VK_BLEND_FACTOR_SRC1_ALPHA:
      return V_028780_BLEND_SRC1_ALPHA;
   case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA:
      return V_028780_BLEND_INV_SRC1_ALPHA;
   default:
      return 0;
   }
}

static uint32_t
si_translate_blend_opt_function(VkBlendOp op)
{
   switch (op) {
   case VK_BLEND_OP_ADD:
      return V_028760_OPT_COMB_ADD;
   case VK_BLEND_OP_SUBTRACT:
      return V_028760_OPT_COMB_SUBTRACT;
   case VK_BLEND_OP_REVERSE_SUBTRACT:
      return V_028760_OPT_COMB_REVSUBTRACT;
   case VK_BLEND_OP_MIN:
      return V_028760_OPT_COMB_MIN;
   case VK_BLEND_OP_MAX:
      return V_028760_OPT_COMB_MAX;
   default:
      return V_028760_OPT_COMB_BLEND_DISABLED;
   }
}

static uint32_t
si_translate_blend_opt_factor(VkBlendFactor factor, bool is_alpha)
{
   switch (factor) {
   case VK_BLEND_FACTOR_ZERO:
      return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL;
   case VK_BLEND_FACTOR_ONE:
      return V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE;
   case VK_BLEND_FACTOR_SRC_COLOR:
      return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0
                      : V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0;
   case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
      return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1
                      : V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1;
   case VK_BLEND_FACTOR_SRC_ALPHA:
      return V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0;
   case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
      return V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1;
   case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
      return is_alpha ? V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE
                      : V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
   default:
      return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
   }
}

/**
 * Get rid of DST in the blend factors by commuting the operands:
 *    func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
 */
static void
si_blend_remove_dst(VkBlendOp *func, VkBlendFactor *src_factor, VkBlendFactor *dst_factor,
                    VkBlendFactor expected_dst, VkBlendFactor replacement_src)
{
   if (*src_factor == expected_dst && *dst_factor == VK_BLEND_FACTOR_ZERO) {
      *src_factor = VK_BLEND_FACTOR_ZERO;
      *dst_factor = replacement_src;

      /* Commuting the operands requires reversing subtractions. */
      if (*func == VK_BLEND_OP_SUBTRACT)
         *func = VK_BLEND_OP_REVERSE_SUBTRACT;
      else if (*func == VK_BLEND_OP_REVERSE_SUBTRACT)
         *func = VK_BLEND_OP_SUBTRACT;
   }
}
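/* Worked example (illustrative, not in the original source): a state of
 * srcColorBlendFactor = DST_COLOR, dstColorBlendFactor = ZERO with
 * colorBlendOp = SUBTRACT becomes srcColorBlendFactor = ZERO,
 * dstColorBlendFactor = SRC_COLOR with colorBlendOp = REVERSE_SUBTRACT:
 * src*DST - dst*0 == dst*SRC - src*0, so the result is unchanged but the
 * factors no longer reference DST.
 */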
static bool
si_blend_factor_uses_dst(VkBlendFactor factor)
{
   return factor == VK_BLEND_FACTOR_DST_COLOR || factor == VK_BLEND_FACTOR_DST_ALPHA ||
          factor == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE ||
          factor == VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA ||
          factor == VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR;
}

static bool
is_dual_src(VkBlendFactor factor)
{
   switch (factor) {
   case VK_BLEND_FACTOR_SRC1_COLOR:
   case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR:
   case VK_BLEND_FACTOR_SRC1_ALPHA:
   case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA:
      return true;
   default:
      return false;
   }
}

static unsigned
radv_choose_spi_color_format(const struct radv_device *device, VkFormat vk_format,
                             bool blend_enable, bool blend_need_alpha)
{
   const struct util_format_description *desc = vk_format_description(vk_format);
   bool use_rbplus = device->physical_device->rad_info.rbplus_allowed;
   struct ac_spi_color_formats formats = {0};
   unsigned format, ntype, swap;

   format = radv_translate_colorformat(vk_format);
   ntype = radv_translate_color_numformat(vk_format, desc,
                                          vk_format_get_first_non_void_channel(vk_format));
   swap = radv_translate_colorswap(vk_format, false);

   ac_choose_spi_color_formats(format, swap, ntype, false, use_rbplus, &formats);

   if (blend_enable && blend_need_alpha)
      return formats.blend_alpha;
   else if (blend_need_alpha)
      return formats.alpha;
   else if (blend_enable)
      return formats.blend;
   else
      return formats.normal;
}
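/* Illustrative note (not in the original source): ac_choose_spi_color_formats()
 * fills four candidate export formats for the same attachment format; which
 * one is returned depends on whether blending is enabled and whether the
 * alpha channel is needed (for blending or alpha-to-coverage), so the same
 * VkFormat can map to different SPI_SHADER_COL_FORMAT values per pipeline.
 */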
static bool
format_is_int8(VkFormat format)
{
   const struct util_format_description *desc = vk_format_description(format);
   int channel = vk_format_get_first_non_void_channel(format);

   return channel >= 0 && desc->channel[channel].pure_integer && desc->channel[channel].size == 8;
}

static bool
format_is_int10(VkFormat format)
{
   const struct util_format_description *desc = vk_format_description(format);

   if (desc->nr_channels != 4)
      return false;
   for (unsigned i = 0; i < 4; i++) {
      if (desc->channel[i].pure_integer && desc->channel[i].size == 10)
         return true;
   }
   return false;
}

static void
radv_pipeline_compute_spi_color_formats(const struct radv_pipeline *pipeline,
                                        const VkGraphicsPipelineCreateInfo *pCreateInfo,
                                        struct radv_blend_state *blend)
{
   RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
   struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
   unsigned col_format = 0, is_int8 = 0, is_int10 = 0;
   unsigned num_targets;

   for (unsigned i = 0; i < (blend->single_cb_enable ? 1 : subpass->color_count); ++i) {
      unsigned cf;

      if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED ||
          !(blend->cb_target_mask & (0xfu << (i * 4)))) {
         cf = V_028714_SPI_SHADER_ZERO;
      } else {
         struct radv_render_pass_attachment *attachment =
            pass->attachments + subpass->color_attachments[i].attachment;
         bool blend_enable = blend->blend_enable_4bit & (0xfu << (i * 4));

         cf = radv_choose_spi_color_format(pipeline->device, attachment->format, blend_enable,
                                           blend->need_src_alpha & (1 << i));

         if (format_is_int8(attachment->format))
            is_int8 |= 1 << i;
         if (format_is_int10(attachment->format))
            is_int10 |= 1 << i;
      }

      col_format |= cf << (4 * i);
   }

   if (!(col_format & 0xf) && blend->need_src_alpha & (1 << 0)) {
      /* When a subpass doesn't have any color attachments, write the
       * alpha channel of MRT0 when alpha coverage is enabled because
       * the depth attachment needs it.
       */
      col_format |= V_028714_SPI_SHADER_32_AR;
   }

   /* If the i-th target format is set, all previous target formats must
    * be non-zero to avoid hangs.
    */
   num_targets = (util_last_bit(col_format) + 3) / 4;
   for (unsigned i = 0; i < num_targets; i++) {
      if (!(col_format & (0xfu << (i * 4)))) {
         col_format |= V_028714_SPI_SHADER_32_R << (i * 4);
      }
   }

   /* The output for dual source blending should have the same format as
    * the first output.
    */
   if (blend->mrt0_is_dual_src) {
      assert(!(col_format >> 4));
      col_format |= (col_format & 0xf) << 4;
   }

   blend->cb_shader_mask = ac_get_cb_shader_mask(col_format);
   blend->spi_shader_col_format = col_format;
   blend->col_format_is_int8 = is_int8;
   blend->col_format_is_int10 = is_int10;
}

/*
 * Ordered so that for each i,
 * radv_format_meta_fs_key(radv_fs_key_format_exemplars[i]) == i.
 */
const VkFormat radv_fs_key_format_exemplars[NUM_META_FS_KEYS] = {
   VK_FORMAT_R32_SFLOAT,
   VK_FORMAT_R32G32_SFLOAT,
   VK_FORMAT_R8G8B8A8_UNORM,
   VK_FORMAT_R16G16B16A16_UNORM,
   VK_FORMAT_R16G16B16A16_SNORM,
   VK_FORMAT_R16G16B16A16_UINT,
   VK_FORMAT_R16G16B16A16_SINT,
   VK_FORMAT_R32G32B32A32_SFLOAT,
   VK_FORMAT_R8G8B8A8_UINT,
   VK_FORMAT_R8G8B8A8_SINT,
   VK_FORMAT_A2R10G10B10_UINT_PACK32,
   VK_FORMAT_A2R10G10B10_SINT_PACK32,
};

unsigned
radv_format_meta_fs_key(struct radv_device *device, VkFormat format)
{
   unsigned col_format = radv_choose_spi_color_format(device, format, false, false);
   assert(col_format != V_028714_SPI_SHADER_32_AR);

   bool is_int8 = format_is_int8(format);
   bool is_int10 = format_is_int10(format);

   if (col_format == V_028714_SPI_SHADER_UINT16_ABGR && is_int8)
      return 8;
   else if (col_format == V_028714_SPI_SHADER_SINT16_ABGR && is_int8)
      return 9;
   else if (col_format == V_028714_SPI_SHADER_UINT16_ABGR && is_int10)
      return 10;
   else if (col_format == V_028714_SPI_SHADER_SINT16_ABGR && is_int10)
      return 11;
   else {
      if (col_format >= V_028714_SPI_SHADER_32_AR)
         --col_format; /* Skip V_028714_SPI_SHADER_32_AR since there is no such VkFormat */

      --col_format; /* Skip V_028714_SPI_SHADER_ZERO */
      return col_format;
   }
}
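/* Illustrative note (not in the original source): the key returned here
 * indexes radv_fs_key_format_exemplars[] above, so e.g.
 * radv_format_meta_fs_key(device, VK_FORMAT_R8G8B8A8_UINT) is expected to
 * return 8, matching that format's position in the exemplar table.
 */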
static void
radv_blend_check_commutativity(struct radv_blend_state *blend, VkBlendOp op, VkBlendFactor src,
                               VkBlendFactor dst, unsigned chanmask)
{
   /* Src factor is allowed when it does not depend on Dst. */
   static const uint32_t src_allowed =
      (1u << VK_BLEND_FACTOR_ONE) | (1u << VK_BLEND_FACTOR_SRC_COLOR) |
      (1u << VK_BLEND_FACTOR_SRC_ALPHA) | (1u << VK_BLEND_FACTOR_SRC_ALPHA_SATURATE) |
      (1u << VK_BLEND_FACTOR_CONSTANT_COLOR) | (1u << VK_BLEND_FACTOR_CONSTANT_ALPHA) |
      (1u << VK_BLEND_FACTOR_SRC1_COLOR) | (1u << VK_BLEND_FACTOR_SRC1_ALPHA) |
      (1u << VK_BLEND_FACTOR_ZERO) | (1u << VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR) |
      (1u << VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA) |
      (1u << VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR) |
      (1u << VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA) |
      (1u << VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR) | (1u << VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA);

   if (dst == VK_BLEND_FACTOR_ONE && (src_allowed & (1u << src))) {
      /* Addition is commutative, but floating point addition isn't
       * associative: subtle changes can be introduced via different
       * rounding. Be conservative, only enable for min and max.
       */
      if (op == VK_BLEND_OP_MAX || op == VK_BLEND_OP_MIN)
         blend->commutative_4bit |= chanmask;
   }
}
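/* Illustrative example (not in the original source): dstColorBlendFactor = ONE
 * with colorBlendOp = MAX and srcColorBlendFactor = SRC_ALPHA marks the RGB
 * channels of that target as commutative, because max(dst, src * a) produces
 * the same value regardless of fragment order; the same state with
 * colorBlendOp = ADD is left non-commutative here because of the rounding
 * concern described above.
 */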
static struct radv_blend_state
radv_pipeline_init_blend_state(struct radv_pipeline *pipeline,
                               const VkGraphicsPipelineCreateInfo *pCreateInfo,
                               const struct radv_graphics_pipeline_create_info *extra)
{
   const VkPipelineColorBlendStateCreateInfo *vkblend =
      radv_pipeline_get_color_blend_state(pCreateInfo);
   const VkPipelineMultisampleStateCreateInfo *vkms =
      radv_pipeline_get_multisample_state(pCreateInfo);
   struct radv_blend_state blend = {0};
   unsigned mode = V_028808_CB_NORMAL;
   unsigned cb_color_control = 0;
   int i;

   if (extra && extra->custom_blend_mode) {
      blend.single_cb_enable = true;
      mode = extra->custom_blend_mode;
   }

   if (vkblend) {
      if (vkblend->logicOpEnable)
         cb_color_control |= S_028808_ROP3(si_translate_blend_logic_op(vkblend->logicOp));
      else
         cb_color_control |= S_028808_ROP3(V_028808_ROP3_COPY);
   }

   if (pipeline->device->instance->debug_flags & RADV_DEBUG_NO_ATOC_DITHERING)
   {
      blend.db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(2) | S_028B70_ALPHA_TO_MASK_OFFSET1(2) |
                               S_028B70_ALPHA_TO_MASK_OFFSET2(2) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
                               S_028B70_OFFSET_ROUND(0);
   }
   else
   {
      blend.db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(3) | S_028B70_ALPHA_TO_MASK_OFFSET1(1) |
                               S_028B70_ALPHA_TO_MASK_OFFSET2(0) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
                               S_028B70_OFFSET_ROUND(1);
   }

   if (vkms && vkms->alphaToCoverageEnable) {
      blend.db_alpha_to_mask |= S_028B70_ALPHA_TO_MASK_ENABLE(1);
      blend.need_src_alpha |= 0x1;
   }

   blend.cb_target_mask = 0;
   if (vkblend) {
      for (i = 0; i < vkblend->attachmentCount; i++) {
         const VkPipelineColorBlendAttachmentState *att = &vkblend->pAttachments[i];
         unsigned blend_cntl = 0;
         unsigned srcRGB_opt, dstRGB_opt, srcA_opt, dstA_opt;
         VkBlendOp eqRGB = att->colorBlendOp;
         VkBlendFactor srcRGB = att->srcColorBlendFactor;
         VkBlendFactor dstRGB = att->dstColorBlendFactor;
         VkBlendOp eqA = att->alphaBlendOp;
         VkBlendFactor srcA = att->srcAlphaBlendFactor;
         VkBlendFactor dstA = att->dstAlphaBlendFactor;

         blend.sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) |
                                     S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);

         if (!att->colorWriteMask)
            continue;

         /* Ignore other blend targets if dual-source blending
          * is enabled to prevent wrong behaviour.
          */
         if (blend.mrt0_is_dual_src)
            continue;

         blend.cb_target_mask |= (unsigned)att->colorWriteMask << (4 * i);
         blend.cb_target_enabled_4bit |= 0xfu << (4 * i);
         if (!att->blendEnable) {
            blend.cb_blend_control[i] = blend_cntl;
            continue;
         }

         if (is_dual_src(srcRGB) || is_dual_src(dstRGB) || is_dual_src(srcA) || is_dual_src(dstA))
            if (i == 0)
               blend.mrt0_is_dual_src = true;

         if (eqRGB == VK_BLEND_OP_MIN || eqRGB == VK_BLEND_OP_MAX) {
            srcRGB = VK_BLEND_FACTOR_ONE;
            dstRGB = VK_BLEND_FACTOR_ONE;
         }
         if (eqA == VK_BLEND_OP_MIN || eqA == VK_BLEND_OP_MAX) {
            srcA = VK_BLEND_FACTOR_ONE;
            dstA = VK_BLEND_FACTOR_ONE;
         }

         radv_blend_check_commutativity(&blend, eqRGB, srcRGB, dstRGB, 0x7u << (4 * i));
         radv_blend_check_commutativity(&blend, eqA, srcA, dstA, 0x8u << (4 * i));

         /* Blending optimizations for RB+.
          * These transformations don't change the behavior.
          *
          * First, get rid of DST in the blend factors:
          *    func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
          */
         si_blend_remove_dst(&eqRGB, &srcRGB, &dstRGB, VK_BLEND_FACTOR_DST_COLOR,
                             VK_BLEND_FACTOR_SRC_COLOR);

         si_blend_remove_dst(&eqA, &srcA, &dstA, VK_BLEND_FACTOR_DST_COLOR,
                             VK_BLEND_FACTOR_SRC_COLOR);

         si_blend_remove_dst(&eqA, &srcA, &dstA, VK_BLEND_FACTOR_DST_ALPHA,
                             VK_BLEND_FACTOR_SRC_ALPHA);

         /* Look up the ideal settings from tables. */
         srcRGB_opt = si_translate_blend_opt_factor(srcRGB, false);
         dstRGB_opt = si_translate_blend_opt_factor(dstRGB, false);
         srcA_opt = si_translate_blend_opt_factor(srcA, true);
         dstA_opt = si_translate_blend_opt_factor(dstA, true);

         /* Handle interdependencies. */
         if (si_blend_factor_uses_dst(srcRGB))
            dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
         if (si_blend_factor_uses_dst(srcA))
            dstA_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;

         if (srcRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE &&
             (dstRGB == VK_BLEND_FACTOR_ZERO || dstRGB == VK_BLEND_FACTOR_SRC_ALPHA ||
              dstRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE))
            dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;

         /* Set the final value. */
         blend.sx_mrt_blend_opt[i] =
            S_028760_COLOR_SRC_OPT(srcRGB_opt) | S_028760_COLOR_DST_OPT(dstRGB_opt) |
            S_028760_COLOR_COMB_FCN(si_translate_blend_opt_function(eqRGB)) |
            S_028760_ALPHA_SRC_OPT(srcA_opt) | S_028760_ALPHA_DST_OPT(dstA_opt) |
            S_028760_ALPHA_COMB_FCN(si_translate_blend_opt_function(eqA));
         blend_cntl |= S_028780_ENABLE(1);

         blend_cntl |= S_028780_COLOR_COMB_FCN(si_translate_blend_function(eqRGB));
         blend_cntl |= S_028780_COLOR_SRCBLEND(si_translate_blend_factor(srcRGB));
         blend_cntl |= S_028780_COLOR_DESTBLEND(si_translate_blend_factor(dstRGB));
         if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) {
            blend_cntl |= S_028780_SEPARATE_ALPHA_BLEND(1);
            blend_cntl |= S_028780_ALPHA_COMB_FCN(si_translate_blend_function(eqA));
            blend_cntl |= S_028780_ALPHA_SRCBLEND(si_translate_blend_factor(srcA));
            blend_cntl |= S_028780_ALPHA_DESTBLEND(si_translate_blend_factor(dstA));
         }
         blend.cb_blend_control[i] = blend_cntl;

         blend.blend_enable_4bit |= 0xfu << (i * 4);

         if (srcRGB == VK_BLEND_FACTOR_SRC_ALPHA || dstRGB == VK_BLEND_FACTOR_SRC_ALPHA ||
             srcRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE ||
             dstRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE ||
             srcRGB == VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA ||
             dstRGB == VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA)
            blend.need_src_alpha |= 1 << i;
      }
      for (i = vkblend->attachmentCount; i < 8; i++) {
         blend.cb_blend_control[i] = 0;
         blend.sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) |
                                     S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);
      }
   }

   if (pipeline->device->physical_device->rad_info.has_rbplus) {
      /* Disable RB+ blend optimizations for dual source blending. */
      if (blend.mrt0_is_dual_src) {
         for (i = 0; i < 8; i++) {
            blend.sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) |
                                        S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE);
         }
      }

      /* RB+ doesn't work with dual source blending, logic op and
       * RESOLVE.
       */
      if (blend.mrt0_is_dual_src || (vkblend && vkblend->logicOpEnable) ||
          mode == V_028808_CB_RESOLVE)
         cb_color_control |= S_028808_DISABLE_DUAL_QUAD(1);
   }

   if (blend.cb_target_mask)
      cb_color_control |= S_028808_MODE(mode);
   else
      cb_color_control |= S_028808_MODE(V_028808_CB_DISABLE);

   radv_pipeline_compute_spi_color_formats(pipeline, pCreateInfo, &blend);

   pipeline->graphics.cb_color_control = cb_color_control;

   return blend;
}

static uint32_t
si_translate_fill(VkPolygonMode func)
{
   switch (func) {
   case VK_POLYGON_MODE_FILL:
      return V_028814_X_DRAW_TRIANGLES;
   case VK_POLYGON_MODE_LINE:
      return V_028814_X_DRAW_LINES;
   case VK_POLYGON_MODE_POINT:
      return V_028814_X_DRAW_POINTS;
   default:
      assert(0);
      return V_028814_X_DRAW_POINTS;
   }
}

static uint8_t
radv_pipeline_get_ps_iter_samples(const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
   const VkPipelineMultisampleStateCreateInfo *vkms = pCreateInfo->pMultisampleState;
   RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
   struct radv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
   uint32_t ps_iter_samples = 1;
   uint32_t num_samples;

   /* From the Vulkan 1.1.129 spec, 26.7. Sample Shading:
    *
    * "If the VK_AMD_mixed_attachment_samples extension is enabled and the
    *  subpass uses color attachments, totalSamples is the number of
    *  samples of the color attachments. Otherwise, totalSamples is the
    *  value of VkPipelineMultisampleStateCreateInfo::rasterizationSamples
    *  specified at pipeline creation time."
    */
   if (subpass->has_color_att) {
      num_samples = subpass->color_sample_count;
   } else {
      num_samples = vkms->rasterizationSamples;
   }

   if (vkms->sampleShadingEnable) {
      ps_iter_samples = ceilf(vkms->minSampleShading * num_samples);
      ps_iter_samples = util_next_power_of_two(ps_iter_samples);
   }
   return ps_iter_samples;
}
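/* Worked example (illustrative, not in the original source): with
 * minSampleShading = 0.3 and 8 color samples, ceilf(0.3 * 8) = 3, which
 * util_next_power_of_two() rounds up to 4, so the fragment shader is run
 * for 4 of the 8 samples per pixel.
 */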
static bool
radv_is_depth_write_enabled(const VkPipelineDepthStencilStateCreateInfo *pCreateInfo)
{
   return pCreateInfo->depthTestEnable && pCreateInfo->depthWriteEnable &&
          pCreateInfo->depthCompareOp != VK_COMPARE_OP_NEVER;
}

static bool
radv_writes_stencil(const VkStencilOpState *state)
{
   return state->writeMask &&
          (state->failOp != VK_STENCIL_OP_KEEP || state->passOp != VK_STENCIL_OP_KEEP ||
           state->depthFailOp != VK_STENCIL_OP_KEEP);
}

static bool
radv_is_stencil_write_enabled(const VkPipelineDepthStencilStateCreateInfo *pCreateInfo)
{
   return pCreateInfo->stencilTestEnable &&
          (radv_writes_stencil(&pCreateInfo->front) || radv_writes_stencil(&pCreateInfo->back));
}

static bool
radv_is_ds_write_enabled(const VkPipelineDepthStencilStateCreateInfo *pCreateInfo)
{
   return radv_is_depth_write_enabled(pCreateInfo) || radv_is_stencil_write_enabled(pCreateInfo);
}

static bool
radv_order_invariant_stencil_op(VkStencilOp op)
{
   /* REPLACE is normally order invariant, except when the stencil
    * reference value is written by the fragment shader. Tracking this
    * interaction does not seem worth the effort, so be conservative.
    */
   return op != VK_STENCIL_OP_INCREMENT_AND_CLAMP && op != VK_STENCIL_OP_DECREMENT_AND_CLAMP &&
          op != VK_STENCIL_OP_REPLACE;
}

static bool
radv_order_invariant_stencil_state(const VkStencilOpState *state)
{
   /* Compute whether, assuming Z writes are disabled, this stencil state
    * is order invariant in the sense that the set of passing fragments as
    * well as the final stencil buffer result does not depend on the order
    * of fragments.
    */
   return !state->writeMask ||
          /* The following assumes that Z writes are disabled. */
          (state->compareOp == VK_COMPARE_OP_ALWAYS &&
           radv_order_invariant_stencil_op(state->passOp) &&
           radv_order_invariant_stencil_op(state->depthFailOp)) ||
          (state->compareOp == VK_COMPARE_OP_NEVER &&
           radv_order_invariant_stencil_op(state->failOp));
}

static bool
radv_pipeline_has_dynamic_ds_states(const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
   VkDynamicState ds_states[] = {
      VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE_EXT, VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE_EXT,
      VK_DYNAMIC_STATE_DEPTH_COMPARE_OP_EXT,  VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE_EXT,
      VK_DYNAMIC_STATE_STENCIL_OP_EXT,
   };

   for (uint32_t i = 0; i < ARRAY_SIZE(ds_states); i++) {
      if (radv_is_state_dynamic(pCreateInfo, ds_states[i]))
         return true;
   }

   return false;
}

static bool
radv_pipeline_out_of_order_rast(struct radv_pipeline *pipeline,
                                const struct radv_blend_state *blend,
                                const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
   RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
   struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
   const VkPipelineDepthStencilStateCreateInfo *vkds =
      radv_pipeline_get_depth_stencil_state(pCreateInfo);
   const VkPipelineColorBlendStateCreateInfo *vkblend =
      radv_pipeline_get_color_blend_state(pCreateInfo);
   unsigned colormask = blend->cb_target_enabled_4bit;

   if (!pipeline->device->physical_device->out_of_order_rast_allowed)
      return false;

   /* Be conservative if a logic operation is enabled with color buffers. */
   if (colormask && vkblend && vkblend->logicOpEnable)
      return false;

   /* Be conservative if an extended dynamic depth/stencil state is
    * enabled because the driver can't update out-of-order rasterization
    * dynamically.
    */
   if (radv_pipeline_has_dynamic_ds_states(pCreateInfo))
      return false;

   /* Default depth/stencil invariance when no attachment is bound. */
   struct radv_dsa_order_invariance dsa_order_invariant = {.zs = true, .pass_set = true};

   if (vkds) {
      struct radv_render_pass_attachment *attachment =
         pass->attachments + subpass->depth_stencil_attachment->attachment;
      bool has_stencil = vk_format_has_stencil(attachment->format);
      struct radv_dsa_order_invariance order_invariance[2];
      struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT];

      /* Compute depth/stencil order invariance in order to know if
       * it's safe to enable out-of-order.
       */
      bool zfunc_is_ordered = vkds->depthCompareOp == VK_COMPARE_OP_NEVER ||
                              vkds->depthCompareOp == VK_COMPARE_OP_LESS ||
                              vkds->depthCompareOp == VK_COMPARE_OP_LESS_OR_EQUAL ||
                              vkds->depthCompareOp == VK_COMPARE_OP_GREATER ||
                              vkds->depthCompareOp == VK_COMPARE_OP_GREATER_OR_EQUAL;

      bool nozwrite_and_order_invariant_stencil =
         !radv_is_ds_write_enabled(vkds) ||
         (!radv_is_depth_write_enabled(vkds) && radv_order_invariant_stencil_state(&vkds->front) &&
          radv_order_invariant_stencil_state(&vkds->back));

      order_invariance[1].zs = nozwrite_and_order_invariant_stencil ||
                               (!radv_is_stencil_write_enabled(vkds) && zfunc_is_ordered);
      order_invariance[0].zs = !radv_is_depth_write_enabled(vkds) || zfunc_is_ordered;

      order_invariance[1].pass_set =
         nozwrite_and_order_invariant_stencil ||
         (!radv_is_stencil_write_enabled(vkds) && (vkds->depthCompareOp == VK_COMPARE_OP_ALWAYS ||
                                                   vkds->depthCompareOp == VK_COMPARE_OP_NEVER));
      order_invariance[0].pass_set =
         !radv_is_depth_write_enabled(vkds) || (vkds->depthCompareOp == VK_COMPARE_OP_ALWAYS ||
                                                vkds->depthCompareOp == VK_COMPARE_OP_NEVER);

      dsa_order_invariant = order_invariance[has_stencil];
      if (!dsa_order_invariant.zs)
         return false;

      /* The set of PS invocations is always order invariant,
       * except when early Z/S tests are requested.
       */
      if (ps && ps->info.ps.writes_memory && ps->info.ps.early_fragment_test &&
          !dsa_order_invariant.pass_set)
         return false;

      /* Determine if out-of-order rasterization should be disabled
       * when occlusion queries are used.
       */
      pipeline->graphics.disable_out_of_order_rast_for_occlusion = !dsa_order_invariant.pass_set;
   }

   /* No color buffers are enabled for writing. */
   if (!colormask)
      return true;

   unsigned blendmask = colormask & blend->blend_enable_4bit;

   if (blendmask) {
      /* Only commutative blending. */
      if (blendmask & ~blend->commutative_4bit)
         return false;

      if (!dsa_order_invariant.pass_set)
         return false;
   }

   if (colormask & ~blendmask)
      return false;

   return true;
}

static const VkConservativeRasterizationModeEXT
radv_get_conservative_raster_mode(const VkPipelineRasterizationStateCreateInfo *pCreateInfo)
{
   const VkPipelineRasterizationConservativeStateCreateInfoEXT *conservative_raster =
      vk_find_struct_const(pCreateInfo->pNext,
                           PIPELINE_RASTERIZATION_CONSERVATIVE_STATE_CREATE_INFO_EXT);

   if (!conservative_raster)
      return VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT;
   return conservative_raster->conservativeRasterizationMode;
}

static void
radv_pipeline_init_multisample_state(struct radv_pipeline *pipeline,
                                     const struct radv_blend_state *blend,
                                     const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
   const VkPipelineMultisampleStateCreateInfo *vkms =
      radv_pipeline_get_multisample_state(pCreateInfo);
   struct radv_multisample_state *ms = &pipeline->graphics.ms;
   unsigned num_tile_pipes = pipeline->device->physical_device->rad_info.num_tile_pipes;
   const VkConservativeRasterizationModeEXT mode =
      radv_get_conservative_raster_mode(pCreateInfo->pRasterizationState);
   bool out_of_order_rast = false;
   int ps_iter_samples = 1;
   uint32_t mask = 0xffff;

   if (vkms) {
      ms->num_samples = vkms->rasterizationSamples;

      /* From the Vulkan 1.1.129 spec, 26.7. Sample Shading:
       *
       * "Sample shading is enabled for a graphics pipeline:
       *
       * - If the interface of the fragment shader entry point of the
       *   graphics pipeline includes an input variable decorated
       *   with SampleId or SamplePosition. In this case
       *   minSampleShadingFactor takes the value 1.0.
       * - Else if the sampleShadingEnable member of the
       *   VkPipelineMultisampleStateCreateInfo structure specified
       *   when creating the graphics pipeline is set to VK_TRUE. In
       *   this case minSampleShadingFactor takes the value of
       *   VkPipelineMultisampleStateCreateInfo::minSampleShading.
       *
       * Otherwise, sample shading is considered disabled."
       */
      if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.ps.uses_sample_shading) {
         ps_iter_samples = ms->num_samples;
      } else {
         ps_iter_samples = radv_pipeline_get_ps_iter_samples(pCreateInfo);
      }
   } else {
      ms->num_samples = 1;
   }

   const struct VkPipelineRasterizationStateRasterizationOrderAMD *raster_order =
      vk_find_struct_const(pCreateInfo->pRasterizationState->pNext,
                           PIPELINE_RASTERIZATION_STATE_RASTERIZATION_ORDER_AMD);
   if (raster_order && raster_order->rasterizationOrder == VK_RASTERIZATION_ORDER_RELAXED_AMD) {
      /* Out-of-order rasterization is explicitly enabled by the
       * application.
       */
      out_of_order_rast = true;
   } else {
      /* Determine if the driver can enable out-of-order
       * rasterization internally.
       */
      out_of_order_rast = radv_pipeline_out_of_order_rast(pipeline, blend, pCreateInfo);
   }

   ms->pa_sc_aa_config = 0;
   ms->db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) | S_028804_INCOHERENT_EQAA_READS(1) |
                 S_028804_INTERPOLATE_COMP_Z(1) | S_028804_STATIC_ANCHOR_ASSOCIATIONS(1);

   /* Adjust MSAA state if conservative rasterization is enabled. */
   if (mode != VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT) {
      ms->pa_sc_aa_config |= S_028BE0_AA_MASK_CENTROID_DTMN(1);

      ms->db_eqaa |=
         S_028804_ENABLE_POSTZ_OVERRASTERIZATION(1) | S_028804_OVERRASTERIZATION_AMOUNT(4);
   }

   ms->pa_sc_mode_cntl_1 =
      S_028A4C_WALK_FENCE_ENABLE(1) | // TODO linear dst fixes
      S_028A4C_WALK_FENCE_SIZE(num_tile_pipes == 2 ? 2 : 3) |
      S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(out_of_order_rast) |
      S_028A4C_OUT_OF_ORDER_WATER_MARK(0x7) |
      /* always 1: */
      S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(1) | S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) |
      S_028A4C_TILE_WALK_ORDER_ENABLE(1) | S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) |
      S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) | S_028A4C_FORCE_EOV_REZ_ENABLE(1);
   ms->pa_sc_mode_cntl_0 = S_028A48_ALTERNATE_RBS_PER_TILE(
                              pipeline->device->physical_device->rad_info.chip_class >= GFX9) |
                           S_028A48_VPORT_SCISSOR_ENABLE(1);

   const VkPipelineRasterizationLineStateCreateInfoEXT *rast_line = vk_find_struct_const(
      pCreateInfo->pRasterizationState->pNext, PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);
   if (rast_line) {
      ms->pa_sc_mode_cntl_0 |= S_028A48_LINE_STIPPLE_ENABLE(rast_line->stippledLineEnable);
      if (rast_line->lineRasterizationMode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT) {
         /* From the Vulkan spec 1.1.129:
          *
          * "When VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT lines
          *  are being rasterized, sample locations may all be
          *  treated as being at the pixel center (this may
          *  affect attribute and depth interpolation)."
          */
         ms->num_samples = 1;
      }
   }

   if (ms->num_samples > 1) {
      RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
      struct radv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
      uint32_t z_samples =
         subpass->depth_stencil_attachment ? subpass->depth_sample_count : ms->num_samples;
      unsigned log_samples = util_logbase2(ms->num_samples);
      unsigned log_z_samples = util_logbase2(z_samples);
      unsigned log_ps_iter_samples = util_logbase2(ps_iter_samples);
      ms->pa_sc_mode_cntl_0 |= S_028A48_MSAA_ENABLE(1);
      ms->db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) |
                     S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) |
                     S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) |
                     S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples);
      ms->pa_sc_aa_config |=
         S_028BE0_MSAA_NUM_SAMPLES(log_samples) |
         S_028BE0_MAX_SAMPLE_DIST(radv_get_default_max_sample_dist(log_samples)) |
         S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples) | /* CM_R_028BE0_PA_SC_AA_CONFIG */
         S_028BE0_COVERED_CENTROID_IS_CENTER(
            pipeline->device->physical_device->rad_info.chip_class >= GFX10_3);
      ms->pa_sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1);
      if (ps_iter_samples > 1)
         pipeline->graphics.spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(2);
   }

   if (vkms && vkms->pSampleMask) {
      mask = vkms->pSampleMask[0] & 0xffff;
   }

   ms->pa_sc_aa_mask[0] = mask | (mask << 16);
   ms->pa_sc_aa_mask[1] = mask | (mask << 16);
}

static void
gfx103_pipeline_init_vrs_state(struct radv_pipeline *pipeline,
                               const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
   const VkPipelineMultisampleStateCreateInfo *vkms =
      radv_pipeline_get_multisample_state(pCreateInfo);
   struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
   struct radv_multisample_state *ms = &pipeline->graphics.ms;
   struct radv_vrs_state *vrs = &pipeline->graphics.vrs;

   if (vkms && (vkms->sampleShadingEnable || ps->info.ps.uses_sample_shading ||
                ps->info.ps.reads_sample_mask_in)) {
      /* Disable VRS and use the rates from PS_ITER_SAMPLES if:
       *
       * 1) sample shading is enabled or per-sample interpolation is
       *    used by the fragment shader
       * 2) the fragment shader reads gl_SampleMaskIn because the
       *    16-bit sample coverage mask isn't enough for MSAA8x and
       *    2x2 coarse shading isn't enough.
       */
      vrs->pa_cl_vrs_cntl = S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_VRS_COMB_MODE_OVERRIDE);

      /* Make sure sample shading is enabled even if only MSAA1x is
       * used because the SAMPLE_ITER combiner is in passthrough
       * mode if PS_ITER_SAMPLE is 0, and it uses the per-draw rate.
       * The default VRS rate when sample shading is enabled is 1x1.
       */
      if (!G_028A4C_PS_ITER_SAMPLE(ms->pa_sc_mode_cntl_1))
         ms->pa_sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(1);
   } else {
      vrs->pa_cl_vrs_cntl = S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_VRS_COMB_MODE_PASSTHRU);
   }

   /* The primitive combiner is always passthrough. */
   vrs->pa_cl_vrs_cntl |= S_028848_PRIMITIVE_RATE_COMBINER_MODE(V_028848_VRS_COMB_MODE_PASSTHRU);
}

static bool
radv_prim_can_use_guardband(enum VkPrimitiveTopology topology)
{
   switch (topology) {
   case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
   case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
   case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
      return false;
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
   case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
      return true;
   default:
      unreachable("unhandled primitive type");
   }
}

static uint32_t
si_conv_gl_prim_to_gs_out(unsigned gl_prim)
{
   switch (gl_prim) {
   case 0: /* GL_POINTS */
      return V_028A6C_POINTLIST;
   case 1: /* GL_LINES */
   case 3: /* GL_LINE_STRIP */
   case 0xA: /* GL_LINE_STRIP_ADJACENCY_ARB */
   case 0x8E7A: /* GL_ISOLINES */
      return V_028A6C_LINESTRIP;

   case 4: /* GL_TRIANGLES */
   case 0xc: /* GL_TRIANGLES_ADJACENCY_ARB */
   case 5: /* GL_TRIANGLE_STRIP */
   case 7: /* GL_QUADS */
      return V_028A6C_TRISTRIP;
   default:
      assert(0);
      return 0;
   }
}

static uint64_t
radv_dynamic_state_mask(VkDynamicState state)
{
   switch (state) {
   case VK_DYNAMIC_STATE_VIEWPORT:
   case VK_DYNAMIC_STATE_VIEWPORT_WITH_COUNT_EXT:
      return RADV_DYNAMIC_VIEWPORT;
   case VK_DYNAMIC_STATE_SCISSOR:
   case VK_DYNAMIC_STATE_SCISSOR_WITH_COUNT_EXT:
      return RADV_DYNAMIC_SCISSOR;
   case VK_DYNAMIC_STATE_LINE_WIDTH:
      return RADV_DYNAMIC_LINE_WIDTH;
   case VK_DYNAMIC_STATE_DEPTH_BIAS:
      return RADV_DYNAMIC_DEPTH_BIAS;
   case VK_DYNAMIC_STATE_BLEND_CONSTANTS:
      return RADV_DYNAMIC_BLEND_CONSTANTS;
   case VK_DYNAMIC_STATE_DEPTH_BOUNDS:
      return RADV_DYNAMIC_DEPTH_BOUNDS;
   case VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK:
      return RADV_DYNAMIC_STENCIL_COMPARE_MASK;
   case VK_DYNAMIC_STATE_STENCIL_WRITE_MASK:
      return RADV_DYNAMIC_STENCIL_WRITE_MASK;
   case VK_DYNAMIC_STATE_STENCIL_REFERENCE:
      return RADV_DYNAMIC_STENCIL_REFERENCE;
   case VK_DYNAMIC_STATE_DISCARD_RECTANGLE_EXT:
      return RADV_DYNAMIC_DISCARD_RECTANGLE;
   case VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT:
      return RADV_DYNAMIC_SAMPLE_LOCATIONS;
   case VK_DYNAMIC_STATE_LINE_STIPPLE_EXT:
      return RADV_DYNAMIC_LINE_STIPPLE;
   case VK_DYNAMIC_STATE_CULL_MODE_EXT:
      return RADV_DYNAMIC_CULL_MODE;
   case VK_DYNAMIC_STATE_FRONT_FACE_EXT:
      return RADV_DYNAMIC_FRONT_FACE;
   case VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY_EXT:
      return RADV_DYNAMIC_PRIMITIVE_TOPOLOGY;
   case VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE_EXT:
      return RADV_DYNAMIC_DEPTH_TEST_ENABLE;
   case VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE_EXT:
      return RADV_DYNAMIC_DEPTH_WRITE_ENABLE;
   case VK_DYNAMIC_STATE_DEPTH_COMPARE_OP_EXT:
      return RADV_DYNAMIC_DEPTH_COMPARE_OP;
   case VK_DYNAMIC_STATE_DEPTH_BOUNDS_TEST_ENABLE_EXT:
      return RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
   case VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE_EXT:
      return RADV_DYNAMIC_STENCIL_TEST_ENABLE;
   case VK_DYNAMIC_STATE_STENCIL_OP_EXT:
      return RADV_DYNAMIC_STENCIL_OP;
   case VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT:
      return RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE;
   case VK_DYNAMIC_STATE_FRAGMENT_SHADING_RATE_KHR:
      return RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
   case VK_DYNAMIC_STATE_PATCH_CONTROL_POINTS_EXT:
      return RADV_DYNAMIC_PATCH_CONTROL_POINTS;
   case VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT:
      return RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
   case VK_DYNAMIC_STATE_DEPTH_BIAS_ENABLE_EXT:
      return RADV_DYNAMIC_DEPTH_BIAS_ENABLE;
   case VK_DYNAMIC_STATE_LOGIC_OP_EXT:
      return RADV_DYNAMIC_LOGIC_OP;
   case VK_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE_EXT:
      return RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
   case VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT:
      return RADV_DYNAMIC_COLOR_WRITE_ENABLE;
   case VK_DYNAMIC_STATE_VERTEX_INPUT_EXT:
      return RADV_DYNAMIC_VERTEX_INPUT;
   default:
      unreachable("Unhandled dynamic state");
   }
}
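/* Illustrative note (not in the original source): both the core enum and its
 * *_WITH_COUNT_EXT variant collapse to the same RADV_DYNAMIC_* bit, e.g.
 * VK_DYNAMIC_STATE_VIEWPORT and VK_DYNAMIC_STATE_VIEWPORT_WITH_COUNT_EXT both
 * map to RADV_DYNAMIC_VIEWPORT, so later code only has to test one mask bit.
 */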
static bool
radv_pipeline_is_blend_enabled(const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
   const VkPipelineColorBlendStateCreateInfo *vkblend =
      radv_pipeline_get_color_blend_state(pCreateInfo);

   assert(vkblend);

   for (uint32_t i = 0; i < vkblend->attachmentCount; i++) {
      const VkPipelineColorBlendAttachmentState *att = &vkblend->pAttachments[i];
      if (att->colorWriteMask && att->blendEnable)
         return true;
   }
   return false;
}

static uint64_t
radv_pipeline_needed_dynamic_state(const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
   RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
   struct radv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
   uint64_t states = RADV_DYNAMIC_ALL;

   /* If rasterization is disabled we do not care about any of the
    * dynamic states, since they are all rasterization related only,
    * except primitive topology, primitive restart enable, vertex
    * binding stride and rasterization discard itself.
    */
   if (pCreateInfo->pRasterizationState->rasterizerDiscardEnable &&
       !radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT)) {
      return RADV_DYNAMIC_PRIMITIVE_TOPOLOGY | RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE |
             RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE | RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE |
             RADV_DYNAMIC_VERTEX_INPUT;
   }

   if (!pCreateInfo->pRasterizationState->depthBiasEnable &&
       !radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_DEPTH_BIAS_ENABLE_EXT))
      states &= ~RADV_DYNAMIC_DEPTH_BIAS;

   if (!pCreateInfo->pDepthStencilState ||
       (!pCreateInfo->pDepthStencilState->depthBoundsTestEnable &&
        !radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_DEPTH_BOUNDS_TEST_ENABLE_EXT)))
      states &= ~RADV_DYNAMIC_DEPTH_BOUNDS;

   if (!pCreateInfo->pDepthStencilState ||
       (!pCreateInfo->pDepthStencilState->stencilTestEnable &&
        !radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE_EXT)))
      states &= ~(RADV_DYNAMIC_STENCIL_COMPARE_MASK | RADV_DYNAMIC_STENCIL_WRITE_MASK |
                  RADV_DYNAMIC_STENCIL_REFERENCE);

   if (!vk_find_struct_const(pCreateInfo->pNext, PIPELINE_DISCARD_RECTANGLE_STATE_CREATE_INFO_EXT))
      states &= ~RADV_DYNAMIC_DISCARD_RECTANGLE;

   if (!pCreateInfo->pMultisampleState ||
       !vk_find_struct_const(pCreateInfo->pMultisampleState->pNext,
                             PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT))
      states &= ~RADV_DYNAMIC_SAMPLE_LOCATIONS;

   if (!pCreateInfo->pRasterizationState)
      states &= ~RADV_DYNAMIC_LINE_STIPPLE;
   else {
      const VkPipelineRasterizationLineStateCreateInfoEXT *rast_line_info = vk_find_struct_const(
         pCreateInfo->pRasterizationState->pNext,
         PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);
      if (!rast_line_info || !rast_line_info->stippledLineEnable)
         states &= ~RADV_DYNAMIC_LINE_STIPPLE;
   }

   if (!vk_find_struct_const(pCreateInfo->pNext,
                             PIPELINE_FRAGMENT_SHADING_RATE_STATE_CREATE_INFO_KHR) &&
       !radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_FRAGMENT_SHADING_RATE_KHR))
      states &= ~RADV_DYNAMIC_FRAGMENT_SHADING_RATE;

   if (!subpass->has_color_att ||
       !radv_pipeline_is_blend_enabled(pCreateInfo))
      states &= ~RADV_DYNAMIC_BLEND_CONSTANTS;

   if (!subpass->has_color_att)
      states &= ~RADV_DYNAMIC_COLOR_WRITE_ENABLE;

   return states;
}
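/* Illustrative note (not in the original source): the mask returned above is
 * the set of dynamic states that matter for this pipeline; the caller in
 * radv_pipeline_init_dynamic_state() then clears the bits the application
 * actually marked dynamic, so only the remaining states get their values
 * baked from the create info.
 */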
static struct radv_ia_multi_vgt_param_helpers
radv_compute_ia_multi_vgt_param_helpers(struct radv_pipeline *pipeline)
{
   struct radv_ia_multi_vgt_param_helpers ia_multi_vgt_param = {0};
   const struct radv_device *device = pipeline->device;

   if (radv_pipeline_has_tess(pipeline))
      ia_multi_vgt_param.primgroup_size =
         pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.num_tess_patches;
   else if (radv_pipeline_has_gs(pipeline))
      ia_multi_vgt_param.primgroup_size = 64;
   else
      ia_multi_vgt_param.primgroup_size = 128; /* recommended without a GS */

   /* GS requirement. */
   ia_multi_vgt_param.partial_es_wave = false;
   if (radv_pipeline_has_gs(pipeline) && device->physical_device->rad_info.chip_class <= GFX8)
      if (SI_GS_PER_ES / ia_multi_vgt_param.primgroup_size >= pipeline->device->gs_table_depth - 3)
         ia_multi_vgt_param.partial_es_wave = true;

   ia_multi_vgt_param.ia_switch_on_eoi = false;
   if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.ps.prim_id_input)
      ia_multi_vgt_param.ia_switch_on_eoi = true;
   if (radv_pipeline_has_gs(pipeline) && pipeline->shaders[MESA_SHADER_GEOMETRY]->info.uses_prim_id)
      ia_multi_vgt_param.ia_switch_on_eoi = true;
   if (radv_pipeline_has_tess(pipeline)) {
      /* SWITCH_ON_EOI must be set if PrimID is used. */
      if (pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.uses_prim_id ||
          radv_get_shader(pipeline, MESA_SHADER_TESS_EVAL)->info.uses_prim_id)
         ia_multi_vgt_param.ia_switch_on_eoi = true;
   }

   ia_multi_vgt_param.partial_vs_wave = false;
   if (radv_pipeline_has_tess(pipeline)) {
      /* Bug with tessellation and GS on Bonaire and older 2 SE chips. */
      if ((device->physical_device->rad_info.family == CHIP_TAHITI ||
           device->physical_device->rad_info.family == CHIP_PITCAIRN ||
           device->physical_device->rad_info.family == CHIP_BONAIRE) &&
          radv_pipeline_has_gs(pipeline))
         ia_multi_vgt_param.partial_vs_wave = true;
      /* Needed for 028B6C_DISTRIBUTION_MODE != 0 */
      if (device->physical_device->rad_info.has_distributed_tess) {
         if (radv_pipeline_has_gs(pipeline)) {
            if (device->physical_device->rad_info.chip_class <= GFX8)
               ia_multi_vgt_param.partial_es_wave = true;
         } else {
            ia_multi_vgt_param.partial_vs_wave = true;
         }
      }
   }

   if (radv_pipeline_has_gs(pipeline)) {
      /* On these chips there is the possibility of a hang if the
       * pipeline uses a GS and partial_vs_wave is not set.
       *
       * This mostly does not hit 4-SE chips, as those typically set
       * ia_switch_on_eoi and then partial_vs_wave is set for pipelines
       * with GS due to another workaround.
       *
       * Reproducer: https://bugs.freedesktop.org/show_bug.cgi?id=109242
       */
      if (device->physical_device->rad_info.family == CHIP_TONGA ||
          device->physical_device->rad_info.family == CHIP_FIJI ||
          device->physical_device->rad_info.family == CHIP_POLARIS10 ||
          device->physical_device->rad_info.family == CHIP_POLARIS11 ||
          device->physical_device->rad_info.family == CHIP_POLARIS12 ||
          device->physical_device->rad_info.family == CHIP_VEGAM) {
         ia_multi_vgt_param.partial_vs_wave = true;
      }
   }

   ia_multi_vgt_param.base =
      S_028AA8_PRIMGROUP_SIZE(ia_multi_vgt_param.primgroup_size - 1) |
      /* The following field was moved to VGT_SHADER_STAGES_EN in GFX9. */
      S_028AA8_MAX_PRIMGRP_IN_WAVE(device->physical_device->rad_info.chip_class == GFX8 ? 2 : 0) |
      S_030960_EN_INST_OPT_BASIC(device->physical_device->rad_info.chip_class >= GFX9) |
      S_030960_EN_INST_OPT_ADV(device->physical_device->rad_info.chip_class >= GFX9);

   return ia_multi_vgt_param;
}

static void
radv_pipeline_init_input_assembly_state(struct radv_pipeline *pipeline,
                                        const VkGraphicsPipelineCreateInfo *pCreateInfo,
                                        const struct radv_graphics_pipeline_create_info *extra)
{
   const VkPipelineInputAssemblyStateCreateInfo *ia_state = pCreateInfo->pInputAssemblyState;
   struct radv_shader_variant *tes = pipeline->shaders[MESA_SHADER_TESS_EVAL];
   struct radv_shader_variant *gs = pipeline->shaders[MESA_SHADER_GEOMETRY];

   pipeline->graphics.can_use_guardband = radv_prim_can_use_guardband(ia_state->topology);

   if (radv_pipeline_has_gs(pipeline)) {
      if (si_conv_gl_prim_to_gs_out(gs->info.gs.output_prim) == V_028A6C_TRISTRIP)
         pipeline->graphics.can_use_guardband = true;
   } else if (radv_pipeline_has_tess(pipeline)) {
      if (!tes->info.tes.point_mode &&
          si_conv_gl_prim_to_gs_out(tes->info.tes.primitive_mode) == V_028A6C_TRISTRIP)
         pipeline->graphics.can_use_guardband = true;
   }

   if (extra && extra->use_rectlist) {
      pipeline->graphics.can_use_guardband = true;
   }

   pipeline->graphics.ia_multi_vgt_param = radv_compute_ia_multi_vgt_param_helpers(pipeline);
}

static void
radv_pipeline_init_dynamic_state(struct radv_pipeline *pipeline,
                                 const VkGraphicsPipelineCreateInfo *pCreateInfo,
                                 const struct radv_graphics_pipeline_create_info *extra)
{
   uint64_t needed_states = radv_pipeline_needed_dynamic_state(pCreateInfo);
   uint64_t states = needed_states;
   RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
   struct radv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];

   pipeline->dynamic_state = default_dynamic_state;
   pipeline->graphics.needed_dynamic_state = needed_states;

   if (pCreateInfo->pDynamicState) {
      /* Remove all of the states that are marked as dynamic */
      uint32_t count = pCreateInfo->pDynamicState->dynamicStateCount;
      for (uint32_t s = 0; s < count; s++)
         states &= ~radv_dynamic_state_mask(pCreateInfo->pDynamicState->pDynamicStates[s]);
   }

   struct radv_dynamic_state *dynamic = &pipeline->dynamic_state;

   if (needed_states & RADV_DYNAMIC_VIEWPORT) {
      assert(pCreateInfo->pViewportState);

      dynamic->viewport.count = pCreateInfo->pViewportState->viewportCount;
      if (states & RADV_DYNAMIC_VIEWPORT) {
         typed_memcpy(dynamic->viewport.viewports, pCreateInfo->pViewportState->pViewports,
                      pCreateInfo->pViewportState->viewportCount);
         for (unsigned i = 0; i < dynamic->viewport.count; i++)
            radv_get_viewport_xform(&dynamic->viewport.viewports[i],
                                    dynamic->viewport.xform[i].scale,
                                    dynamic->viewport.xform[i].translate);
      }
   }

   if (needed_states & RADV_DYNAMIC_SCISSOR) {
      dynamic->scissor.count = pCreateInfo->pViewportState->scissorCount;
      if (states & RADV_DYNAMIC_SCISSOR) {
         typed_memcpy(dynamic->scissor.scissors, pCreateInfo->pViewportState->pScissors,
                      pCreateInfo->pViewportState->scissorCount);
      }
   }

   if (states & RADV_DYNAMIC_LINE_WIDTH) {
      assert(pCreateInfo->pRasterizationState);
      dynamic->line_width = pCreateInfo->pRasterizationState->lineWidth;
   }

   if (states & RADV_DYNAMIC_DEPTH_BIAS) {
assert(pCreateInfo->pRasterizationState); 1548 dynamic->depth_bias.bias = pCreateInfo->pRasterizationState->depthBiasConstantFactor; 1549 dynamic->depth_bias.clamp = pCreateInfo->pRasterizationState->depthBiasClamp; 1550 dynamic->depth_bias.slope = pCreateInfo->pRasterizationState->depthBiasSlopeFactor; 1551 } 1552 1553 /* Section 9.2 of the Vulkan 1.0.15 spec says: 1554 * 1555 * pColorBlendState is [...] NULL if the pipeline has rasterization 1556 * disabled or if the subpass of the render pass the pipeline is 1557 * created against does not use any color attachments. 1558 */ 1559 if (states & RADV_DYNAMIC_BLEND_CONSTANTS) { 1560 assert(pCreateInfo->pColorBlendState); 1561 typed_memcpy(dynamic->blend_constants, pCreateInfo->pColorBlendState->blendConstants, 4); 1562 } 1563 1564 if (states & RADV_DYNAMIC_CULL_MODE) { 1565 dynamic->cull_mode = pCreateInfo->pRasterizationState->cullMode; 1566 } 1567 1568 if (states & RADV_DYNAMIC_FRONT_FACE) { 1569 dynamic->front_face = pCreateInfo->pRasterizationState->frontFace; 1570 } 1571 1572 if (states & RADV_DYNAMIC_PRIMITIVE_TOPOLOGY) { 1573 dynamic->primitive_topology = si_translate_prim(pCreateInfo->pInputAssemblyState->topology); 1574 if (extra && extra->use_rectlist) { 1575 dynamic->primitive_topology = V_008958_DI_PT_RECTLIST; 1576 } 1577 } 1578 1579 /* If there is no depthstencil attachment, then don't read 1580 * pDepthStencilState. The Vulkan spec states that pDepthStencilState may 1581 * be NULL in this case. Even if pDepthStencilState is non-NULL, there is 1582 * no need to override the depthstencil defaults in 1583 * radv_pipeline::dynamic_state when there is no depthstencil attachment. 1584 * 1585 * Section 9.2 of the Vulkan 1.0.15 spec says: 1586 * 1587 * pDepthStencilState is [...] NULL if the pipeline has rasterization 1588 * disabled or if the subpass of the render pass the pipeline is created 1589 * against does not use a depth/stencil attachment. 
1590 */ 1591 if (needed_states && subpass->depth_stencil_attachment) { 1592 if (states & RADV_DYNAMIC_DEPTH_BOUNDS) { 1593 dynamic->depth_bounds.min = pCreateInfo->pDepthStencilState->minDepthBounds; 1594 dynamic->depth_bounds.max = pCreateInfo->pDepthStencilState->maxDepthBounds; 1595 } 1596 1597 if (states & RADV_DYNAMIC_STENCIL_COMPARE_MASK) { 1598 dynamic->stencil_compare_mask.front = pCreateInfo->pDepthStencilState->front.compareMask; 1599 dynamic->stencil_compare_mask.back = pCreateInfo->pDepthStencilState->back.compareMask; 1600 } 1601 1602 if (states & RADV_DYNAMIC_STENCIL_WRITE_MASK) { 1603 dynamic->stencil_write_mask.front = pCreateInfo->pDepthStencilState->front.writeMask; 1604 dynamic->stencil_write_mask.back = pCreateInfo->pDepthStencilState->back.writeMask; 1605 } 1606 1607 if (states & RADV_DYNAMIC_STENCIL_REFERENCE) { 1608 dynamic->stencil_reference.front = pCreateInfo->pDepthStencilState->front.reference; 1609 dynamic->stencil_reference.back = pCreateInfo->pDepthStencilState->back.reference; 1610 } 1611 1612 if (states & RADV_DYNAMIC_DEPTH_TEST_ENABLE) { 1613 dynamic->depth_test_enable = pCreateInfo->pDepthStencilState->depthTestEnable; 1614 } 1615 1616 if (states & RADV_DYNAMIC_DEPTH_WRITE_ENABLE) { 1617 dynamic->depth_write_enable = pCreateInfo->pDepthStencilState->depthWriteEnable; 1618 } 1619 1620 if (states & RADV_DYNAMIC_DEPTH_COMPARE_OP) { 1621 dynamic->depth_compare_op = pCreateInfo->pDepthStencilState->depthCompareOp; 1622 } 1623 1624 if (states & RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE) { 1625 dynamic->depth_bounds_test_enable = pCreateInfo->pDepthStencilState->depthBoundsTestEnable; 1626 } 1627 1628 if (states & RADV_DYNAMIC_STENCIL_TEST_ENABLE) { 1629 dynamic->stencil_test_enable = pCreateInfo->pDepthStencilState->stencilTestEnable; 1630 } 1631 1632 if (states & RADV_DYNAMIC_STENCIL_OP) { 1633 dynamic->stencil_op.front.compare_op = pCreateInfo->pDepthStencilState->front.compareOp; 1634 dynamic->stencil_op.front.fail_op = pCreateInfo->pDepthStencilState->front.failOp; 1635 dynamic->stencil_op.front.pass_op = pCreateInfo->pDepthStencilState->front.passOp; 1636 dynamic->stencil_op.front.depth_fail_op = 1637 pCreateInfo->pDepthStencilState->front.depthFailOp; 1638 1639 dynamic->stencil_op.back.compare_op = pCreateInfo->pDepthStencilState->back.compareOp; 1640 dynamic->stencil_op.back.fail_op = pCreateInfo->pDepthStencilState->back.failOp; 1641 dynamic->stencil_op.back.pass_op = pCreateInfo->pDepthStencilState->back.passOp; 1642 dynamic->stencil_op.back.depth_fail_op = pCreateInfo->pDepthStencilState->back.depthFailOp; 1643 } 1644 } 1645 1646 const VkPipelineDiscardRectangleStateCreateInfoEXT *discard_rectangle_info = 1647 vk_find_struct_const(pCreateInfo->pNext, PIPELINE_DISCARD_RECTANGLE_STATE_CREATE_INFO_EXT); 1648 if (needed_states & RADV_DYNAMIC_DISCARD_RECTANGLE) { 1649 dynamic->discard_rectangle.count = discard_rectangle_info->discardRectangleCount; 1650 if (states & RADV_DYNAMIC_DISCARD_RECTANGLE) { 1651 typed_memcpy(dynamic->discard_rectangle.rectangles, 1652 discard_rectangle_info->pDiscardRectangles, 1653 discard_rectangle_info->discardRectangleCount); 1654 } 1655 } 1656 1657 if (needed_states & RADV_DYNAMIC_SAMPLE_LOCATIONS) { 1658 const VkPipelineSampleLocationsStateCreateInfoEXT *sample_location_info = 1659 vk_find_struct_const(pCreateInfo->pMultisampleState->pNext, 1660 PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT); 1661 /* If sampleLocationsEnable is VK_FALSE, the default sample 1662 * locations are used and the values specified in 1663 * 
sampleLocationsInfo are ignored. 1664 */ 1665 if (sample_location_info->sampleLocationsEnable) { 1666 const VkSampleLocationsInfoEXT *pSampleLocationsInfo = 1667 &sample_location_info->sampleLocationsInfo; 1668 1669 assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS); 1670 1671 dynamic->sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel; 1672 dynamic->sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize; 1673 dynamic->sample_location.count = pSampleLocationsInfo->sampleLocationsCount; 1674 typed_memcpy(&dynamic->sample_location.locations[0], 1675 pSampleLocationsInfo->pSampleLocations, 1676 pSampleLocationsInfo->sampleLocationsCount); 1677 } 1678 } 1679 1680 const VkPipelineRasterizationLineStateCreateInfoEXT *rast_line_info = vk_find_struct_const( 1681 pCreateInfo->pRasterizationState->pNext, PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT); 1682 if (needed_states & RADV_DYNAMIC_LINE_STIPPLE) { 1683 dynamic->line_stipple.factor = rast_line_info->lineStippleFactor; 1684 dynamic->line_stipple.pattern = rast_line_info->lineStipplePattern; 1685 } 1686 1687 if (!(states & RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE) || 1688 !(states & RADV_DYNAMIC_VERTEX_INPUT)) 1689 pipeline->graphics.uses_dynamic_stride = true; 1690 1691 const VkPipelineFragmentShadingRateStateCreateInfoKHR *shading_rate = vk_find_struct_const( 1692 pCreateInfo->pNext, PIPELINE_FRAGMENT_SHADING_RATE_STATE_CREATE_INFO_KHR); 1693 if (states & RADV_DYNAMIC_FRAGMENT_SHADING_RATE) { 1694 dynamic->fragment_shading_rate.size = shading_rate->fragmentSize; 1695 for (int i = 0; i < 2; i++) 1696 dynamic->fragment_shading_rate.combiner_ops[i] = shading_rate->combinerOps[i]; 1697 } 1698 1699 if (states & RADV_DYNAMIC_DEPTH_BIAS_ENABLE) { 1700 dynamic->depth_bias_enable = pCreateInfo->pRasterizationState->depthBiasEnable; 1701 } 1702 1703 if (states & RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE) { 1704 dynamic->primitive_restart_enable = 1705 !!pCreateInfo->pInputAssemblyState->primitiveRestartEnable; 1706 } 1707 1708 if (states & RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE) { 1709 dynamic->rasterizer_discard_enable = 1710 pCreateInfo->pRasterizationState->rasterizerDiscardEnable; 1711 } 1712 1713 if (subpass->has_color_att && states & RADV_DYNAMIC_LOGIC_OP) { 1714 if (pCreateInfo->pColorBlendState->logicOpEnable) { 1715 dynamic->logic_op = si_translate_blend_logic_op(pCreateInfo->pColorBlendState->logicOp); 1716 } else { 1717 dynamic->logic_op = V_028808_ROP3_COPY; 1718 } 1719 } 1720 1721 if (states & RADV_DYNAMIC_COLOR_WRITE_ENABLE) { 1722 const VkPipelineColorWriteCreateInfoEXT *color_write_info = vk_find_struct_const( 1723 pCreateInfo->pColorBlendState->pNext, PIPELINE_COLOR_WRITE_CREATE_INFO_EXT); 1724 if (color_write_info) { 1725 dynamic->color_write_enable = 0; 1726 for (uint32_t i = 0; i < color_write_info->attachmentCount; i++) { 1727 dynamic->color_write_enable |= 1728 color_write_info->pColorWriteEnables[i] ? 
(0xfu << (i * 4)) : 0; 1729 } 1730 } 1731 } 1732 1733 pipeline->dynamic_state.mask = states; 1734} 1735 1736static void 1737radv_pipeline_init_raster_state(struct radv_pipeline *pipeline, 1738 const VkGraphicsPipelineCreateInfo *pCreateInfo) 1739{ 1740 const VkPipelineRasterizationStateCreateInfo *raster_info = pCreateInfo->pRasterizationState; 1741 const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *provoking_vtx_info = 1742 vk_find_struct_const(raster_info->pNext, 1743 PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT); 1744 bool provoking_vtx_last = false; 1745 1746 if (provoking_vtx_info && 1747 provoking_vtx_info->provokingVertexMode == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT) { 1748 provoking_vtx_last = true; 1749 } 1750 1751 pipeline->graphics.pa_su_sc_mode_cntl = 1752 S_028814_FACE(raster_info->frontFace) | 1753 S_028814_CULL_FRONT(!!(raster_info->cullMode & VK_CULL_MODE_FRONT_BIT)) | 1754 S_028814_CULL_BACK(!!(raster_info->cullMode & VK_CULL_MODE_BACK_BIT)) | 1755 S_028814_POLY_MODE(raster_info->polygonMode != VK_POLYGON_MODE_FILL) | 1756 S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(raster_info->polygonMode)) | 1757 S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(raster_info->polygonMode)) | 1758 S_028814_POLY_OFFSET_FRONT_ENABLE(raster_info->depthBiasEnable ? 1 : 0) | 1759 S_028814_POLY_OFFSET_BACK_ENABLE(raster_info->depthBiasEnable ? 1 : 0) | 1760 S_028814_POLY_OFFSET_PARA_ENABLE(raster_info->depthBiasEnable ? 1 : 0) | 1761 S_028814_PROVOKING_VTX_LAST(provoking_vtx_last); 1762 1763 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) { 1764 /* It should also be set if PERPENDICULAR_ENDCAP_ENA is set. */ 1765 pipeline->graphics.pa_su_sc_mode_cntl |= 1766 S_028814_KEEP_TOGETHER_ENABLE(raster_info->polygonMode != VK_POLYGON_MODE_FILL); 1767 } 1768 1769 bool depth_clip_disable = raster_info->depthClampEnable; 1770 const VkPipelineRasterizationDepthClipStateCreateInfoEXT *depth_clip_state = 1771 vk_find_struct_const(raster_info->pNext, 1772 PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT); 1773 if (depth_clip_state) { 1774 depth_clip_disable = !depth_clip_state->depthClipEnable; 1775 } 1776 1777 pipeline->graphics.pa_cl_clip_cntl = 1778 S_028810_DX_CLIP_SPACE_DEF(1) | // vulkan uses DX conventions. 1779 S_028810_ZCLIP_NEAR_DISABLE(depth_clip_disable ? 1 : 0) | 1780 S_028810_ZCLIP_FAR_DISABLE(depth_clip_disable ? 1 : 0) | 1781 S_028810_DX_RASTERIZATION_KILL(raster_info->rasterizerDiscardEnable ? 
1 : 0) | 1782 S_028810_DX_LINEAR_ATTR_CLIP_ENA(1); 1783 1784 pipeline->graphics.uses_conservative_overestimate = 1785 radv_get_conservative_raster_mode(pCreateInfo->pRasterizationState) == 1786 VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT; 1787} 1788 1789static void 1790radv_pipeline_init_depth_stencil_state(struct radv_pipeline *pipeline, 1791 const VkGraphicsPipelineCreateInfo *pCreateInfo) 1792{ 1793 const VkPipelineDepthStencilStateCreateInfo *ds_info = 1794 radv_pipeline_get_depth_stencil_state(pCreateInfo); 1795 RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass); 1796 struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass; 1797 struct radv_render_pass_attachment *attachment = NULL; 1798 uint32_t db_depth_control = 0; 1799 1800 if (subpass->depth_stencil_attachment) 1801 attachment = pass->attachments + subpass->depth_stencil_attachment->attachment; 1802 1803 bool has_depth_attachment = attachment && vk_format_has_depth(attachment->format); 1804 bool has_stencil_attachment = attachment && vk_format_has_stencil(attachment->format); 1805 1806 if (ds_info) { 1807 if (has_depth_attachment) { 1808 db_depth_control = S_028800_Z_ENABLE(ds_info->depthTestEnable ? 1 : 0) | 1809 S_028800_Z_WRITE_ENABLE(ds_info->depthWriteEnable ? 1 : 0) | 1810 S_028800_ZFUNC(ds_info->depthCompareOp) | 1811 S_028800_DEPTH_BOUNDS_ENABLE(ds_info->depthBoundsTestEnable ? 1 : 0); 1812 } 1813 1814 if (has_stencil_attachment && ds_info->stencilTestEnable) { 1815 db_depth_control |= S_028800_STENCIL_ENABLE(1) | S_028800_BACKFACE_ENABLE(1); 1816 db_depth_control |= S_028800_STENCILFUNC(ds_info->front.compareOp); 1817 db_depth_control |= S_028800_STENCILFUNC_BF(ds_info->back.compareOp); 1818 } 1819 } 1820 1821 pipeline->graphics.db_depth_control = db_depth_control; 1822} 1823 1824static void 1825gfx9_get_gs_info(const struct radv_pipeline_key *key, const struct radv_pipeline *pipeline, 1826 nir_shader **nir, struct radv_shader_info *infos, struct gfx9_gs_info *out) 1827{ 1828 struct radv_shader_info *gs_info = &infos[MESA_SHADER_GEOMETRY]; 1829 struct radv_es_output_info *es_info; 1830 bool has_tess = !!nir[MESA_SHADER_TESS_CTRL]; 1831 if (pipeline->device->physical_device->rad_info.chip_class >= GFX9) 1832 es_info = has_tess ? &gs_info->tes.es_info : &gs_info->vs.es_info; 1833 else 1834 es_info = has_tess ? &infos[MESA_SHADER_TESS_EVAL].tes.es_info 1835 : &infos[MESA_SHADER_VERTEX].vs.es_info; 1836 1837 unsigned gs_num_invocations = MAX2(gs_info->gs.invocations, 1); 1838 bool uses_adjacency; 1839 switch (key->vs.topology) { 1840 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY: 1841 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY: 1842 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY: 1843 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY: 1844 uses_adjacency = true; 1845 break; 1846 default: 1847 uses_adjacency = false; 1848 break; 1849 } 1850 1851 /* All these are in dwords: */ 1852 /* We can't allow using the whole LDS, because GS waves compete with 1853 * other shader stages for LDS space. 
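* (The 8K-dword cap below is 32 KiB; e.g. a 4-dword esgs_itemsize with the 255-vertex worst case needs only 1020 dwords.)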
*/ 1854 const unsigned max_lds_size = 8 * 1024; 1855 const unsigned esgs_itemsize = es_info->esgs_itemsize / 4; 1856 unsigned esgs_lds_size; 1857 1858 /* All these are per subgroup: */ 1859 const unsigned max_out_prims = 32 * 1024; 1860 const unsigned max_es_verts = 255; 1861 const unsigned ideal_gs_prims = 64; 1862 unsigned max_gs_prims, gs_prims; 1863 unsigned min_es_verts, es_verts, worst_case_es_verts; 1864 1865 if (uses_adjacency || gs_num_invocations > 1) 1866 max_gs_prims = 127 / gs_num_invocations; 1867 else 1868 max_gs_prims = 255; 1869 1870 /* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations. 1871 * Make sure we don't go over the maximum value. 1872 */ 1873 if (gs_info->gs.vertices_out > 0) { 1874 max_gs_prims = 1875 MIN2(max_gs_prims, max_out_prims / (gs_info->gs.vertices_out * gs_num_invocations)); 1876 } 1877 assert(max_gs_prims > 0); 1878 1879 /* If the primitive has adjacency, halve the number of vertices 1880 * that will be reused in multiple primitives. 1881 */ 1882 min_es_verts = gs_info->gs.vertices_in / (uses_adjacency ? 2 : 1); 1883 1884 gs_prims = MIN2(ideal_gs_prims, max_gs_prims); 1885 worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts); 1886 1887 /* Compute ESGS LDS size based on the worst case number of ES vertices 1888 * needed to create the target number of GS prims per subgroup. 1889 */ 1890 esgs_lds_size = esgs_itemsize * worst_case_es_verts; 1891 1892 /* If total LDS usage is too big, refactor partitions based on ratio 1893 * of ESGS item sizes. 1894 */ 1895 if (esgs_lds_size > max_lds_size) { 1896 /* Our target GS Prims Per Subgroup was too large. Calculate 1897 * the maximum number of GS Prims Per Subgroup that will fit 1898 * into LDS, capped by the maximum that the hardware can support. 1899 */ 1900 gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)), max_gs_prims); 1901 assert(gs_prims > 0); 1902 worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts); 1903 1904 esgs_lds_size = esgs_itemsize * worst_case_es_verts; 1905 assert(esgs_lds_size <= max_lds_size); 1906 } 1907 1908 /* Now calculate remaining ESGS information. */ 1909 if (esgs_lds_size) 1910 es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts); 1911 else 1912 es_verts = max_es_verts; 1913 1914 /* Vertices for adjacency primitives are not always reused, so restore 1915 * it for ES_VERTS_PER_SUBGRP. 1916 */ 1917 min_es_verts = gs_info->gs.vertices_in; 1918 1919 /* For normal primitives, the VGT only checks if they are past the ES 1920 * verts per subgroup after allocating a full GS primitive and if they 1921 * are, kick off a new subgroup. But if those additional ES verts are 1922 * unique (e.g. not reused) we need to make sure there is enough LDS 1923 * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP. 
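* Subtracting (min_es_verts - 1) below leaves that head-room within the LDS size computed above.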
1924 */ 1925 es_verts -= min_es_verts - 1; 1926 1927 uint32_t es_verts_per_subgroup = es_verts; 1928 uint32_t gs_prims_per_subgroup = gs_prims; 1929 uint32_t gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations; 1930 uint32_t max_prims_per_subgroup = gs_inst_prims_in_subgroup * gs_info->gs.vertices_out; 1931 out->lds_size = align(esgs_lds_size, 128) / 128; 1932 out->vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(es_verts_per_subgroup) | 1933 S_028A44_GS_PRIMS_PER_SUBGRP(gs_prims_per_subgroup) | 1934 S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_inst_prims_in_subgroup); 1935 out->vgt_gs_max_prims_per_subgroup = S_028A94_MAX_PRIMS_PER_SUBGROUP(max_prims_per_subgroup); 1936 out->vgt_esgs_ring_itemsize = esgs_itemsize; 1937 assert(max_prims_per_subgroup <= max_out_prims); 1938 1939 gl_shader_stage es_stage = has_tess ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX; 1940 unsigned workgroup_size = 1941 ac_compute_esgs_workgroup_size( 1942 pipeline->device->physical_device->rad_info.chip_class, infos[es_stage].wave_size, 1943 es_verts_per_subgroup, gs_inst_prims_in_subgroup); 1944 infos[es_stage].workgroup_size = workgroup_size; 1945 infos[MESA_SHADER_GEOMETRY].workgroup_size = workgroup_size; 1946} 1947 1948static void 1949clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts, unsigned min_verts_per_prim, 1950 bool use_adjacency) 1951{ 1952 unsigned max_reuse = max_esverts - min_verts_per_prim; 1953 if (use_adjacency) 1954 max_reuse /= 2; 1955 *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse); 1956} 1957 1958static unsigned 1959radv_get_num_input_vertices(nir_shader **nir) 1960{ 1961 if (nir[MESA_SHADER_GEOMETRY]) { 1962 nir_shader *gs = nir[MESA_SHADER_GEOMETRY]; 1963 1964 return gs->info.gs.vertices_in; 1965 } 1966 1967 if (nir[MESA_SHADER_TESS_CTRL]) { 1968 nir_shader *tes = nir[MESA_SHADER_TESS_EVAL]; 1969 1970 if (tes->info.tess.point_mode) 1971 return 1; 1972 if (tes->info.tess.primitive_mode == GL_ISOLINES) 1973 return 2; 1974 return 3; 1975 } 1976 1977 return 3; 1978} 1979 1980static void 1981gfx10_emit_ge_pc_alloc(struct radeon_cmdbuf *cs, enum chip_class chip_class, uint32_t oversub_pc_lines) 1982{ 1983 radeon_set_uconfig_reg( 1984 cs, R_030980_GE_PC_ALLOC, 1985 S_030980_OVERSUB_EN(oversub_pc_lines > 0) | S_030980_NUM_PC_LINES(oversub_pc_lines - 1)); 1986} 1987 1988static void 1989gfx10_get_ngg_info(const struct radv_pipeline_key *key, struct radv_pipeline *pipeline, 1990 nir_shader **nir, struct radv_shader_info *infos, struct gfx10_ngg_info *ngg) 1991{ 1992 struct radv_shader_info *gs_info = &infos[MESA_SHADER_GEOMETRY]; 1993 struct radv_es_output_info *es_info = 1994 nir[MESA_SHADER_TESS_CTRL] ? &gs_info->tes.es_info : &gs_info->vs.es_info; 1995 unsigned gs_type = nir[MESA_SHADER_GEOMETRY] ? MESA_SHADER_GEOMETRY : MESA_SHADER_VERTEX; 1996 unsigned max_verts_per_prim = radv_get_num_input_vertices(nir); 1997 unsigned min_verts_per_prim = gs_type == MESA_SHADER_GEOMETRY ? max_verts_per_prim : 1; 1998 unsigned gs_num_invocations = nir[MESA_SHADER_GEOMETRY] ? 
MAX2(gs_info->gs.invocations, 1) : 1; 1999 bool uses_adjacency; 2000 switch (key->vs.topology) { 2001 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY: 2002 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY: 2003 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY: 2004 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY: 2005 uses_adjacency = true; 2006 break; 2007 default: 2008 uses_adjacency = false; 2009 break; 2010 } 2011 2012 /* All these are in dwords: */ 2013 /* We can't allow using the whole LDS, because GS waves compete with 2014 * other shader stages for LDS space. 2015 * 2016 * TODO: We should really take the shader's internal LDS use into 2017 * account. The linker will fail if the size is greater than 2018 * 8K dwords. 2019 */ 2020 const unsigned max_lds_size = 8 * 1024 - 768; 2021 const unsigned target_lds_size = max_lds_size; 2022 unsigned esvert_lds_size = 0; 2023 unsigned gsprim_lds_size = 0; 2024 2025 /* All these are per subgroup: */ 2026 const unsigned min_esverts = 2027 pipeline->device->physical_device->rad_info.chip_class >= GFX10_3 ? 29 : 24; 2028 bool max_vert_out_per_gs_instance = false; 2029 unsigned max_esverts_base = 128; 2030 unsigned max_gsprims_base = 128; /* default prim group size clamp */ 2031 2032 /* Hardware has the following non-natural restrictions on the value 2033 * of GE_CNTL.VERT_GRP_SIZE based on the primitive type of 2034 * the draw: 2035 * - at most 252 for any line input primitive type 2036 * - at most 251 for any quad input primitive type 2037 * - at most 251 for triangle strips with adjacency (this happens to 2038 * be the natural limit for triangle *lists* with adjacency) 2039 */ 2040 max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1); 2041 2042 if (gs_type == MESA_SHADER_GEOMETRY) { 2043 unsigned max_out_verts_per_gsprim = gs_info->gs.vertices_out * gs_num_invocations; 2044 2045 if (max_out_verts_per_gsprim <= 256) { 2046 if (max_out_verts_per_gsprim) { 2047 max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim); 2048 } 2049 } else { 2050 /* Use special multi-cycling mode in which each GS 2051 * instance gets its own subgroup. Does not work with 2052 * tessellation. */ 2053 max_vert_out_per_gs_instance = true; 2054 max_gsprims_base = 1; 2055 max_out_verts_per_gsprim = gs_info->gs.vertices_out; 2056 } 2057 2058 esvert_lds_size = es_info->esgs_itemsize / 4; 2059 gsprim_lds_size = (gs_info->gs.gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim; 2060 } else { 2061 /* VS and TES. */ 2062 /* LDS size for passing data from GS to ES. */ 2063 struct radv_streamout_info *so_info = nir[MESA_SHADER_TESS_CTRL] 2064 ? &infos[MESA_SHADER_TESS_EVAL].so 2065 : &infos[MESA_SHADER_VERTEX].so; 2066 2067 if (so_info->num_outputs) 2068 esvert_lds_size = 4 * so_info->num_outputs + 1; 2069 2070 /* GS stores Primitive IDs (one DWORD) into LDS at the address 2071 * corresponding to the ES thread of the provoking vertex. All 2072 * ES threads load and export PrimitiveID for their thread.
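* The MAX2(esvert_lds_size, 1) below reserves that one dword per ES vertex.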
2073 */ 2074 if (!nir[MESA_SHADER_TESS_CTRL] && infos[MESA_SHADER_VERTEX].vs.outinfo.export_prim_id) 2075 esvert_lds_size = MAX2(esvert_lds_size, 1); 2076 } 2077 2078 unsigned max_gsprims = max_gsprims_base; 2079 unsigned max_esverts = max_esverts_base; 2080 2081 if (esvert_lds_size) 2082 max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size); 2083 if (gsprim_lds_size) 2084 max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size); 2085 2086 max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); 2087 clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency); 2088 assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); 2089 2090 if (esvert_lds_size || gsprim_lds_size) { 2091 /* Now that we have a rough proportionality between esverts 2092 * and gsprims based on the primitive type, scale both of them 2093 * down simultaneously based on required LDS space. 2094 * 2095 * We could be smarter about this if we knew how much vertex 2096 * reuse to expect. 2097 */ 2098 unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size; 2099 if (lds_total > target_lds_size) { 2100 max_esverts = max_esverts * target_lds_size / lds_total; 2101 max_gsprims = max_gsprims * target_lds_size / lds_total; 2102 2103 max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); 2104 clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency); 2105 assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); 2106 } 2107 } 2108 2109 /* Round up towards full wave sizes for better ALU utilization. */ 2110 if (!max_vert_out_per_gs_instance) { 2111 unsigned orig_max_esverts; 2112 unsigned orig_max_gsprims; 2113 unsigned wavesize; 2114 2115 if (gs_type == MESA_SHADER_GEOMETRY) { 2116 wavesize = gs_info->wave_size; 2117 } else { 2118 wavesize = nir[MESA_SHADER_TESS_CTRL] ? infos[MESA_SHADER_TESS_EVAL].wave_size 2119 : infos[MESA_SHADER_VERTEX].wave_size; 2120 } 2121 2122 do { 2123 orig_max_esverts = max_esverts; 2124 orig_max_gsprims = max_gsprims; 2125 2126 max_esverts = align(max_esverts, wavesize); 2127 max_esverts = MIN2(max_esverts, max_esverts_base); 2128 if (esvert_lds_size) 2129 max_esverts = 2130 MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size); 2131 max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); 2132 2133 /* Hardware restriction: minimum value of max_esverts */ 2134 if (pipeline->device->physical_device->rad_info.chip_class == GFX10) 2135 max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim); 2136 else 2137 max_esverts = MAX2(max_esverts, min_esverts); 2138 2139 max_gsprims = align(max_gsprims, wavesize); 2140 max_gsprims = MIN2(max_gsprims, max_gsprims_base); 2141 if (gsprim_lds_size) { 2142 /* Don't count unusable vertices to the LDS 2143 * size. Those are vertices above the maximum 2144 * number of vertices that can occur in the 2145 * workgroup, which is e.g. max_gsprims * 3 2146 * for triangles. 2147 */ 2148 unsigned usable_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); 2149 max_gsprims = MIN2(max_gsprims, 2150 (max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size); 2151 } 2152 clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency); 2153 assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); 2154 } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims); 2155 2156 /* Verify the restriction. 
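* On GFX10 the subgroup must also fit one full primitive past the minimum, matching the max_esverts adjustment above.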
*/ 2157 if (pipeline->device->physical_device->rad_info.chip_class == GFX10) 2158 assert(max_esverts >= min_esverts - 1 + max_verts_per_prim); 2159 else 2160 assert(max_esverts >= min_esverts); 2161 } else { 2162 /* Hardware restriction: minimum value of max_esverts */ 2163 if (pipeline->device->physical_device->rad_info.chip_class == GFX10) 2164 max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim); 2165 else 2166 max_esverts = MAX2(max_esverts, min_esverts); 2167 } 2168 2169 unsigned max_out_vertices = max_vert_out_per_gs_instance ? gs_info->gs.vertices_out 2170 : gs_type == MESA_SHADER_GEOMETRY 2171 ? max_gsprims * gs_num_invocations * gs_info->gs.vertices_out 2172 : max_esverts; 2173 assert(max_out_vertices <= 256); 2174 2175 unsigned prim_amp_factor = 1; 2176 if (gs_type == MESA_SHADER_GEOMETRY) { 2177 /* Number of output primitives per GS input primitive after 2178 * GS instancing. */ 2179 prim_amp_factor = gs_info->gs.vertices_out; 2180 } 2181 2182 /* On Gfx10, the GE only checks against the maximum number of ES verts 2183 * after allocating a full GS primitive. So we need to ensure that 2184 * whenever this check passes, there is enough space for a full 2185 * primitive without vertex reuse. 2186 */ 2187 if (pipeline->device->physical_device->rad_info.chip_class == GFX10) 2188 ngg->hw_max_esverts = max_esverts - max_verts_per_prim + 1; 2189 else 2190 ngg->hw_max_esverts = max_esverts; 2191 2192 ngg->max_gsprims = max_gsprims; 2193 ngg->max_out_verts = max_out_vertices; 2194 ngg->prim_amp_factor = prim_amp_factor; 2195 ngg->max_vert_out_per_gs_instance = max_vert_out_per_gs_instance; 2196 ngg->ngg_emit_size = max_gsprims * gsprim_lds_size; 2197 ngg->enable_vertex_grouping = true; 2198 2199 /* Don't count unusable vertices. */ 2200 ngg->esgs_ring_size = MIN2(max_esverts, max_gsprims * max_verts_per_prim) * esvert_lds_size * 4; 2201 2202 if (gs_type == MESA_SHADER_GEOMETRY) { 2203 ngg->vgt_esgs_ring_itemsize = es_info->esgs_itemsize / 4; 2204 } else { 2205 ngg->vgt_esgs_ring_itemsize = 1; 2206 } 2207 2208 assert(ngg->hw_max_esverts >= min_esverts); /* HW limitation */ 2209 2210 gl_shader_stage es_stage = nir[MESA_SHADER_TESS_CTRL] ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX; 2211 unsigned workgroup_size = 2212 ac_compute_ngg_workgroup_size( 2213 max_esverts, max_gsprims * gs_num_invocations, max_out_vertices, prim_amp_factor); 2214 infos[MESA_SHADER_GEOMETRY].workgroup_size = workgroup_size; 2215 infos[es_stage].workgroup_size = workgroup_size; 2216} 2217 2218static void 2219radv_pipeline_init_gs_ring_state(struct radv_pipeline *pipeline, const struct gfx9_gs_info *gs) 2220{ 2221 struct radv_device *device = pipeline->device; 2222 unsigned num_se = device->physical_device->rad_info.max_se; 2223 unsigned wave_size = 64; 2224 unsigned max_gs_waves = 32 * num_se; /* max 32 per SE on GCN */ 2225 /* On GFX6-GFX7, the value comes from VGT_GS_VERTEX_REUSE = 16. 2226 * On GFX8+, the value comes from VGT_VERTEX_REUSE_BLOCK_CNTL = 30 (+2). 2227 */ 2228 unsigned gs_vertex_reuse = 2229 (device->physical_device->rad_info.chip_class >= GFX8 ? 32 : 16) * num_se; 2230 unsigned alignment = 256 * num_se; 2231 /* The maximum size is 63.999 MB per SE. */ 2232 unsigned max_size = ((unsigned)(63.999 * 1024 * 1024) & ~255) * num_se; 2233 struct radv_shader_info *gs_info = &pipeline->shaders[MESA_SHADER_GEOMETRY]->info; 2234 2235 /* Calculate the minimum size. 
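* vgt_esgs_ring_itemsize is in dwords, hence the multiply by 4 below to get bytes.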
*/ 2236 unsigned min_esgs_ring_size = 2237 align(gs->vgt_esgs_ring_itemsize * 4 * gs_vertex_reuse * wave_size, alignment); 2238 /* These are recommended sizes, not minimum sizes. */ 2239 unsigned esgs_ring_size = 2240 max_gs_waves * 2 * wave_size * gs->vgt_esgs_ring_itemsize * 4 * gs_info->gs.vertices_in; 2241 unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size * gs_info->gs.max_gsvs_emit_size; 2242 2243 min_esgs_ring_size = align(min_esgs_ring_size, alignment); 2244 esgs_ring_size = align(esgs_ring_size, alignment); 2245 gsvs_ring_size = align(gsvs_ring_size, alignment); 2246 2247 if (pipeline->device->physical_device->rad_info.chip_class <= GFX8) 2248 pipeline->graphics.esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size); 2249 2250 pipeline->graphics.gsvs_ring_size = MIN2(gsvs_ring_size, max_size); 2251} 2252 2253struct radv_shader_variant * 2254radv_get_shader(const struct radv_pipeline *pipeline, gl_shader_stage stage) 2255{ 2256 if (stage == MESA_SHADER_VERTEX) { 2257 if (pipeline->shaders[MESA_SHADER_VERTEX]) 2258 return pipeline->shaders[MESA_SHADER_VERTEX]; 2259 if (pipeline->shaders[MESA_SHADER_TESS_CTRL]) 2260 return pipeline->shaders[MESA_SHADER_TESS_CTRL]; 2261 if (pipeline->shaders[MESA_SHADER_GEOMETRY]) 2262 return pipeline->shaders[MESA_SHADER_GEOMETRY]; 2263 } else if (stage == MESA_SHADER_TESS_EVAL) { 2264 if (!radv_pipeline_has_tess(pipeline)) 2265 return NULL; 2266 if (pipeline->shaders[MESA_SHADER_TESS_EVAL]) 2267 return pipeline->shaders[MESA_SHADER_TESS_EVAL]; 2268 if (pipeline->shaders[MESA_SHADER_GEOMETRY]) 2269 return pipeline->shaders[MESA_SHADER_GEOMETRY]; 2270 } 2271 return pipeline->shaders[stage]; 2272} 2273 2274static const struct radv_vs_output_info * 2275get_vs_output_info(const struct radv_pipeline *pipeline) 2276{ 2277 if (radv_pipeline_has_gs(pipeline)) 2278 if (radv_pipeline_has_ngg(pipeline)) 2279 return &pipeline->shaders[MESA_SHADER_GEOMETRY]->info.vs.outinfo; 2280 else 2281 return &pipeline->gs_copy_shader->info.vs.outinfo; 2282 else if (radv_pipeline_has_tess(pipeline)) 2283 return &pipeline->shaders[MESA_SHADER_TESS_EVAL]->info.tes.outinfo; 2284 else 2285 return &pipeline->shaders[MESA_SHADER_VERTEX]->info.vs.outinfo; 2286} 2287 2288static bool 2289radv_nir_stage_uses_xfb(const nir_shader *nir) 2290{ 2291 nir_xfb_info *xfb = nir_gather_xfb_info(nir, NULL); 2292 bool uses_xfb = !!xfb; 2293 2294 ralloc_free(xfb); 2295 return uses_xfb; 2296} 2297 2298static void 2299radv_link_shaders(struct radv_pipeline *pipeline, 2300 const struct radv_pipeline_key *pipeline_key, 2301 nir_shader **shaders, 2302 bool optimize_conservatively) 2303{ 2304 nir_shader *ordered_shaders[MESA_SHADER_STAGES]; 2305 int shader_count = 0; 2306 2307 if (shaders[MESA_SHADER_FRAGMENT]) { 2308 ordered_shaders[shader_count++] = shaders[MESA_SHADER_FRAGMENT]; 2309 } 2310 if (shaders[MESA_SHADER_GEOMETRY]) { 2311 ordered_shaders[shader_count++] = shaders[MESA_SHADER_GEOMETRY]; 2312 } 2313 if (shaders[MESA_SHADER_TESS_EVAL]) { 2314 ordered_shaders[shader_count++] = shaders[MESA_SHADER_TESS_EVAL]; 2315 } 2316 if (shaders[MESA_SHADER_TESS_CTRL]) { 2317 ordered_shaders[shader_count++] = shaders[MESA_SHADER_TESS_CTRL]; 2318 } 2319 if (shaders[MESA_SHADER_VERTEX]) { 2320 ordered_shaders[shader_count++] = shaders[MESA_SHADER_VERTEX]; 2321 } 2322 if (shaders[MESA_SHADER_COMPUTE]) { 2323 ordered_shaders[shader_count++] = shaders[MESA_SHADER_COMPUTE]; 2324 } 2325 2326 bool has_geom_tess = shaders[MESA_SHADER_GEOMETRY] || shaders[MESA_SHADER_TESS_CTRL]; 2327 bool merged_gs = 
shaders[MESA_SHADER_GEOMETRY] && 2328 pipeline->device->physical_device->rad_info.chip_class >= GFX9; 2329 2330 if (!optimize_conservatively && shader_count > 1) { 2331 unsigned first = ordered_shaders[shader_count - 1]->info.stage; 2332 unsigned last = ordered_shaders[0]->info.stage; 2333 2334 if (ordered_shaders[0]->info.stage == MESA_SHADER_FRAGMENT && 2335 ordered_shaders[1]->info.has_transform_feedback_varyings) 2336 nir_link_xfb_varyings(ordered_shaders[1], ordered_shaders[0]); 2337 2338 for (int i = 1; i < shader_count; ++i) { 2339 nir_lower_io_arrays_to_elements(ordered_shaders[i], ordered_shaders[i - 1]); 2340 } 2341 2342 for (int i = 0; i < shader_count; ++i) { 2343 nir_variable_mode mask = 0; 2344 2345 if (ordered_shaders[i]->info.stage != first) 2346 mask = mask | nir_var_shader_in; 2347 2348 if (ordered_shaders[i]->info.stage != last) 2349 mask = mask | nir_var_shader_out; 2350 2351 if (nir_lower_io_to_scalar_early(ordered_shaders[i], mask)) { 2352 /* Optimize the new vector code and then remove dead vars */ 2353 nir_copy_prop(ordered_shaders[i]); 2354 nir_opt_shrink_vectors(ordered_shaders[i], 2355 !pipeline->device->instance->disable_shrink_image_store); 2356 2357 if (ordered_shaders[i]->info.stage != last) { 2358 /* Optimize swizzled movs of load_const for 2359 * nir_link_opt_varyings's constant propagation 2360 */ 2361 nir_opt_constant_folding(ordered_shaders[i]); 2362 /* For nir_link_opt_varyings's duplicate input opt */ 2363 nir_opt_cse(ordered_shaders[i]); 2364 } 2365 2366 /* Run copy-propagation to help remove dead 2367 * output variables (some shaders have useless 2368 * copies to/from an output), so compaction 2369 * later will be more effective. 2370 * 2371 * This will have been done earlier but it might 2372 * not have worked because the outputs were vector. 2373 */ 2374 if (ordered_shaders[i]->info.stage == MESA_SHADER_TESS_CTRL) 2375 nir_opt_copy_prop_vars(ordered_shaders[i]); 2376 2377 nir_opt_dce(ordered_shaders[i]); 2378 nir_remove_dead_variables( 2379 ordered_shaders[i], nir_var_function_temp | nir_var_shader_in | nir_var_shader_out, 2380 NULL); 2381 } 2382 } 2383 } 2384 2385 bool uses_xfb = pipeline->graphics.last_vgt_api_stage != -1 && 2386 radv_nir_stage_uses_xfb(shaders[pipeline->graphics.last_vgt_api_stage]); 2387 if (!uses_xfb && !optimize_conservatively) { 2388 /* Remove PSIZ from shaders when it's not needed. 2389 * This is typically produced by translation layers like Zink or D9VK. 2390 */ 2391 for (unsigned i = 0; i < shader_count; ++i) { 2392 shader_info *info = &ordered_shaders[i]->info; 2393 if (!(info->outputs_written & VARYING_BIT_PSIZ)) 2394 continue; 2395 2396 bool next_stage_needs_psiz = 2397 i != 0 && /* ordered_shaders is backwards, so next stage is: i - 1 */ 2398 ordered_shaders[i - 1]->info.inputs_read & VARYING_BIT_PSIZ; 2399 bool topology_uses_psiz = 2400 info->stage == pipeline->graphics.last_vgt_api_stage && 2401 ((info->stage == MESA_SHADER_VERTEX && pipeline_key->vs.topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST) || 2402 (info->stage == MESA_SHADER_TESS_EVAL && info->tess.point_mode) || 2403 (info->stage == MESA_SHADER_GEOMETRY && info->gs.output_primitive == GL_POINTS)); 2404 2405 nir_variable *psiz_var = 2406 nir_find_variable_with_location(ordered_shaders[i], nir_var_shader_out, VARYING_SLOT_PSIZ); 2407 2408 if (!next_stage_needs_psiz && !topology_uses_psiz && psiz_var) { 2409 /* Change PSIZ to a global variable which allows it to be DCE'd. 
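* Demoting it to nir_var_shader_temp and clearing outputs_written lets the dead-variable and DCE passes below remove it.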
*/ 2410 psiz_var->data.location = 0; 2411 psiz_var->data.mode = nir_var_shader_temp; 2412 2413 info->outputs_written &= ~VARYING_BIT_PSIZ; 2414 nir_fixup_deref_modes(ordered_shaders[i]); 2415 nir_remove_dead_variables(ordered_shaders[i], nir_var_shader_temp, NULL); 2416 nir_opt_dce(ordered_shaders[i]); 2417 } 2418 } 2419 } 2420 2421 for (int i = 1; !optimize_conservatively && (i < shader_count); ++i) { 2422 if (nir_link_opt_varyings(ordered_shaders[i], ordered_shaders[i - 1])) { 2423 nir_opt_constant_folding(ordered_shaders[i - 1]); 2424 nir_opt_algebraic(ordered_shaders[i - 1]); 2425 nir_opt_dce(ordered_shaders[i - 1]); 2426 } 2427 2428 nir_remove_dead_variables(ordered_shaders[i], nir_var_shader_out, NULL); 2429 nir_remove_dead_variables(ordered_shaders[i - 1], nir_var_shader_in, NULL); 2430 2431 bool progress = nir_remove_unused_varyings(ordered_shaders[i], ordered_shaders[i - 1]); 2432 2433 nir_compact_varyings(ordered_shaders[i], ordered_shaders[i - 1], true); 2434 2435 if (ordered_shaders[i]->info.stage == MESA_SHADER_TESS_CTRL || 2436 (ordered_shaders[i]->info.stage == MESA_SHADER_VERTEX && has_geom_tess) || 2437 (ordered_shaders[i]->info.stage == MESA_SHADER_TESS_EVAL && merged_gs)) { 2438 nir_lower_io_to_vector(ordered_shaders[i], nir_var_shader_out); 2439 if (ordered_shaders[i]->info.stage == MESA_SHADER_TESS_CTRL) 2440 nir_vectorize_tess_levels(ordered_shaders[i]); 2441 nir_opt_combine_stores(ordered_shaders[i], nir_var_shader_out); 2442 } 2443 if (ordered_shaders[i - 1]->info.stage == MESA_SHADER_GEOMETRY || 2444 ordered_shaders[i - 1]->info.stage == MESA_SHADER_TESS_CTRL || 2445 ordered_shaders[i - 1]->info.stage == MESA_SHADER_TESS_EVAL) { 2446 nir_lower_io_to_vector(ordered_shaders[i - 1], nir_var_shader_in); 2447 } 2448 2449 if (progress) { 2450 if (nir_lower_global_vars_to_local(ordered_shaders[i])) { 2451 ac_nir_lower_indirect_derefs(ordered_shaders[i], 2452 pipeline->device->physical_device->rad_info.chip_class); 2453 /* remove dead writes, which can remove input loads */ 2454 nir_lower_vars_to_ssa(ordered_shaders[i]); 2455 nir_opt_dce(ordered_shaders[i]); 2456 } 2457 2458 if (nir_lower_global_vars_to_local(ordered_shaders[i - 1])) { 2459 ac_nir_lower_indirect_derefs(ordered_shaders[i - 1], 2460 pipeline->device->physical_device->rad_info.chip_class); 2461 } 2462 } 2463 } 2464} 2465 2466static void 2467radv_set_driver_locations(struct radv_pipeline *pipeline, nir_shader **shaders, 2468 struct radv_shader_info infos[MESA_SHADER_STAGES]) 2469{ 2470 if (shaders[MESA_SHADER_FRAGMENT]) { 2471 nir_foreach_shader_out_variable(var, shaders[MESA_SHADER_FRAGMENT]) 2472 { 2473 var->data.driver_location = var->data.location + var->data.index; 2474 } 2475 } 2476 2477 if (!shaders[MESA_SHADER_VERTEX]) 2478 return; 2479 2480 bool has_tess = shaders[MESA_SHADER_TESS_CTRL]; 2481 bool has_gs = shaders[MESA_SHADER_GEOMETRY]; 2482 2483 /* Merged stage for VS and TES */ 2484 unsigned vs_info_idx = MESA_SHADER_VERTEX; 2485 unsigned tes_info_idx = MESA_SHADER_TESS_EVAL; 2486 2487 if (pipeline->device->physical_device->rad_info.chip_class >= GFX9) { 2488 /* These are merged into the next stage */ 2489 vs_info_idx = has_tess ? MESA_SHADER_TESS_CTRL : MESA_SHADER_GEOMETRY; 2490 tes_info_idx = has_gs ? 
MESA_SHADER_GEOMETRY : MESA_SHADER_TESS_EVAL; 2491 } 2492 2493 nir_foreach_shader_in_variable (var, shaders[MESA_SHADER_VERTEX]) { 2494 var->data.driver_location = var->data.location; 2495 } 2496 2497 if (has_tess) { 2498 nir_linked_io_var_info vs2tcs = nir_assign_linked_io_var_locations( 2499 shaders[MESA_SHADER_VERTEX], shaders[MESA_SHADER_TESS_CTRL]); 2500 nir_linked_io_var_info tcs2tes = nir_assign_linked_io_var_locations( 2501 shaders[MESA_SHADER_TESS_CTRL], shaders[MESA_SHADER_TESS_EVAL]); 2502 2503 infos[MESA_SHADER_VERTEX].vs.num_linked_outputs = vs2tcs.num_linked_io_vars; 2504 infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_inputs = vs2tcs.num_linked_io_vars; 2505 infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_outputs = tcs2tes.num_linked_io_vars; 2506 infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_patch_outputs = tcs2tes.num_linked_patch_io_vars; 2507 infos[MESA_SHADER_TESS_EVAL].tes.num_linked_inputs = tcs2tes.num_linked_io_vars; 2508 infos[MESA_SHADER_TESS_EVAL].tes.num_linked_patch_inputs = tcs2tes.num_linked_patch_io_vars; 2509 2510 /* Copy data to merged stage */ 2511 infos[vs_info_idx].vs.num_linked_outputs = vs2tcs.num_linked_io_vars; 2512 infos[tes_info_idx].tes.num_linked_inputs = tcs2tes.num_linked_io_vars; 2513 infos[tes_info_idx].tes.num_linked_patch_inputs = tcs2tes.num_linked_patch_io_vars; 2514 2515 if (has_gs) { 2516 nir_linked_io_var_info tes2gs = nir_assign_linked_io_var_locations( 2517 shaders[MESA_SHADER_TESS_EVAL], shaders[MESA_SHADER_GEOMETRY]); 2518 2519 infos[MESA_SHADER_TESS_EVAL].tes.num_linked_outputs = tes2gs.num_linked_io_vars; 2520 infos[MESA_SHADER_GEOMETRY].gs.num_linked_inputs = tes2gs.num_linked_io_vars; 2521 2522 /* Copy data to merged stage */ 2523 infos[tes_info_idx].tes.num_linked_outputs = tes2gs.num_linked_io_vars; 2524 } 2525 } else if (has_gs) { 2526 nir_linked_io_var_info vs2gs = nir_assign_linked_io_var_locations( 2527 shaders[MESA_SHADER_VERTEX], shaders[MESA_SHADER_GEOMETRY]); 2528 2529 infos[MESA_SHADER_VERTEX].vs.num_linked_outputs = vs2gs.num_linked_io_vars; 2530 infos[MESA_SHADER_GEOMETRY].gs.num_linked_inputs = vs2gs.num_linked_io_vars; 2531 2532 /* Copy data to merged stage */ 2533 infos[vs_info_idx].vs.num_linked_outputs = vs2gs.num_linked_io_vars; 2534 } 2535 2536 assert(pipeline->graphics.last_vgt_api_stage != MESA_SHADER_NONE); 2537 nir_foreach_shader_out_variable(var, shaders[pipeline->graphics.last_vgt_api_stage]) 2538 { 2539 var->data.driver_location = var->data.location; 2540 } 2541} 2542 2543static uint32_t 2544radv_get_attrib_stride(const VkPipelineVertexInputStateCreateInfo *input_state, 2545 uint32_t attrib_binding) 2546{ 2547 for (uint32_t i = 0; i < input_state->vertexBindingDescriptionCount; i++) { 2548 const VkVertexInputBindingDescription *input_binding = 2549 &input_state->pVertexBindingDescriptions[i]; 2550 2551 if (input_binding->binding == attrib_binding) 2552 return input_binding->stride; 2553 } 2554 2555 return 0; 2556} 2557 2558static struct radv_pipeline_key 2559radv_generate_graphics_pipeline_key(const struct radv_pipeline *pipeline, 2560 const VkGraphicsPipelineCreateInfo *pCreateInfo, 2561 const struct radv_blend_state *blend) 2562{ 2563 RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass); 2564 struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass; 2565 bool uses_dynamic_stride = false; 2566 2567 struct radv_pipeline_key key; 2568 memset(&key, 0, sizeof(key)); 2569 2570 if (pCreateInfo->flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT) 2571 key.optimisations_disabled = 1; 2572 
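/* Shaders need the view index when the subpass uses multiview. */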
2573 key.has_multiview_view_index = !!subpass->view_mask; 2574 2575 if (pCreateInfo->pDynamicState) { 2576 uint32_t count = pCreateInfo->pDynamicState->dynamicStateCount; 2577 for (uint32_t i = 0; i < count; i++) { 2578 if (pCreateInfo->pDynamicState->pDynamicStates[i] == VK_DYNAMIC_STATE_VERTEX_INPUT_EXT) { 2579 key.vs.dynamic_input_state = true; 2580 /* we don't care about use_dynamic_stride in this case */ 2581 break; 2582 } else if (pCreateInfo->pDynamicState->pDynamicStates[i] == 2583 VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT) { 2584 uses_dynamic_stride = true; 2585 } 2586 } 2587 } 2588 2589 if (!key.vs.dynamic_input_state) { 2590 const VkPipelineVertexInputStateCreateInfo *input_state = pCreateInfo->pVertexInputState; 2591 const VkPipelineVertexInputDivisorStateCreateInfoEXT *divisor_state = vk_find_struct_const( 2592 input_state->pNext, PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT); 2593 2594 uint32_t binding_input_rate = 0; 2595 uint32_t instance_rate_divisors[MAX_VERTEX_ATTRIBS]; 2596 for (unsigned i = 0; i < input_state->vertexBindingDescriptionCount; ++i) { 2597 if (input_state->pVertexBindingDescriptions[i].inputRate) { 2598 unsigned binding = input_state->pVertexBindingDescriptions[i].binding; 2599 binding_input_rate |= 1u << binding; 2600 instance_rate_divisors[binding] = 1; 2601 } 2602 } 2603 if (divisor_state) { 2604 for (unsigned i = 0; i < divisor_state->vertexBindingDivisorCount; ++i) { 2605 instance_rate_divisors[divisor_state->pVertexBindingDivisors[i].binding] = 2606 divisor_state->pVertexBindingDivisors[i].divisor; 2607 } 2608 } 2609 2610 for (unsigned i = 0; i < input_state->vertexAttributeDescriptionCount; ++i) { 2611 const VkVertexInputAttributeDescription *desc = 2612 &input_state->pVertexAttributeDescriptions[i]; 2613 const struct util_format_description *format_desc; 2614 unsigned location = desc->location; 2615 unsigned binding = desc->binding; 2616 unsigned num_format, data_format; 2617 bool post_shuffle; 2618 2619 if (binding_input_rate & (1u << binding)) { 2620 key.vs.instance_rate_inputs |= 1u << location; 2621 key.vs.instance_rate_divisors[location] = instance_rate_divisors[binding]; 2622 } 2623 2624 format_desc = vk_format_description(desc->format); 2625 radv_translate_vertex_format(pipeline->device->physical_device, desc->format, format_desc, 2626 &data_format, &num_format, &post_shuffle, 2627 &key.vs.vertex_alpha_adjust[location]); 2628 2629 key.vs.vertex_attribute_formats[location] = data_format | (num_format << 4); 2630 key.vs.vertex_attribute_bindings[location] = desc->binding; 2631 key.vs.vertex_attribute_offsets[location] = desc->offset; 2632 2633 const struct ac_data_format_info *dfmt_info = ac_get_data_format_info(data_format); 2634 unsigned attrib_align = 2635 dfmt_info->chan_byte_size ? dfmt_info->chan_byte_size : dfmt_info->element_size; 2636 2637 /* If desc->offset is misaligned, then the buffer offset must be too. Just 2638 * skip updating vertex_binding_align in this case. 2639 */ 2640 if (desc->offset % attrib_align == 0) 2641 key.vs.vertex_binding_align[desc->binding] = 2642 MAX2(key.vs.vertex_binding_align[desc->binding], attrib_align); 2643 2644 if (!uses_dynamic_stride) { 2645 /* From the Vulkan spec 1.2.157: 2646 * 2647 * "If the bound pipeline state object was created 2648 * with the 2649 * VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT 2650 * dynamic state enabled then pStrides[i] specifies 2651 * the distance in bytes between two consecutive 2652 * elements within the corresponding buffer. 
In this 2653 * case the VkVertexInputBindingDescription::stride 2654 * state from the pipeline state object is ignored." 2655 * 2656 * Make sure the vertex attribute stride is zero to 2657 * avoid computing a wrong offset if it's initialized 2658 * to something else than zero. 2659 */ 2660 key.vs.vertex_attribute_strides[location] = 2661 radv_get_attrib_stride(input_state, desc->binding); 2662 } 2663 2664 if (post_shuffle) 2665 key.vs.vertex_post_shuffle |= 1 << location; 2666 } 2667 } 2668 2669 const VkPipelineTessellationStateCreateInfo *tess = 2670 radv_pipeline_get_tessellation_state(pCreateInfo); 2671 if (tess) 2672 key.tcs.tess_input_vertices = tess->patchControlPoints; 2673 2674 const VkPipelineMultisampleStateCreateInfo *vkms = 2675 radv_pipeline_get_multisample_state(pCreateInfo); 2676 if (vkms && vkms->rasterizationSamples > 1) { 2677 uint32_t num_samples = vkms->rasterizationSamples; 2678 uint32_t ps_iter_samples = radv_pipeline_get_ps_iter_samples(pCreateInfo); 2679 key.ps.num_samples = num_samples; 2680 key.ps.log2_ps_iter_samples = util_logbase2(ps_iter_samples); 2681 } 2682 2683 key.ps.col_format = blend->spi_shader_col_format; 2684 if (pipeline->device->physical_device->rad_info.chip_class < GFX8) { 2685 key.ps.is_int8 = blend->col_format_is_int8; 2686 key.ps.is_int10 = blend->col_format_is_int10; 2687 } 2688 2689 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) { 2690 key.vs.topology = pCreateInfo->pInputAssemblyState->topology; 2691 2692 const VkPipelineRasterizationStateCreateInfo *raster_info = pCreateInfo->pRasterizationState; 2693 const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *provoking_vtx_info = 2694 vk_find_struct_const(raster_info->pNext, 2695 PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT); 2696 if (provoking_vtx_info && 2697 provoking_vtx_info->provokingVertexMode == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT) { 2698 key.vs.provoking_vtx_last = true; 2699 } 2700 } 2701 2702 if (pipeline->device->instance->debug_flags & RADV_DEBUG_DISCARD_TO_DEMOTE) 2703 key.ps.lower_discard_to_demote = true; 2704 2705 if (pipeline->device->instance->enable_mrt_output_nan_fixup) 2706 key.ps.enable_mrt_output_nan_fixup = true; 2707 2708 key.ps.force_vrs = pipeline->device->force_vrs; 2709 2710 if (pipeline->device->instance->debug_flags & RADV_DEBUG_INVARIANT_GEOM) 2711 key.invariant_geom = true; 2712 2713 key.use_ngg = pipeline->device->physical_device->use_ngg; 2714 2715 return key; 2716} 2717 2718static uint8_t 2719radv_get_wave_size(struct radv_device *device, const VkPipelineShaderStageCreateInfo *pStage, 2720 gl_shader_stage stage, const struct radv_shader_info *info) 2721{ 2722 if (stage == MESA_SHADER_GEOMETRY && !info->is_ngg) 2723 return 64; 2724 else if (stage == MESA_SHADER_COMPUTE) { 2725 return info->cs.subgroup_size; 2726 } else if (stage == MESA_SHADER_FRAGMENT) 2727 return device->physical_device->ps_wave_size; 2728 else 2729 return device->physical_device->ge_wave_size; 2730} 2731 2732static uint8_t 2733radv_get_ballot_bit_size(struct radv_device *device, const VkPipelineShaderStageCreateInfo *pStage, 2734 gl_shader_stage stage, const struct radv_shader_info *info) 2735{ 2736 if (stage == MESA_SHADER_COMPUTE && info->cs.subgroup_size) 2737 return info->cs.subgroup_size; 2738 return 64; 2739} 2740 2741static void 2742radv_determine_ngg_settings(struct radv_pipeline *pipeline, 2743 const struct radv_pipeline_key *pipeline_key, 2744 struct radv_shader_info *infos, nir_shader **nir) 2745{ 2746 struct radv_device 
*device = pipeline->device; 2747 2748 if (!nir[MESA_SHADER_GEOMETRY] && pipeline->graphics.last_vgt_api_stage != MESA_SHADER_NONE) { 2749 uint64_t ps_inputs_read = 2750 nir[MESA_SHADER_FRAGMENT] ? nir[MESA_SHADER_FRAGMENT]->info.inputs_read : 0; 2751 gl_shader_stage es_stage = pipeline->graphics.last_vgt_api_stage; 2752 2753 unsigned num_vertices_per_prim = si_conv_prim_to_gs_out(pipeline_key->vs.topology) + 1; 2754 if (es_stage == MESA_SHADER_TESS_EVAL) 2755 num_vertices_per_prim = nir[es_stage]->info.tess.point_mode ? 1 2756 : nir[es_stage]->info.tess.primitive_mode == GL_ISOLINES ? 2 2757 : 3; 2758 2759 infos[es_stage].has_ngg_culling = radv_consider_culling( 2760 device, nir[es_stage], ps_inputs_read, num_vertices_per_prim, &infos[es_stage]); 2761 2762 nir_function_impl *impl = nir_shader_get_entrypoint(nir[es_stage]); 2763 infos[es_stage].has_ngg_early_prim_export = exec_list_is_singular(&impl->body); 2764 2765 /* Invocations that process an input vertex */ 2766 const struct gfx10_ngg_info *ngg_info = &infos[es_stage].ngg_info; 2767 unsigned max_vtx_in = MIN2(256, ngg_info->enable_vertex_grouping ? ngg_info->hw_max_esverts : num_vertices_per_prim * ngg_info->max_gsprims); 2768 2769 unsigned lds_bytes_if_culling_off = 0; 2770 /* We need LDS space when VS needs to export the primitive ID. */ 2771 if (es_stage == MESA_SHADER_VERTEX && infos[es_stage].vs.outinfo.export_prim_id) 2772 lds_bytes_if_culling_off = max_vtx_in * 4u; 2773 infos[es_stage].num_lds_blocks_when_not_culling = 2774 DIV_ROUND_UP(lds_bytes_if_culling_off, 2775 device->physical_device->rad_info.lds_encode_granularity); 2776 2777 /* NGG passthrough mode should be disabled when culling and when the vertex shader exports the 2778 * primitive ID. 2779 */ 2780 infos[es_stage].is_ngg_passthrough = infos[es_stage].is_ngg_passthrough && 2781 !infos[es_stage].has_ngg_culling && 2782 !(es_stage == MESA_SHADER_VERTEX && 2783 infos[es_stage].vs.outinfo.export_prim_id); 2784 } 2785} 2786 2787static void 2788radv_fill_shader_info(struct radv_pipeline *pipeline, 2789 struct radv_pipeline_layout *pipeline_layout, 2790 const VkPipelineShaderStageCreateInfo **pStages, 2791 const struct radv_pipeline_key *pipeline_key, 2792 struct radv_shader_info *infos, nir_shader **nir) 2793{ 2794 struct radv_device *device = pipeline->device; 2795 unsigned active_stages = 0; 2796 unsigned filled_stages = 0; 2797 2798 for (int i = 0; i < MESA_SHADER_STAGES; i++) { 2799 if (nir[i]) 2800 active_stages |= (1 << i); 2801 } 2802 2803 if (nir[MESA_SHADER_TESS_CTRL]) { 2804 infos[MESA_SHADER_VERTEX].vs.as_ls = true; 2805 } 2806 2807 if (nir[MESA_SHADER_GEOMETRY]) { 2808 if (nir[MESA_SHADER_TESS_CTRL]) 2809 infos[MESA_SHADER_TESS_EVAL].tes.as_es = true; 2810 else 2811 infos[MESA_SHADER_VERTEX].vs.as_es = true; 2812 } 2813 2814 if (pipeline_key->use_ngg) { 2815 if (nir[MESA_SHADER_TESS_CTRL]) { 2816 infos[MESA_SHADER_TESS_EVAL].is_ngg = true; 2817 } else { 2818 infos[MESA_SHADER_VERTEX].is_ngg = true; 2819 } 2820 2821 if (nir[MESA_SHADER_TESS_CTRL] && nir[MESA_SHADER_GEOMETRY] && 2822 nir[MESA_SHADER_GEOMETRY]->info.gs.invocations * 2823 nir[MESA_SHADER_GEOMETRY]->info.gs.vertices_out > 2824 256) { 2825 /* Fallback to the legacy path if tessellation is 2826 * enabled with extreme geometry because 2827 * EN_MAX_VERT_OUT_PER_GS_INSTANCE doesn't work and it 2828 * might hang. 
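* (The 256 threshold matches the per-subgroup output limit handled by the multi-cycling fallback in gfx10_get_ngg_info, which cannot be combined with tessellation.)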
2829 */ 2830 infos[MESA_SHADER_TESS_EVAL].is_ngg = false; 2831 } 2832 2833 gl_shader_stage last_xfb_stage = MESA_SHADER_VERTEX; 2834 2835 for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) { 2836 if (nir[i]) 2837 last_xfb_stage = i; 2838 } 2839 2840 bool uses_xfb = nir[last_xfb_stage] && radv_nir_stage_uses_xfb(nir[last_xfb_stage]); 2841 2842 if (!device->physical_device->use_ngg_streamout && uses_xfb) { 2843 if (nir[MESA_SHADER_TESS_CTRL]) 2844 infos[MESA_SHADER_TESS_EVAL].is_ngg = false; 2845 else 2846 infos[MESA_SHADER_VERTEX].is_ngg = false; 2847 } 2848 2849 /* Determine if the pipeline is eligible for the NGG passthrough 2850 * mode. It can't be enabled for geometry shaders, for NGG 2851 * streamout or for vertex shaders that export the primitive ID 2852 * (this is checked later because we don't have the info here.) 2853 */ 2854 if (!nir[MESA_SHADER_GEOMETRY] && !uses_xfb) { 2855 if (nir[MESA_SHADER_TESS_CTRL] && infos[MESA_SHADER_TESS_EVAL].is_ngg) { 2856 infos[MESA_SHADER_TESS_EVAL].is_ngg_passthrough = true; 2857 } else if (nir[MESA_SHADER_VERTEX] && infos[MESA_SHADER_VERTEX].is_ngg) { 2858 infos[MESA_SHADER_VERTEX].is_ngg_passthrough = true; 2859 } 2860 } 2861 } 2862 2863 if (nir[MESA_SHADER_FRAGMENT]) { 2864 radv_nir_shader_info_init(&infos[MESA_SHADER_FRAGMENT]); 2865 radv_nir_shader_info_pass(pipeline->device, nir[MESA_SHADER_FRAGMENT], pipeline_layout, 2866 pipeline_key, &infos[MESA_SHADER_FRAGMENT]); 2867 2868 assert(pipeline->graphics.last_vgt_api_stage != MESA_SHADER_NONE); 2869 if (infos[MESA_SHADER_FRAGMENT].ps.prim_id_input) { 2870 if (pipeline->graphics.last_vgt_api_stage == MESA_SHADER_VERTEX) { 2871 infos[MESA_SHADER_VERTEX].vs.outinfo.export_prim_id = true; 2872 } else if (pipeline->graphics.last_vgt_api_stage == MESA_SHADER_TESS_EVAL) { 2873 infos[MESA_SHADER_TESS_EVAL].tes.outinfo.export_prim_id = true; 2874 } else { 2875 assert(pipeline->graphics.last_vgt_api_stage == MESA_SHADER_GEOMETRY); 2876 } 2877 } 2878 2879 if (!!infos[MESA_SHADER_FRAGMENT].ps.num_input_clips_culls) { 2880 if (pipeline->graphics.last_vgt_api_stage == MESA_SHADER_VERTEX) { 2881 infos[MESA_SHADER_VERTEX].vs.outinfo.export_clip_dists = true; 2882 } else if (pipeline->graphics.last_vgt_api_stage == MESA_SHADER_TESS_EVAL) { 2883 infos[MESA_SHADER_TESS_EVAL].tes.outinfo.export_clip_dists = true; 2884 } else { 2885 assert(pipeline->graphics.last_vgt_api_stage == MESA_SHADER_GEOMETRY); 2886 infos[MESA_SHADER_GEOMETRY].vs.outinfo.export_clip_dists = true; 2887 } 2888 } 2889 2890 filled_stages |= (1 << MESA_SHADER_FRAGMENT); 2891 } 2892 2893 if (pipeline->device->physical_device->rad_info.chip_class >= GFX9 && 2894 nir[MESA_SHADER_TESS_CTRL]) { 2895 struct nir_shader *combined_nir[] = {nir[MESA_SHADER_VERTEX], nir[MESA_SHADER_TESS_CTRL]}; 2896 2897 radv_nir_shader_info_init(&infos[MESA_SHADER_TESS_CTRL]); 2898 2899 /* Copy data to merged stage. */ 2900 infos[MESA_SHADER_TESS_CTRL].vs.as_ls = true; 2901 2902 for (int i = 0; i < 2; i++) { 2903 radv_nir_shader_info_pass(pipeline->device, combined_nir[i], pipeline_layout, pipeline_key, 2904 &infos[MESA_SHADER_TESS_CTRL]); 2905 } 2906 2907 filled_stages |= (1 << MESA_SHADER_VERTEX); 2908 filled_stages |= (1 << MESA_SHADER_TESS_CTRL); 2909 } 2910 2911 if (pipeline->device->physical_device->rad_info.chip_class >= GFX9 && 2912 nir[MESA_SHADER_GEOMETRY]) { 2913 gl_shader_stage pre_stage = 2914 nir[MESA_SHADER_TESS_EVAL] ? 
MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX; 2915 struct nir_shader *combined_nir[] = {nir[pre_stage], nir[MESA_SHADER_GEOMETRY]}; 2916 2917 radv_nir_shader_info_init(&infos[MESA_SHADER_GEOMETRY]); 2918 2919 /* Copy data to merged stage. */ 2920 if (pre_stage == MESA_SHADER_VERTEX) { 2921 infos[MESA_SHADER_GEOMETRY].vs.as_es = infos[MESA_SHADER_VERTEX].vs.as_es; 2922 } else { 2923 infos[MESA_SHADER_GEOMETRY].tes.as_es = infos[MESA_SHADER_TESS_EVAL].tes.as_es; 2924 } 2925 infos[MESA_SHADER_GEOMETRY].is_ngg = infos[pre_stage].is_ngg; 2926 infos[MESA_SHADER_GEOMETRY].gs.es_type = pre_stage; 2927 2928 for (int i = 0; i < 2; i++) { 2929 radv_nir_shader_info_pass(pipeline->device, combined_nir[i], pipeline_layout, pipeline_key, 2930 &infos[MESA_SHADER_GEOMETRY]); 2931 } 2932 2933 filled_stages |= (1 << pre_stage); 2934 filled_stages |= (1 << MESA_SHADER_GEOMETRY); 2935 } 2936 2937 active_stages ^= filled_stages; 2938 while (active_stages) { 2939 int i = u_bit_scan(&active_stages); 2940 radv_nir_shader_info_init(&infos[i]); 2941 radv_nir_shader_info_pass(pipeline->device, nir[i], pipeline_layout, pipeline_key, &infos[i]); 2942 } 2943 2944 if (nir[MESA_SHADER_COMPUTE]) { 2945 unsigned subgroup_size = pipeline_key->cs.compute_subgroup_size; 2946 unsigned req_subgroup_size = subgroup_size; 2947 bool require_full_subgroups = pipeline_key->cs.require_full_subgroups; 2948 2949 if (!subgroup_size) 2950 subgroup_size = device->physical_device->cs_wave_size; 2951 2952 unsigned local_size = nir[MESA_SHADER_COMPUTE]->info.workgroup_size[0] * 2953 nir[MESA_SHADER_COMPUTE]->info.workgroup_size[1] * 2954 nir[MESA_SHADER_COMPUTE]->info.workgroup_size[2]; 2955 2956 /* Games don't always request full subgroups when they should, 2957 * which can cause bugs if cswave32 is enabled. 2958 */ 2959 if (device->physical_device->cs_wave_size == 32 && 2960 nir[MESA_SHADER_COMPUTE]->info.cs.uses_wide_subgroup_intrinsics && !req_subgroup_size && 2961 local_size % RADV_SUBGROUP_SIZE == 0) 2962 require_full_subgroups = true; 2963 2964 if (require_full_subgroups && !req_subgroup_size) { 2965 /* don't use wave32 pretending to be wave64 */ 2966 subgroup_size = RADV_SUBGROUP_SIZE; 2967 } 2968 2969 infos[MESA_SHADER_COMPUTE].cs.subgroup_size = subgroup_size; 2970 } 2971 2972 for (int i = 0; i < MESA_SHADER_STAGES; i++) { 2973 if (nir[i]) { 2974 infos[i].wave_size = radv_get_wave_size(pipeline->device, pStages[i], i, &infos[i]); 2975 infos[i].ballot_bit_size = 2976 radv_get_ballot_bit_size(pipeline->device, pStages[i], i, &infos[i]); 2977 } 2978 } 2979 2980 /* PS always operates without workgroups. */ 2981 if (nir[MESA_SHADER_FRAGMENT]) 2982 infos[MESA_SHADER_FRAGMENT].workgroup_size = infos[MESA_SHADER_FRAGMENT].wave_size; 2983 2984 if (nir[MESA_SHADER_COMPUTE]) { 2985 /* Variable workgroup size is not supported by Vulkan. */ 2986 assert(!nir[MESA_SHADER_COMPUTE]->info.workgroup_size_variable); 2987 2988 infos[MESA_SHADER_COMPUTE].workgroup_size = 2989 ac_compute_cs_workgroup_size( 2990 nir[MESA_SHADER_COMPUTE]->info.workgroup_size, false, UINT32_MAX); 2991 } 2992} 2993 2994static void 2995merge_tess_info(struct shader_info *tes_info, struct shader_info *tcs_info) 2996{ 2997 /* The Vulkan 1.0.38 spec, section 21.1 Tessellator says: 2998 * 2999 * "PointMode. Controls generation of points rather than triangles 3000 * or lines. This functionality defaults to disabled, and is 3001 * enabled if either shader stage includes the execution mode. 
3002 * 3003 * and about Triangles, Quads, IsoLines, VertexOrderCw, VertexOrderCcw, 3004 * PointMode, SpacingEqual, SpacingFractionalEven, SpacingFractionalOdd, 3005 * and OutputVertices, it says: 3006 * 3007 * "One mode must be set in at least one of the tessellation 3008 * shader stages." 3009 * 3010 * So, the fields can be set in either the TCS or TES, but they must 3011 * agree if set in both. Our backend looks at TES, so bitwise-or in 3012 * the values from the TCS. 3013 */ 3014 assert(tcs_info->tess.tcs_vertices_out == 0 || tes_info->tess.tcs_vertices_out == 0 || 3015 tcs_info->tess.tcs_vertices_out == tes_info->tess.tcs_vertices_out); 3016 tes_info->tess.tcs_vertices_out |= tcs_info->tess.tcs_vertices_out; 3017 3018 assert(tcs_info->tess.spacing == TESS_SPACING_UNSPECIFIED || 3019 tes_info->tess.spacing == TESS_SPACING_UNSPECIFIED || 3020 tcs_info->tess.spacing == tes_info->tess.spacing); 3021 tes_info->tess.spacing |= tcs_info->tess.spacing; 3022 3023 assert(tcs_info->tess.primitive_mode == 0 || tes_info->tess.primitive_mode == 0 || 3024 tcs_info->tess.primitive_mode == tes_info->tess.primitive_mode); 3025 tes_info->tess.primitive_mode |= tcs_info->tess.primitive_mode; 3026 tes_info->tess.ccw |= tcs_info->tess.ccw; 3027 tes_info->tess.point_mode |= tcs_info->tess.point_mode; 3028 3029 /* Copy the merged info back to the TCS */ 3030 tcs_info->tess.tcs_vertices_out = tes_info->tess.tcs_vertices_out; 3031 tcs_info->tess.spacing = tes_info->tess.spacing; 3032 tcs_info->tess.primitive_mode = tes_info->tess.primitive_mode; 3033 tcs_info->tess.ccw = tes_info->tess.ccw; 3034 tcs_info->tess.point_mode = tes_info->tess.point_mode; 3035} 3036 3037static void 3038gather_tess_info(struct radv_device *device, nir_shader **nir, struct radv_shader_info *infos, 3039 const struct radv_pipeline_key *pipeline_key) 3040{ 3041 merge_tess_info(&nir[MESA_SHADER_TESS_EVAL]->info, &nir[MESA_SHADER_TESS_CTRL]->info); 3042 3043 unsigned tess_in_patch_size = pipeline_key->tcs.tess_input_vertices; 3044 unsigned tess_out_patch_size = nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out; 3045 3046 /* Number of tessellation patches per workgroup processed by the current pipeline. */ 3047 unsigned num_patches = get_tcs_num_patches( 3048 tess_in_patch_size, tess_out_patch_size, 3049 infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_inputs, 3050 infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_outputs, 3051 infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_patch_outputs, device->tess_offchip_block_dw_size, 3052 device->physical_device->rad_info.chip_class, device->physical_device->rad_info.family); 3053 3054 /* LDS size used by VS+TCS for storing TCS inputs and outputs. 
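* (The size computed below is stored in num_lds_blocks and eventually programmed into the SPI_SHADER_PGM_RSRC2_LS LDS_SIZE field, see radv_pipeline_generate_hw_ls further down.)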
*/ 3055 unsigned tcs_lds_size = calculate_tess_lds_size( 3056 device->physical_device->rad_info.chip_class, tess_in_patch_size, tess_out_patch_size, 3057 infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_inputs, num_patches, 3058 infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_outputs, 3059 infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_patch_outputs); 3060 3061 infos[MESA_SHADER_TESS_CTRL].num_tess_patches = num_patches; 3062 infos[MESA_SHADER_TESS_CTRL].tcs.num_lds_blocks = tcs_lds_size; 3063 infos[MESA_SHADER_TESS_CTRL].tcs.tes_reads_tess_factors = 3064 !!(nir[MESA_SHADER_TESS_EVAL]->info.inputs_read & 3065 (VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER)); 3066 infos[MESA_SHADER_TESS_CTRL].tcs.tes_inputs_read = nir[MESA_SHADER_TESS_EVAL]->info.inputs_read; 3067 infos[MESA_SHADER_TESS_CTRL].tcs.tes_patch_inputs_read = 3068 nir[MESA_SHADER_TESS_EVAL]->info.patch_inputs_read; 3069 3070 infos[MESA_SHADER_TESS_EVAL].num_tess_patches = num_patches; 3071 infos[MESA_SHADER_GEOMETRY].num_tess_patches = num_patches; 3072 infos[MESA_SHADER_VERTEX].num_tess_patches = num_patches; 3073 infos[MESA_SHADER_TESS_CTRL].tcs.tcs_vertices_out = tess_out_patch_size; 3074 infos[MESA_SHADER_VERTEX].tcs.tcs_vertices_out = tess_out_patch_size; 3075 3076 if (!radv_use_llvm_for_stage(device, MESA_SHADER_VERTEX)) { 3077 /* When the number of TCS input and output vertices is the same (typically 3): 3078 * - There is an equal number of LS and HS invocations 3079 * - In case of merged LSHS shaders, the LS and HS halves of the shader 3080 * always process the exact same vertex. We can use this knowledge to optimize them. 3081 * 3082 * We don't set tcs_in_out_eq if the float controls differ because that might 3083 * involve different float modes for the same block and our optimizer 3084 * doesn't handle an instruction dominating another with a different mode. 3085 */ 3086 infos[MESA_SHADER_VERTEX].vs.tcs_in_out_eq = 3087 device->physical_device->rad_info.chip_class >= GFX9 && 3088 tess_in_patch_size == tess_out_patch_size && 3089 nir[MESA_SHADER_VERTEX]->info.float_controls_execution_mode == 3090 nir[MESA_SHADER_TESS_CTRL]->info.float_controls_execution_mode; 3091 3092 if (infos[MESA_SHADER_VERTEX].vs.tcs_in_out_eq) 3093 infos[MESA_SHADER_VERTEX].vs.tcs_temp_only_input_mask = 3094 nir[MESA_SHADER_TESS_CTRL]->info.inputs_read & 3095 nir[MESA_SHADER_VERTEX]->info.outputs_written & 3096 ~nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_cross_invocation_inputs_read & 3097 ~nir[MESA_SHADER_TESS_CTRL]->info.inputs_read_indirectly & 3098 ~nir[MESA_SHADER_VERTEX]->info.outputs_accessed_indirectly; 3099 3100 /* Copy data to TCS so it can be accessed by the backend if they are merged. 
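* (On GFX9 and newer the VS and TCS run as a single merged LS+HS hardware stage, which is presumably why the TCS keeps its own copy of these VS-derived fields.)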
*/ 3101 infos[MESA_SHADER_TESS_CTRL].vs.tcs_in_out_eq = infos[MESA_SHADER_VERTEX].vs.tcs_in_out_eq; 3102 infos[MESA_SHADER_TESS_CTRL].vs.tcs_temp_only_input_mask = 3103 infos[MESA_SHADER_VERTEX].vs.tcs_temp_only_input_mask; 3104 } 3105 3106 for (gl_shader_stage s = MESA_SHADER_VERTEX; s <= MESA_SHADER_TESS_CTRL; ++s) 3107 infos[s].workgroup_size = 3108 ac_compute_lshs_workgroup_size( 3109 device->physical_device->rad_info.chip_class, s, 3110 num_patches, tess_in_patch_size, tess_out_patch_size); 3111} 3112 3113static void 3114radv_init_feedback(const VkPipelineCreationFeedbackCreateInfoEXT *ext) 3115{ 3116 if (!ext) 3117 return; 3118 3119 if (ext->pPipelineCreationFeedback) { 3120 ext->pPipelineCreationFeedback->flags = 0; 3121 ext->pPipelineCreationFeedback->duration = 0; 3122 } 3123 3124 for (unsigned i = 0; i < ext->pipelineStageCreationFeedbackCount; ++i) { 3125 ext->pPipelineStageCreationFeedbacks[i].flags = 0; 3126 ext->pPipelineStageCreationFeedbacks[i].duration = 0; 3127 } 3128} 3129 3130static void 3131radv_start_feedback(VkPipelineCreationFeedbackEXT *feedback) 3132{ 3133 if (!feedback) 3134 return; 3135 3136 feedback->duration -= radv_get_current_time(); 3137 feedback->flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT; 3138} 3139 3140static void 3141radv_stop_feedback(VkPipelineCreationFeedbackEXT *feedback, bool cache_hit) 3142{ 3143 if (!feedback) 3144 return; 3145 3146 feedback->duration += radv_get_current_time(); 3147 feedback->flags = 3148 VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT | 3149 (cache_hit ? VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT : 0); 3150} 3151 3152static bool 3153mem_vectorize_callback(unsigned align_mul, unsigned align_offset, unsigned bit_size, 3154 unsigned num_components, nir_intrinsic_instr *low, nir_intrinsic_instr *high, 3155 void *data) 3156{ 3157 if (num_components > 4) 3158 return false; 3159 3160 /* >128 bit loads are split except with SMEM */ 3161 if (bit_size * num_components > 128) 3162 return false; 3163 3164 uint32_t align; 3165 if (align_offset) 3166 align = 1 << (ffs(align_offset) - 1); 3167 else 3168 align = align_mul; 3169 3170 switch (low->intrinsic) { 3171 case nir_intrinsic_load_global: 3172 case nir_intrinsic_store_global: 3173 case nir_intrinsic_store_ssbo: 3174 case nir_intrinsic_load_ssbo: 3175 case nir_intrinsic_load_ubo: 3176 case nir_intrinsic_load_push_constant: { 3177 unsigned max_components; 3178 if (align % 4 == 0) 3179 max_components = NIR_MAX_VEC_COMPONENTS; 3180 else if (align % 2 == 0) 3181 max_components = 16u / bit_size; 3182 else 3183 max_components = 8u / bit_size; 3184 return (align % (bit_size / 8u)) == 0 && num_components <= max_components; 3185 } 3186 case nir_intrinsic_load_deref: 3187 case nir_intrinsic_store_deref: 3188 assert(nir_deref_mode_is(nir_src_as_deref(low->src[0]), nir_var_mem_shared)); 3189 FALLTHROUGH; 3190 case nir_intrinsic_load_shared: 3191 case nir_intrinsic_store_shared: 3192 if (bit_size * num_components == 3193 96) { /* 96 bit loads require 128 bit alignment and are split otherwise */ 3194 return align % 16 == 0; 3195 } else if (bit_size == 16 && (align % 4)) { 3196 /* AMD hardware can't do 2-byte aligned f16vec2 loads, but they are useful for ALU 3197 * vectorization, because our vectorizer requires the scalar IR to already contain vectors. 3198 */ 3199 return (align % 2 == 0) && num_components <= 2; 3200 } else { 3201 if (num_components == 3) { 3202 /* AMD hardware can't do 3-component loads except for 96-bit loads, handled above. 
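* For example, a 3 x 16-bit (48-bit) shared load is rejected here, while the 3 x 32-bit (96-bit) case was already accepted above when 16-byte aligned.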
*/ 3203 return false; 3204 } 3205 unsigned req = bit_size * num_components; 3206 if (req == 64 || req == 128) /* 64-bit and 128-bit loads can use ds_read2_b{32,64} */ 3207 req /= 2u; 3208 return align % (req / 8u) == 0; 3209 } 3210 default: 3211 return false; 3212 } 3213 return false; 3214} 3215 3216static unsigned 3217lower_bit_size_callback(const nir_instr *instr, void *_) 3218{ 3219 struct radv_device *device = _; 3220 enum chip_class chip = device->physical_device->rad_info.chip_class; 3221 3222 if (instr->type != nir_instr_type_alu) 3223 return 0; 3224 nir_alu_instr *alu = nir_instr_as_alu(instr); 3225 3226 if (alu->dest.dest.ssa.bit_size & (8 | 16)) { 3227 unsigned bit_size = alu->dest.dest.ssa.bit_size; 3228 switch (alu->op) { 3229 case nir_op_iabs: 3230 case nir_op_bitfield_select: 3231 case nir_op_imul_high: 3232 case nir_op_umul_high: 3233 case nir_op_ineg: 3234 case nir_op_isign: 3235 return 32; 3236 case nir_op_imax: 3237 case nir_op_umax: 3238 case nir_op_imin: 3239 case nir_op_umin: 3240 case nir_op_ishr: 3241 case nir_op_ushr: 3242 case nir_op_ishl: 3243 case nir_op_uadd_sat: 3244 return (bit_size == 8 || !(chip >= GFX8 && nir_dest_is_divergent(alu->dest.dest))) ? 32 3245 : 0; 3246 case nir_op_iadd_sat: 3247 return bit_size == 8 || !nir_dest_is_divergent(alu->dest.dest) ? 32 : 0; 3248 3249 default: 3250 return 0; 3251 } 3252 } 3253 3254 if (nir_src_bit_size(alu->src[0].src) & (8 | 16)) { 3255 unsigned bit_size = nir_src_bit_size(alu->src[0].src); 3256 switch (alu->op) { 3257 case nir_op_bit_count: 3258 case nir_op_find_lsb: 3259 case nir_op_ufind_msb: 3260 case nir_op_i2b1: 3261 return 32; 3262 case nir_op_ilt: 3263 case nir_op_ige: 3264 case nir_op_ieq: 3265 case nir_op_ine: 3266 case nir_op_ult: 3267 case nir_op_uge: 3268 return (bit_size == 8 || !(chip >= GFX8 && nir_dest_is_divergent(alu->dest.dest))) ? 32 3269 : 0; 3270 default: 3271 return 0; 3272 } 3273 } 3274 3275 return 0; 3276} 3277 3278static bool 3279opt_vectorize_callback(const nir_instr *instr, void *_) 3280{ 3281 assert(instr->type == nir_instr_type_alu); 3282 nir_alu_instr *alu = nir_instr_as_alu(instr); 3283 unsigned bit_size = alu->dest.dest.ssa.bit_size; 3284 if (bit_size != 16) 3285 return false; 3286 3287 switch (alu->op) { 3288 case nir_op_fadd: 3289 case nir_op_fsub: 3290 case nir_op_fmul: 3291 case nir_op_fneg: 3292 case nir_op_fsat: 3293 case nir_op_fmin: 3294 case nir_op_fmax: 3295 case nir_op_iadd: 3296 case nir_op_isub: 3297 case nir_op_imul: 3298 case nir_op_imin: 3299 case nir_op_imax: 3300 case nir_op_umin: 3301 case nir_op_umax: 3302 return true; 3303 case nir_op_ishl: /* TODO: in NIR, these have 32bit shift operands */ 3304 case nir_op_ishr: /* while Radeon needs 16bit operands when vectorized */ 3305 case nir_op_ushr: 3306 default: 3307 return false; 3308 } 3309} 3310 3311static nir_component_mask_t 3312non_uniform_access_callback(const nir_src *src, void *_) 3313{ 3314 if (src->ssa->num_components == 1) 3315 return 0x1; 3316 return nir_chase_binding(*src).success ? 
0x2 : 0x3; 3317} 3318 3319VkResult 3320radv_create_shaders(struct radv_pipeline *pipeline, struct radv_pipeline_layout *pipeline_layout, 3321 struct radv_device *device, struct radv_pipeline_cache *cache, 3322 const struct radv_pipeline_key *pipeline_key, 3323 const VkPipelineShaderStageCreateInfo **pStages, 3324 const VkPipelineCreateFlags flags, const uint8_t *custom_hash, 3325 VkPipelineCreationFeedbackEXT *pipeline_feedback, 3326 VkPipelineCreationFeedbackEXT **stage_feedbacks) 3327{ 3328 struct vk_shader_module fs_m = {0}; 3329 struct vk_shader_module *modules[MESA_SHADER_STAGES] = { 3330 0, 3331 }; 3332 nir_shader *nir[MESA_SHADER_STAGES] = {0}; 3333 struct radv_shader_binary *binaries[MESA_SHADER_STAGES] = {NULL}; 3334 struct radv_shader_info infos[MESA_SHADER_STAGES] = {0}; 3335 unsigned char hash[20], gs_copy_hash[20]; 3336 bool keep_executable_info = 3337 (flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR) || 3338 device->keep_shader_info; 3339 bool keep_statistic_info = (flags & VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR) || 3340 (device->instance->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS) || 3341 device->keep_shader_info; 3342 struct radv_pipeline_shader_stack_size **stack_sizes = 3343 pipeline->type == RADV_PIPELINE_COMPUTE ? &pipeline->compute.rt_stack_sizes : NULL; 3344 uint32_t *num_stack_sizes = stack_sizes ? &pipeline->compute.group_count : NULL; 3345 3346 radv_start_feedback(pipeline_feedback); 3347 3348 for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i) { 3349 if (pStages[i]) { 3350 modules[i] = vk_shader_module_from_handle(pStages[i]->module); 3351 if (modules[i]->nir) 3352 _mesa_sha1_compute(modules[i]->nir->info.name, strlen(modules[i]->nir->info.name), 3353 modules[i]->sha1); 3354 3355 pipeline->active_stages |= mesa_to_vk_shader_stage(i); 3356 if (i < MESA_SHADER_FRAGMENT) 3357 pipeline->graphics.last_vgt_api_stage = i; 3358 } 3359 } 3360 3361 if (custom_hash) 3362 memcpy(hash, custom_hash, 20); 3363 else { 3364 radv_hash_shaders(hash, pStages, pipeline_layout, pipeline_key, 3365 radv_get_hash_flags(device, keep_statistic_info)); 3366 } 3367 memcpy(gs_copy_hash, hash, 20); 3368 gs_copy_hash[0] ^= 1; 3369 3370 pipeline->pipeline_hash = *(uint64_t *)hash; 3371 3372 bool found_in_application_cache = true; 3373 if (modules[MESA_SHADER_GEOMETRY] && !keep_executable_info) { 3374 struct radv_shader_variant *variants[MESA_SHADER_STAGES] = {0}; 3375 radv_create_shader_variants_from_pipeline_cache(device, cache, gs_copy_hash, variants, NULL, 3376 NULL, &found_in_application_cache); 3377 pipeline->gs_copy_shader = variants[MESA_SHADER_GEOMETRY]; 3378 } 3379 3380 if (!keep_executable_info && 3381 radv_create_shader_variants_from_pipeline_cache(device, cache, hash, pipeline->shaders, 3382 stack_sizes, num_stack_sizes, 3383 &found_in_application_cache) && 3384 (!modules[MESA_SHADER_GEOMETRY] || pipeline->gs_copy_shader || 3385 pipeline->shaders[MESA_SHADER_GEOMETRY]->info.is_ngg)) { 3386 radv_stop_feedback(pipeline_feedback, found_in_application_cache); 3387 return VK_SUCCESS; 3388 } 3389 3390 if (flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT) { 3391 radv_stop_feedback(pipeline_feedback, found_in_application_cache); 3392 return VK_PIPELINE_COMPILE_REQUIRED_EXT; 3393 } 3394 3395 if (!modules[MESA_SHADER_FRAGMENT] && !modules[MESA_SHADER_COMPUTE]) { 3396 nir_builder fs_b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL, "noop_fs"); 3397 fs_m = vk_shader_module_from_nir(fs_b.shader); 3398 modules[MESA_SHADER_FRAGMENT] = &fs_m; 
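/* A graphics pipeline may legitimately have no fragment shader (e.g. a depth-only or rasterizer-discard pipeline), so a trivial "noop_fs" is substituted here, presumably to keep the rest of the compile path uniform. Roughly speaking, the shader built by nir_builder_init_simple_shader() above is equivalent to compiling an empty fragment shader, i.e. a lone "void main() {}" with no inputs, outputs or side effects. */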
3399 } 3400 3401 for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i) { 3402 const VkPipelineShaderStageCreateInfo *stage = pStages[i]; 3403 3404 if (!modules[i]) 3405 continue; 3406 3407 radv_start_feedback(stage_feedbacks[i]); 3408 3409 nir[i] = radv_shader_compile_to_nir(device, modules[i], stage ? stage->pName : "main", i, 3410 stage ? stage->pSpecializationInfo : NULL, 3411 pipeline_layout, pipeline_key); 3412 3413 /* We don't want to alter meta shaders IR directly so clone it 3414 * first. 3415 */ 3416 if (nir[i]->info.name) { 3417 nir[i] = nir_shader_clone(NULL, nir[i]); 3418 } 3419 3420 radv_stop_feedback(stage_feedbacks[i], false); 3421 } 3422 3423 bool optimize_conservatively = pipeline_key->optimisations_disabled; 3424 3425 radv_link_shaders(pipeline, pipeline_key, nir, optimize_conservatively); 3426 radv_set_driver_locations(pipeline, nir, infos); 3427 3428 for (int i = 0; i < MESA_SHADER_STAGES; ++i) { 3429 if (nir[i]) { 3430 radv_start_feedback(stage_feedbacks[i]); 3431 radv_optimize_nir(device, nir[i], optimize_conservatively, false); 3432 3433 /* Gather info again, information such as outputs_read can be out-of-date. */ 3434 nir_shader_gather_info(nir[i], nir_shader_get_entrypoint(nir[i])); 3435 radv_lower_io(device, nir[i]); 3436 3437 radv_stop_feedback(stage_feedbacks[i], false); 3438 } 3439 } 3440 3441 if (nir[MESA_SHADER_TESS_CTRL]) { 3442 nir_lower_patch_vertices(nir[MESA_SHADER_TESS_EVAL], 3443 nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out, NULL); 3444 gather_tess_info(device, nir, infos, pipeline_key); 3445 } 3446 3447 radv_fill_shader_info(pipeline, pipeline_layout, pStages, pipeline_key, infos, nir); 3448 3449 bool pipeline_has_ngg = (nir[MESA_SHADER_VERTEX] && infos[MESA_SHADER_VERTEX].is_ngg) || 3450 (nir[MESA_SHADER_TESS_EVAL] && infos[MESA_SHADER_TESS_EVAL].is_ngg); 3451 3452 if (pipeline_has_ngg) { 3453 struct gfx10_ngg_info *ngg_info; 3454 3455 if (nir[MESA_SHADER_GEOMETRY]) 3456 ngg_info = &infos[MESA_SHADER_GEOMETRY].ngg_info; 3457 else if (nir[MESA_SHADER_TESS_CTRL]) 3458 ngg_info = &infos[MESA_SHADER_TESS_EVAL].ngg_info; 3459 else 3460 ngg_info = &infos[MESA_SHADER_VERTEX].ngg_info; 3461 3462 gfx10_get_ngg_info(pipeline_key, pipeline, nir, infos, ngg_info); 3463 } else if (nir[MESA_SHADER_GEOMETRY]) { 3464 struct gfx9_gs_info *gs_info = &infos[MESA_SHADER_GEOMETRY].gs_ring_info; 3465 3466 gfx9_get_gs_info(pipeline_key, pipeline, nir, infos, gs_info); 3467 } else { 3468 gl_shader_stage hw_vs_api_stage = 3469 nir[MESA_SHADER_TESS_EVAL] ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX; 3470 infos[hw_vs_api_stage].workgroup_size = infos[hw_vs_api_stage].wave_size; 3471 } 3472 3473 radv_determine_ngg_settings(pipeline, pipeline_key, infos, nir); 3474 3475 for (int i = 0; i < MESA_SHADER_STAGES; ++i) { 3476 if (nir[i]) { 3477 radv_start_feedback(stage_feedbacks[i]); 3478 3479 /* Wave and workgroup size should already be filled. 
*/ 3480 assert(infos[i].wave_size && infos[i].workgroup_size); 3481 3482 if (!radv_use_llvm_for_stage(device, i)) { 3483 nir_lower_non_uniform_access_options options = { 3484 .types = nir_lower_non_uniform_ubo_access | nir_lower_non_uniform_ssbo_access | 3485 nir_lower_non_uniform_texture_access | nir_lower_non_uniform_image_access, 3486 .callback = &non_uniform_access_callback, 3487 .callback_data = NULL, 3488 }; 3489 NIR_PASS_V(nir[i], nir_lower_non_uniform_access, &options); 3490 } 3491 NIR_PASS_V(nir[i], nir_lower_memory_model); 3492 3493 bool lower_to_scalar = false; 3494 3495 nir_load_store_vectorize_options vectorize_opts = { 3496 .modes = nir_var_mem_ssbo | nir_var_mem_ubo | nir_var_mem_push_const | 3497 nir_var_mem_shared | nir_var_mem_global, 3498 .callback = mem_vectorize_callback, 3499 .robust_modes = 0, 3500 }; 3501 3502 if (device->robust_buffer_access2) { 3503 vectorize_opts.robust_modes = 3504 nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_global | nir_var_mem_push_const; 3505 } 3506 3507 if (nir_opt_load_store_vectorize(nir[i], &vectorize_opts)) { 3508 NIR_PASS_V(nir[i], nir_copy_prop); 3509 lower_to_scalar = true; 3510 3511 /* Gather info again, to update whether 8/16-bit are used. */ 3512 nir_shader_gather_info(nir[i], nir_shader_get_entrypoint(nir[i])); 3513 } 3514 3515 lower_to_scalar |= 3516 nir_opt_shrink_vectors(nir[i], !device->instance->disable_shrink_image_store); 3517 3518 if (lower_to_scalar) 3519 nir_lower_alu_to_scalar(nir[i], NULL, NULL); 3520 3521 /* lower ALU operations */ 3522 nir_lower_int64(nir[i]); 3523 3524 nir_opt_idiv_const(nir[i], 8); 3525 3526 nir_lower_idiv(nir[i], 3527 &(nir_lower_idiv_options){ 3528 .imprecise_32bit_lowering = false, 3529 .allow_fp16 = device->physical_device->rad_info.chip_class >= GFX9, 3530 }); 3531 3532 nir_opt_sink(nir[i], nir_move_load_input | nir_move_const_undef | nir_move_copies); 3533 nir_opt_move(nir[i], nir_move_load_input | nir_move_const_undef | nir_move_copies); 3534 3535 /* Lower I/O intrinsics to memory instructions. 
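* (Roughly: outputs of a stage that feeds a merged or geometry stage become LDS/ring stores, and the consuming stage's inputs become the matching loads.)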
*/ 3536 bool io_to_mem = radv_lower_io_to_mem(device, nir[i], &infos[i], pipeline_key); 3537 bool lowered_ngg = pipeline_has_ngg && i == pipeline->graphics.last_vgt_api_stage && 3538 !radv_use_llvm_for_stage(device, i); 3539 if (lowered_ngg) 3540 radv_lower_ngg(device, nir[i], &infos[i], pipeline_key); 3541 3542 radv_optimize_nir_algebraic(nir[i], io_to_mem || lowered_ngg || i == MESA_SHADER_COMPUTE); 3543 3544 if (nir[i]->info.bit_sizes_int & (8 | 16)) { 3545 if (device->physical_device->rad_info.chip_class >= GFX8) { 3546 nir_convert_to_lcssa(nir[i], true, true); 3547 nir_divergence_analysis(nir[i]); 3548 } 3549 3550 if (nir_lower_bit_size(nir[i], lower_bit_size_callback, device)) { 3551 NIR_PASS_V(nir[i], nir_opt_constant_folding); 3552 NIR_PASS_V(nir[i], nir_opt_dce); 3553 } 3554 3555 if (device->physical_device->rad_info.chip_class >= GFX8) 3556 nir_opt_remove_phis(nir[i]); /* cleanup LCSSA phis */ 3557 } 3558 if (((nir[i]->info.bit_sizes_int | nir[i]->info.bit_sizes_float) & 16) && 3559 device->physical_device->rad_info.chip_class >= GFX9) 3560 NIR_PASS_V(nir[i], nir_opt_vectorize, opt_vectorize_callback, NULL); 3561 3562 /* cleanup passes */ 3563 nir_lower_load_const_to_scalar(nir[i]); 3564 nir_move_options move_opts = nir_move_const_undef | nir_move_load_ubo | 3565 nir_move_load_input | nir_move_comparisons | nir_move_copies; 3566 nir_opt_sink(nir[i], move_opts | nir_move_load_ssbo); 3567 nir_opt_move(nir[i], move_opts); 3568 3569 radv_stop_feedback(stage_feedbacks[i], false); 3570 } 3571 } 3572 3573 for (int i = 0; i < MESA_SHADER_STAGES; ++i) { 3574 if (radv_can_dump_shader(device, modules[i], false)) 3575 nir_print_shader(nir[i], stderr); 3576 } 3577 3578 if (modules[MESA_SHADER_GEOMETRY]) { 3579 struct radv_shader_binary *gs_copy_binary = NULL; 3580 if (!pipeline_has_ngg) { 3581 struct radv_shader_info info = {0}; 3582 3583 if (infos[MESA_SHADER_GEOMETRY].vs.outinfo.export_clip_dists) 3584 info.vs.outinfo.export_clip_dists = true; 3585 3586 radv_nir_shader_info_pass(device, nir[MESA_SHADER_GEOMETRY], pipeline_layout, pipeline_key, 3587 &info); 3588 info.wave_size = 64; /* Wave32 not supported. 
*/ 3589 info.workgroup_size = 64; /* HW VS: separate waves, no workgroups */ 3590 info.ballot_bit_size = 64; 3591 3592 pipeline->gs_copy_shader = radv_create_gs_copy_shader( 3593 device, nir[MESA_SHADER_GEOMETRY], &info, &gs_copy_binary, keep_executable_info, 3594 keep_statistic_info, pipeline_key->has_multiview_view_index, 3595 pipeline_key->optimisations_disabled); 3596 } 3597 3598 if (!keep_executable_info && pipeline->gs_copy_shader) { 3599 struct radv_shader_binary *gs_binaries[MESA_SHADER_STAGES] = {NULL}; 3600 struct radv_shader_variant *gs_variants[MESA_SHADER_STAGES] = {0}; 3601 3602 gs_binaries[MESA_SHADER_GEOMETRY] = gs_copy_binary; 3603 gs_variants[MESA_SHADER_GEOMETRY] = pipeline->gs_copy_shader; 3604 3605 radv_pipeline_cache_insert_shaders(device, cache, gs_copy_hash, gs_variants, gs_binaries, 3606 NULL, 0); 3607 3608 pipeline->gs_copy_shader = gs_variants[MESA_SHADER_GEOMETRY]; 3609 } 3610 free(gs_copy_binary); 3611 } 3612 3613 if (nir[MESA_SHADER_FRAGMENT]) { 3614 if (!pipeline->shaders[MESA_SHADER_FRAGMENT]) { 3615 radv_start_feedback(stage_feedbacks[MESA_SHADER_FRAGMENT]); 3616 3617 pipeline->shaders[MESA_SHADER_FRAGMENT] = radv_shader_variant_compile( 3618 device, modules[MESA_SHADER_FRAGMENT], &nir[MESA_SHADER_FRAGMENT], 1, pipeline_layout, 3619 pipeline_key, infos + MESA_SHADER_FRAGMENT, keep_executable_info, 3620 keep_statistic_info, &binaries[MESA_SHADER_FRAGMENT]); 3621 3622 radv_stop_feedback(stage_feedbacks[MESA_SHADER_FRAGMENT], false); 3623 } 3624 } 3625 3626 if (device->physical_device->rad_info.chip_class >= GFX9 && modules[MESA_SHADER_TESS_CTRL]) { 3627 if (!pipeline->shaders[MESA_SHADER_TESS_CTRL]) { 3628 struct nir_shader *combined_nir[] = {nir[MESA_SHADER_VERTEX], nir[MESA_SHADER_TESS_CTRL]}; 3629 3630 radv_start_feedback(stage_feedbacks[MESA_SHADER_TESS_CTRL]); 3631 3632 pipeline->shaders[MESA_SHADER_TESS_CTRL] = radv_shader_variant_compile( 3633 device, modules[MESA_SHADER_TESS_CTRL], combined_nir, 2, pipeline_layout, pipeline_key, 3634 &infos[MESA_SHADER_TESS_CTRL], keep_executable_info, keep_statistic_info, 3635 &binaries[MESA_SHADER_TESS_CTRL]); 3636 3637 radv_stop_feedback(stage_feedbacks[MESA_SHADER_TESS_CTRL], false); 3638 } 3639 modules[MESA_SHADER_VERTEX] = NULL; 3640 } 3641 3642 if (device->physical_device->rad_info.chip_class >= GFX9 && modules[MESA_SHADER_GEOMETRY]) { 3643 gl_shader_stage pre_stage = 3644 modules[MESA_SHADER_TESS_EVAL] ? 
MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX; 3645 if (!pipeline->shaders[MESA_SHADER_GEOMETRY]) { 3646 struct nir_shader *combined_nir[] = {nir[pre_stage], nir[MESA_SHADER_GEOMETRY]}; 3647 3648 radv_start_feedback(stage_feedbacks[MESA_SHADER_GEOMETRY]); 3649 3650 pipeline->shaders[MESA_SHADER_GEOMETRY] = radv_shader_variant_compile( 3651 device, modules[MESA_SHADER_GEOMETRY], combined_nir, 2, pipeline_layout, pipeline_key, 3652 &infos[MESA_SHADER_GEOMETRY], keep_executable_info, 3653 keep_statistic_info, &binaries[MESA_SHADER_GEOMETRY]); 3654 3655 radv_stop_feedback(stage_feedbacks[MESA_SHADER_GEOMETRY], false); 3656 } 3657 modules[pre_stage] = NULL; 3658 } 3659 3660 for (int i = 0; i < MESA_SHADER_STAGES; ++i) { 3661 if (modules[i] && !pipeline->shaders[i]) { 3662 radv_start_feedback(stage_feedbacks[i]); 3663 3664 pipeline->shaders[i] = radv_shader_variant_compile( 3665 device, modules[i], &nir[i], 1, pipeline_layout, pipeline_key, infos + i, 3666 keep_executable_info, keep_statistic_info, &binaries[i]); 3667 3668 radv_stop_feedback(stage_feedbacks[i], false); 3669 } 3670 } 3671 3672 if (!keep_executable_info) { 3673 radv_pipeline_cache_insert_shaders(device, cache, hash, pipeline->shaders, binaries, 3674 stack_sizes ? *stack_sizes : NULL, 3675 num_stack_sizes ? *num_stack_sizes : 0); 3676 } 3677 3678 for (int i = 0; i < MESA_SHADER_STAGES; ++i) { 3679 free(binaries[i]); 3680 if (nir[i]) { 3681 ralloc_free(nir[i]); 3682 3683 if (radv_can_dump_shader_stats(device, modules[i])) { 3684 radv_dump_shader_stats(device, pipeline, i, stderr); 3685 } 3686 } 3687 } 3688 3689 if (fs_m.nir) 3690 ralloc_free(fs_m.nir); 3691 3692 radv_stop_feedback(pipeline_feedback, false); 3693 return VK_SUCCESS; 3694} 3695 3696static uint32_t 3697radv_pipeline_stage_to_user_data_0(struct radv_pipeline *pipeline, gl_shader_stage stage, 3698 enum chip_class chip_class) 3699{ 3700 bool has_gs = radv_pipeline_has_gs(pipeline); 3701 bool has_tess = radv_pipeline_has_tess(pipeline); 3702 bool has_ngg = radv_pipeline_has_ngg(pipeline); 3703 3704 switch (stage) { 3705 case MESA_SHADER_FRAGMENT: 3706 return R_00B030_SPI_SHADER_USER_DATA_PS_0; 3707 case MESA_SHADER_VERTEX: 3708 if (has_tess) { 3709 if (chip_class >= GFX10) { 3710 return R_00B430_SPI_SHADER_USER_DATA_HS_0; 3711 } else if (chip_class == GFX9) { 3712 return R_00B430_SPI_SHADER_USER_DATA_LS_0; 3713 } else { 3714 return R_00B530_SPI_SHADER_USER_DATA_LS_0; 3715 } 3716 } 3717 3718 if (has_gs) { 3719 if (chip_class >= GFX10) { 3720 return R_00B230_SPI_SHADER_USER_DATA_GS_0; 3721 } else { 3722 return R_00B330_SPI_SHADER_USER_DATA_ES_0; 3723 } 3724 } 3725 3726 if (has_ngg) 3727 return R_00B230_SPI_SHADER_USER_DATA_GS_0; 3728 3729 return R_00B130_SPI_SHADER_USER_DATA_VS_0; 3730 case MESA_SHADER_GEOMETRY: 3731 return chip_class == GFX9 ? R_00B330_SPI_SHADER_USER_DATA_ES_0 3732 : R_00B230_SPI_SHADER_USER_DATA_GS_0; 3733 case MESA_SHADER_COMPUTE: 3734 return R_00B900_COMPUTE_USER_DATA_0; 3735 case MESA_SHADER_TESS_CTRL: 3736 return chip_class == GFX9 ? R_00B430_SPI_SHADER_USER_DATA_LS_0 3737 : R_00B430_SPI_SHADER_USER_DATA_HS_0; 3738 case MESA_SHADER_TESS_EVAL: 3739 if (has_gs) { 3740 return chip_class >= GFX10 ? 
R_00B230_SPI_SHADER_USER_DATA_GS_0 3741 : R_00B330_SPI_SHADER_USER_DATA_ES_0; 3742 } else if (has_ngg) { 3743 return R_00B230_SPI_SHADER_USER_DATA_GS_0; 3744 } else { 3745 return R_00B130_SPI_SHADER_USER_DATA_VS_0; 3746 } 3747 default: 3748 unreachable("unknown shader"); 3749 } 3750} 3751 3752struct radv_bin_size_entry { 3753 unsigned bpp; 3754 VkExtent2D extent; 3755}; 3756 3757static VkExtent2D 3758radv_gfx9_compute_bin_size(const struct radv_pipeline *pipeline, 3759 const VkGraphicsPipelineCreateInfo *pCreateInfo) 3760{ 3761 static const struct radv_bin_size_entry color_size_table[][3][9] = { 3762 { 3763 /* One RB / SE */ 3764 { 3765 /* One shader engine */ 3766 {0, {128, 128}}, 3767 {1, {64, 128}}, 3768 {2, {32, 128}}, 3769 {3, {16, 128}}, 3770 {17, {0, 0}}, 3771 {UINT_MAX, {0, 0}}, 3772 }, 3773 { 3774 /* Two shader engines */ 3775 {0, {128, 128}}, 3776 {2, {64, 128}}, 3777 {3, {32, 128}}, 3778 {5, {16, 128}}, 3779 {17, {0, 0}}, 3780 {UINT_MAX, {0, 0}}, 3781 }, 3782 { 3783 /* Four shader engines */ 3784 {0, {128, 128}}, 3785 {3, {64, 128}}, 3786 {5, {16, 128}}, 3787 {17, {0, 0}}, 3788 {UINT_MAX, {0, 0}}, 3789 }, 3790 }, 3791 { 3792 /* Two RB / SE */ 3793 { 3794 /* One shader engine */ 3795 {0, {128, 128}}, 3796 {2, {64, 128}}, 3797 {3, {32, 128}}, 3798 {5, {16, 128}}, 3799 {33, {0, 0}}, 3800 {UINT_MAX, {0, 0}}, 3801 }, 3802 { 3803 /* Two shader engines */ 3804 {0, {128, 128}}, 3805 {3, {64, 128}}, 3806 {5, {32, 128}}, 3807 {9, {16, 128}}, 3808 {33, {0, 0}}, 3809 {UINT_MAX, {0, 0}}, 3810 }, 3811 { 3812 /* Four shader engines */ 3813 {0, {256, 256}}, 3814 {2, {128, 256}}, 3815 {3, {128, 128}}, 3816 {5, {64, 128}}, 3817 {9, {16, 128}}, 3818 {33, {0, 0}}, 3819 {UINT_MAX, {0, 0}}, 3820 }, 3821 }, 3822 { 3823 /* Four RB / SE */ 3824 { 3825 /* One shader engine */ 3826 {0, {128, 256}}, 3827 {2, {128, 128}}, 3828 {3, {64, 128}}, 3829 {5, {32, 128}}, 3830 {9, {16, 128}}, 3831 {33, {0, 0}}, 3832 {UINT_MAX, {0, 0}}, 3833 }, 3834 { 3835 /* Two shader engines */ 3836 {0, {256, 256}}, 3837 {2, {128, 256}}, 3838 {3, {128, 128}}, 3839 {5, {64, 128}}, 3840 {9, {32, 128}}, 3841 {17, {16, 128}}, 3842 {33, {0, 0}}, 3843 {UINT_MAX, {0, 0}}, 3844 }, 3845 { 3846 /* Four shader engines */ 3847 {0, {256, 512}}, 3848 {2, {256, 256}}, 3849 {3, {128, 256}}, 3850 {5, {128, 128}}, 3851 {9, {64, 128}}, 3852 {17, {16, 128}}, 3853 {33, {0, 0}}, 3854 {UINT_MAX, {0, 0}}, 3855 }, 3856 }, 3857 }; 3858 static const struct radv_bin_size_entry ds_size_table[][3][9] = { 3859 { 3860 // One RB / SE 3861 { 3862 // One shader engine 3863 {0, {128, 256}}, 3864 {2, {128, 128}}, 3865 {4, {64, 128}}, 3866 {7, {32, 128}}, 3867 {13, {16, 128}}, 3868 {49, {0, 0}}, 3869 {UINT_MAX, {0, 0}}, 3870 }, 3871 { 3872 // Two shader engines 3873 {0, {256, 256}}, 3874 {2, {128, 256}}, 3875 {4, {128, 128}}, 3876 {7, {64, 128}}, 3877 {13, {32, 128}}, 3878 {25, {16, 128}}, 3879 {49, {0, 0}}, 3880 {UINT_MAX, {0, 0}}, 3881 }, 3882 { 3883 // Four shader engines 3884 {0, {256, 512}}, 3885 {2, {256, 256}}, 3886 {4, {128, 256}}, 3887 {7, {128, 128}}, 3888 {13, {64, 128}}, 3889 {25, {16, 128}}, 3890 {49, {0, 0}}, 3891 {UINT_MAX, {0, 0}}, 3892 }, 3893 }, 3894 { 3895 // Two RB / SE 3896 { 3897 // One shader engine 3898 {0, {256, 256}}, 3899 {2, {128, 256}}, 3900 {4, {128, 128}}, 3901 {7, {64, 128}}, 3902 {13, {32, 128}}, 3903 {25, {16, 128}}, 3904 {97, {0, 0}}, 3905 {UINT_MAX, {0, 0}}, 3906 }, 3907 { 3908 // Two shader engines 3909 {0, {256, 512}}, 3910 {2, {256, 256}}, 3911 {4, {128, 256}}, 3912 {7, {128, 128}}, 3913 {13, {64, 128}}, 3914 {25, {32, 128}}, 
3915 {49, {16, 128}}, 3916 {97, {0, 0}}, 3917 {UINT_MAX, {0, 0}}, 3918 }, 3919 { 3920 // Four shader engines 3921 {0, {512, 512}}, 3922 {2, {256, 512}}, 3923 {4, {256, 256}}, 3924 {7, {128, 256}}, 3925 {13, {128, 128}}, 3926 {25, {64, 128}}, 3927 {49, {16, 128}}, 3928 {97, {0, 0}}, 3929 {UINT_MAX, {0, 0}}, 3930 }, 3931 }, 3932 { 3933 // Four RB / SE 3934 { 3935 // One shader engine 3936 {0, {256, 512}}, 3937 {2, {256, 256}}, 3938 {4, {128, 256}}, 3939 {7, {128, 128}}, 3940 {13, {64, 128}}, 3941 {25, {32, 128}}, 3942 {49, {16, 128}}, 3943 {UINT_MAX, {0, 0}}, 3944 }, 3945 { 3946 // Two shader engines 3947 {0, {512, 512}}, 3948 {2, {256, 512}}, 3949 {4, {256, 256}}, 3950 {7, {128, 256}}, 3951 {13, {128, 128}}, 3952 {25, {64, 128}}, 3953 {49, {32, 128}}, 3954 {97, {16, 128}}, 3955 {UINT_MAX, {0, 0}}, 3956 }, 3957 { 3958 // Four shader engines 3959 {0, {512, 512}}, 3960 {4, {256, 512}}, 3961 {7, {256, 256}}, 3962 {13, {128, 256}}, 3963 {25, {128, 128}}, 3964 {49, {64, 128}}, 3965 {97, {16, 128}}, 3966 {UINT_MAX, {0, 0}}, 3967 }, 3968 }, 3969 }; 3970 3971 RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass); 3972 struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass; 3973 VkExtent2D extent = {512, 512}; 3974 3975 unsigned log_num_rb_per_se = 3976 util_logbase2_ceil(pipeline->device->physical_device->rad_info.max_render_backends / 3977 pipeline->device->physical_device->rad_info.max_se); 3978 unsigned log_num_se = util_logbase2_ceil(pipeline->device->physical_device->rad_info.max_se); 3979 3980 unsigned total_samples = 1u << G_028BE0_MSAA_NUM_SAMPLES(pipeline->graphics.ms.pa_sc_aa_config); 3981 unsigned ps_iter_samples = 1u << G_028804_PS_ITER_SAMPLES(pipeline->graphics.ms.db_eqaa); 3982 unsigned effective_samples = total_samples; 3983 unsigned color_bytes_per_pixel = 0; 3984 3985 const VkPipelineColorBlendStateCreateInfo *vkblend = 3986 radv_pipeline_get_color_blend_state(pCreateInfo); 3987 if (vkblend) { 3988 for (unsigned i = 0; i < subpass->color_count; i++) { 3989 if (!vkblend->pAttachments[i].colorWriteMask) 3990 continue; 3991 3992 if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) 3993 continue; 3994 3995 VkFormat format = pass->attachments[subpass->color_attachments[i].attachment].format; 3996 color_bytes_per_pixel += vk_format_get_blocksize(format); 3997 } 3998 3999 /* MSAA images typically don't use all samples all the time. */ 4000 if (effective_samples >= 2 && ps_iter_samples <= 1) 4001 effective_samples = 2; 4002 color_bytes_per_pixel *= effective_samples; 4003 } 4004 4005 const struct radv_bin_size_entry *color_entry = color_size_table[log_num_rb_per_se][log_num_se]; 4006 while (color_entry[1].bpp <= color_bytes_per_pixel) 4007 ++color_entry; 4008 4009 extent = color_entry->extent; 4010 4011 if (subpass->depth_stencil_attachment) { 4012 struct radv_render_pass_attachment *attachment = 4013 pass->attachments + subpass->depth_stencil_attachment->attachment; 4014 4015 /* Coefficients taken from AMDVLK */ 4016 unsigned depth_coeff = vk_format_has_depth(attachment->format) ? 5 : 0; 4017 unsigned stencil_coeff = vk_format_has_stencil(attachment->format) ? 
1 : 0; 4018 unsigned ds_bytes_per_pixel = 4 * (depth_coeff + stencil_coeff) * total_samples; 4019 4020 const struct radv_bin_size_entry *ds_entry = ds_size_table[log_num_rb_per_se][log_num_se]; 4021 while (ds_entry[1].bpp <= ds_bytes_per_pixel) 4022 ++ds_entry; 4023 4024 if (ds_entry->extent.width * ds_entry->extent.height < extent.width * extent.height) 4025 extent = ds_entry->extent; 4026 } 4027 4028 return extent; 4029} 4030 4031static VkExtent2D 4032radv_gfx10_compute_bin_size(const struct radv_pipeline *pipeline, 4033 const VkGraphicsPipelineCreateInfo *pCreateInfo) 4034{ 4035 RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass); 4036 struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass; 4037 VkExtent2D extent = {512, 512}; 4038 4039 const unsigned db_tag_size = 64; 4040 const unsigned db_tag_count = 312; 4041 const unsigned color_tag_size = 1024; 4042 const unsigned color_tag_count = 31; 4043 const unsigned fmask_tag_size = 256; 4044 const unsigned fmask_tag_count = 44; 4045 4046 const unsigned rb_count = pipeline->device->physical_device->rad_info.max_render_backends; 4047 const unsigned pipe_count = 4048 MAX2(rb_count, pipeline->device->physical_device->rad_info.num_tcc_blocks); 4049 4050 const unsigned db_tag_part = (db_tag_count * rb_count / pipe_count) * db_tag_size * pipe_count; 4051 const unsigned color_tag_part = 4052 (color_tag_count * rb_count / pipe_count) * color_tag_size * pipe_count; 4053 const unsigned fmask_tag_part = 4054 (fmask_tag_count * rb_count / pipe_count) * fmask_tag_size * pipe_count; 4055 4056 const unsigned total_samples = 4057 1u << G_028BE0_MSAA_NUM_SAMPLES(pipeline->graphics.ms.pa_sc_aa_config); 4058 const unsigned samples_log = util_logbase2_ceil(total_samples); 4059 4060 unsigned color_bytes_per_pixel = 0; 4061 unsigned fmask_bytes_per_pixel = 0; 4062 4063 const VkPipelineColorBlendStateCreateInfo *vkblend = 4064 radv_pipeline_get_color_blend_state(pCreateInfo); 4065 if (vkblend) { 4066 for (unsigned i = 0; i < subpass->color_count; i++) { 4067 if (!vkblend->pAttachments[i].colorWriteMask) 4068 continue; 4069 4070 if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) 4071 continue; 4072 4073 VkFormat format = pass->attachments[subpass->color_attachments[i].attachment].format; 4074 color_bytes_per_pixel += vk_format_get_blocksize(format); 4075 4076 if (total_samples > 1) { 4077 assert(samples_log <= 3); 4078 const unsigned fmask_array[] = {0, 1, 1, 4}; 4079 fmask_bytes_per_pixel += fmask_array[samples_log]; 4080 } 4081 } 4082 4083 color_bytes_per_pixel *= total_samples; 4084 } 4085 color_bytes_per_pixel = MAX2(color_bytes_per_pixel, 1); 4086 4087 const unsigned color_pixel_count_log = util_logbase2(color_tag_part / color_bytes_per_pixel); 4088 extent.width = 1ull << ((color_pixel_count_log + 1) / 2); 4089 extent.height = 1ull << (color_pixel_count_log / 2); 4090 4091 if (fmask_bytes_per_pixel) { 4092 const unsigned fmask_pixel_count_log = util_logbase2(fmask_tag_part / fmask_bytes_per_pixel); 4093 4094 const VkExtent2D fmask_extent = 4095 (VkExtent2D){.width = 1ull << ((fmask_pixel_count_log + 1) / 2), 4096 .height = 1ull << (color_pixel_count_log / 2)}; 4097 4098 if (fmask_extent.width * fmask_extent.height < extent.width * extent.height) 4099 extent = fmask_extent; 4100 } 4101 4102 if (subpass->depth_stencil_attachment) { 4103 struct radv_render_pass_attachment *attachment = 4104 pass->attachments + subpass->depth_stencil_attachment->attachment; 4105 4106 /* Coefficients taken from AMDVLK */ 4107 unsigned 
depth_coeff = vk_format_has_depth(attachment->format) ? 5 : 0; 4108 unsigned stencil_coeff = vk_format_has_stencil(attachment->format) ? 1 : 0; 4109 unsigned db_bytes_per_pixel = (depth_coeff + stencil_coeff) * total_samples; 4110 4111 const unsigned db_pixel_count_log = util_logbase2(db_tag_part / db_bytes_per_pixel); 4112 4113 const VkExtent2D db_extent = (VkExtent2D){.width = 1ull << ((db_pixel_count_log + 1) / 2), 4114 .height = 1ull << (color_pixel_count_log / 2)}; 4115 4116 if (db_extent.width * db_extent.height < extent.width * extent.height) 4117 extent = db_extent; 4118 } 4119 4120 extent.width = MAX2(extent.width, 128); 4121 extent.height = MAX2(extent.width, 64); 4122 4123 return extent; 4124} 4125 4126static void 4127radv_pipeline_init_disabled_binning_state(struct radv_pipeline *pipeline, 4128 const VkGraphicsPipelineCreateInfo *pCreateInfo) 4129{ 4130 uint32_t pa_sc_binner_cntl_0 = S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) | 4131 S_028C44_DISABLE_START_OF_PRIM(1); 4132 4133 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) { 4134 RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass); 4135 struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass; 4136 const VkPipelineColorBlendStateCreateInfo *vkblend = 4137 radv_pipeline_get_color_blend_state(pCreateInfo); 4138 unsigned min_bytes_per_pixel = 0; 4139 4140 if (vkblend) { 4141 for (unsigned i = 0; i < subpass->color_count; i++) { 4142 if (!vkblend->pAttachments[i].colorWriteMask) 4143 continue; 4144 4145 if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) 4146 continue; 4147 4148 VkFormat format = pass->attachments[subpass->color_attachments[i].attachment].format; 4149 unsigned bytes = vk_format_get_blocksize(format); 4150 if (!min_bytes_per_pixel || bytes < min_bytes_per_pixel) 4151 min_bytes_per_pixel = bytes; 4152 } 4153 } 4154 4155 pa_sc_binner_cntl_0 = 4156 S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_NEW_SC) | S_028C44_BIN_SIZE_X(0) | 4157 S_028C44_BIN_SIZE_Y(0) | S_028C44_BIN_SIZE_X_EXTEND(2) | /* 128 */ 4158 S_028C44_BIN_SIZE_Y_EXTEND(min_bytes_per_pixel <= 4 ? 2 : 1) | /* 128 or 64 */ 4159 S_028C44_DISABLE_START_OF_PRIM(1); 4160 } 4161 4162 pipeline->graphics.binning.pa_sc_binner_cntl_0 = pa_sc_binner_cntl_0; 4163} 4164 4165struct radv_binning_settings 4166radv_get_binning_settings(const struct radv_physical_device *pdev) 4167{ 4168 struct radv_binning_settings settings; 4169 if (pdev->rad_info.has_dedicated_vram) { 4170 if (pdev->rad_info.max_render_backends > 4) { 4171 settings.context_states_per_bin = 1; 4172 settings.persistent_states_per_bin = 1; 4173 } else { 4174 settings.context_states_per_bin = 3; 4175 settings.persistent_states_per_bin = 8; 4176 } 4177 settings.fpovs_per_batch = 63; 4178 } else { 4179 /* The context states are affected by the scissor bug. */ 4180 settings.context_states_per_bin = 6; 4181 /* 32 causes hangs for RAVEN. 
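* (hence the more conservative value of 16 used below for chips without dedicated VRAM)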
*/ 4182 settings.persistent_states_per_bin = 16; 4183 settings.fpovs_per_batch = 63; 4184 } 4185 4186 if (pdev->rad_info.has_gfx9_scissor_bug) 4187 settings.context_states_per_bin = 1; 4188 4189 return settings; 4190} 4191 4192static void 4193radv_pipeline_init_binning_state(struct radv_pipeline *pipeline, 4194 const VkGraphicsPipelineCreateInfo *pCreateInfo, 4195 const struct radv_blend_state *blend) 4196{ 4197 if (pipeline->device->physical_device->rad_info.chip_class < GFX9) 4198 return; 4199 4200 VkExtent2D bin_size; 4201 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) { 4202 bin_size = radv_gfx10_compute_bin_size(pipeline, pCreateInfo); 4203 } else if (pipeline->device->physical_device->rad_info.chip_class == GFX9) { 4204 bin_size = radv_gfx9_compute_bin_size(pipeline, pCreateInfo); 4205 } else 4206 unreachable("Unhandled generation for binning bin size calculation"); 4207 4208 if (pipeline->device->pbb_allowed && bin_size.width && bin_size.height) { 4209 struct radv_binning_settings settings = 4210 radv_get_binning_settings(pipeline->device->physical_device); 4211 4212 const uint32_t pa_sc_binner_cntl_0 = 4213 S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) | 4214 S_028C44_BIN_SIZE_X(bin_size.width == 16) | S_028C44_BIN_SIZE_Y(bin_size.height == 16) | 4215 S_028C44_BIN_SIZE_X_EXTEND(util_logbase2(MAX2(bin_size.width, 32)) - 5) | 4216 S_028C44_BIN_SIZE_Y_EXTEND(util_logbase2(MAX2(bin_size.height, 32)) - 5) | 4217 S_028C44_CONTEXT_STATES_PER_BIN(settings.context_states_per_bin - 1) | 4218 S_028C44_PERSISTENT_STATES_PER_BIN(settings.persistent_states_per_bin - 1) | 4219 S_028C44_DISABLE_START_OF_PRIM(1) | 4220 S_028C44_FPOVS_PER_BATCH(settings.fpovs_per_batch) | S_028C44_OPTIMAL_BIN_SELECTION(1); 4221 4222 pipeline->graphics.binning.pa_sc_binner_cntl_0 = pa_sc_binner_cntl_0; 4223 } else 4224 radv_pipeline_init_disabled_binning_state(pipeline, pCreateInfo); 4225} 4226 4227static void 4228radv_pipeline_generate_depth_stencil_state(struct radeon_cmdbuf *ctx_cs, 4229 const struct radv_pipeline *pipeline, 4230 const VkGraphicsPipelineCreateInfo *pCreateInfo, 4231 const struct radv_graphics_pipeline_create_info *extra) 4232{ 4233 const VkPipelineDepthStencilStateCreateInfo *vkds = 4234 radv_pipeline_get_depth_stencil_state(pCreateInfo); 4235 RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass); 4236 struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass; 4237 struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT]; 4238 struct radv_render_pass_attachment *attachment = NULL; 4239 uint32_t db_render_control = 0, db_render_override2 = 0; 4240 uint32_t db_render_override = 0; 4241 4242 if (subpass->depth_stencil_attachment) 4243 attachment = pass->attachments + subpass->depth_stencil_attachment->attachment; 4244 4245 bool has_depth_attachment = attachment && vk_format_has_depth(attachment->format); 4246 4247 if (vkds && has_depth_attachment) { 4248 /* from amdvlk: For 4xAA and 8xAA need to decompress on flush for better performance */ 4249 db_render_override2 |= S_028010_DECOMPRESS_Z_ON_FLUSH(attachment->samples > 2); 4250 4251 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10_3) 4252 db_render_override2 |= S_028010_CENTROID_COMPUTATION_MODE(1); 4253 } 4254 4255 if (attachment && extra) { 4256 db_render_control |= S_028000_DEPTH_CLEAR_ENABLE(extra->db_depth_clear); 4257 db_render_control |= S_028000_STENCIL_CLEAR_ENABLE(extra->db_stencil_clear); 4258 4259 db_render_control |= 
S_028000_RESUMMARIZE_ENABLE(extra->resummarize_enable); 4260 db_render_control |= S_028000_DEPTH_COMPRESS_DISABLE(extra->depth_compress_disable); 4261 db_render_control |= S_028000_STENCIL_COMPRESS_DISABLE(extra->stencil_compress_disable); 4262 } 4263 4264 db_render_override |= S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) | 4265 S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE); 4266 4267 if (!pCreateInfo->pRasterizationState->depthClampEnable && ps->info.ps.writes_z) { 4268 /* From VK_EXT_depth_range_unrestricted spec: 4269 * 4270 * "The behavior described in Primitive Clipping still applies. 4271 * If depth clamping is disabled the depth values are still 4272 * clipped to 0 ≤ zc ≤ wc before the viewport transform. If 4273 * depth clamping is enabled the above equation is ignored and 4274 * the depth values are instead clamped to the VkViewport 4275 * minDepth and maxDepth values, which in the case of this 4276 * extension can be outside of the 0.0 to 1.0 range." 4277 */ 4278 db_render_override |= S_02800C_DISABLE_VIEWPORT_CLAMP(1); 4279 } 4280 4281 radeon_set_context_reg(ctx_cs, R_028000_DB_RENDER_CONTROL, db_render_control); 4282 4283 radeon_set_context_reg_seq(ctx_cs, R_02800C_DB_RENDER_OVERRIDE, 2); 4284 radeon_emit(ctx_cs, db_render_override); 4285 radeon_emit(ctx_cs, db_render_override2); 4286} 4287 4288static void 4289radv_pipeline_generate_blend_state(struct radeon_cmdbuf *ctx_cs, 4290 const struct radv_pipeline *pipeline, 4291 const struct radv_blend_state *blend) 4292{ 4293 radeon_set_context_reg_seq(ctx_cs, R_028780_CB_BLEND0_CONTROL, 8); 4294 radeon_emit_array(ctx_cs, blend->cb_blend_control, 8); 4295 radeon_set_context_reg(ctx_cs, R_028B70_DB_ALPHA_TO_MASK, blend->db_alpha_to_mask); 4296 4297 if (pipeline->device->physical_device->rad_info.has_rbplus) { 4298 4299 radeon_set_context_reg_seq(ctx_cs, R_028760_SX_MRT0_BLEND_OPT, 8); 4300 radeon_emit_array(ctx_cs, blend->sx_mrt_blend_opt, 8); 4301 } 4302 4303 radeon_set_context_reg(ctx_cs, R_028714_SPI_SHADER_COL_FORMAT, blend->spi_shader_col_format); 4304 4305 radeon_set_context_reg(ctx_cs, R_02823C_CB_SHADER_MASK, blend->cb_shader_mask); 4306} 4307 4308static void 4309radv_pipeline_generate_raster_state(struct radeon_cmdbuf *ctx_cs, 4310 const struct radv_pipeline *pipeline, 4311 const VkGraphicsPipelineCreateInfo *pCreateInfo) 4312{ 4313 const VkPipelineRasterizationStateCreateInfo *vkraster = pCreateInfo->pRasterizationState; 4314 const VkConservativeRasterizationModeEXT mode = radv_get_conservative_raster_mode(vkraster); 4315 uint32_t pa_sc_conservative_rast = S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1); 4316 4317 if (pipeline->device->physical_device->rad_info.chip_class >= GFX9) { 4318 /* Conservative rasterization. 
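* (Overestimate mode rasterizes any pixel the primitive touches, while underestimate mode only rasterizes fully covered pixels; the OVER_RAST/UNDER_RAST bits below select between the two.)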
*/ 4319 if (mode != VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT) { 4320 pa_sc_conservative_rast = S_028C4C_PREZ_AA_MASK_ENABLE(1) | S_028C4C_POSTZ_AA_MASK_ENABLE(1) | 4321 S_028C4C_CENTROID_SAMPLE_OVERRIDE(1); 4322 4323 if (mode == VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT) { 4324 pa_sc_conservative_rast |= 4325 S_028C4C_OVER_RAST_ENABLE(1) | S_028C4C_OVER_RAST_SAMPLE_SELECT(0) | 4326 S_028C4C_UNDER_RAST_ENABLE(0) | S_028C4C_UNDER_RAST_SAMPLE_SELECT(1) | 4327 S_028C4C_PBB_UNCERTAINTY_REGION_ENABLE(1); 4328 } else { 4329 assert(mode == VK_CONSERVATIVE_RASTERIZATION_MODE_UNDERESTIMATE_EXT); 4330 pa_sc_conservative_rast |= 4331 S_028C4C_OVER_RAST_ENABLE(0) | S_028C4C_OVER_RAST_SAMPLE_SELECT(1) | 4332 S_028C4C_UNDER_RAST_ENABLE(1) | S_028C4C_UNDER_RAST_SAMPLE_SELECT(0) | 4333 S_028C4C_PBB_UNCERTAINTY_REGION_ENABLE(0); 4334 } 4335 } 4336 4337 radeon_set_context_reg(ctx_cs, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL, 4338 pa_sc_conservative_rast); 4339 } 4340} 4341 4342static void 4343radv_pipeline_generate_multisample_state(struct radeon_cmdbuf *ctx_cs, 4344 const struct radv_pipeline *pipeline) 4345{ 4346 const struct radv_multisample_state *ms = &pipeline->graphics.ms; 4347 4348 radeon_set_context_reg_seq(ctx_cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2); 4349 radeon_emit(ctx_cs, ms->pa_sc_aa_mask[0]); 4350 radeon_emit(ctx_cs, ms->pa_sc_aa_mask[1]); 4351 4352 radeon_set_context_reg(ctx_cs, R_028804_DB_EQAA, ms->db_eqaa); 4353 radeon_set_context_reg(ctx_cs, R_028BE0_PA_SC_AA_CONFIG, ms->pa_sc_aa_config); 4354 4355 radeon_set_context_reg_seq(ctx_cs, R_028A48_PA_SC_MODE_CNTL_0, 2); 4356 radeon_emit(ctx_cs, ms->pa_sc_mode_cntl_0); 4357 radeon_emit(ctx_cs, ms->pa_sc_mode_cntl_1); 4358 4359 /* The exclusion bits can be set to improve rasterization efficiency 4360 * if no sample lies on the pixel boundary (-8 sample offset). It's 4361 * currently always TRUE because the driver doesn't support 16 samples. 4362 */ 4363 bool exclusion = pipeline->device->physical_device->rad_info.chip_class >= GFX7; 4364 radeon_set_context_reg( 4365 ctx_cs, R_02882C_PA_SU_PRIM_FILTER_CNTL, 4366 S_02882C_XMAX_RIGHT_EXCLUSION(exclusion) | S_02882C_YMAX_BOTTOM_EXCLUSION(exclusion)); 4367} 4368 4369static void 4370radv_pipeline_generate_vgt_gs_mode(struct radeon_cmdbuf *ctx_cs, 4371 const struct radv_pipeline *pipeline) 4372{ 4373 const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline); 4374 const struct radv_shader_variant *vs = pipeline->shaders[MESA_SHADER_TESS_EVAL] 4375 ? 
pipeline->shaders[MESA_SHADER_TESS_EVAL] 4376 : pipeline->shaders[MESA_SHADER_VERTEX]; 4377 unsigned vgt_primitiveid_en = 0; 4378 uint32_t vgt_gs_mode = 0; 4379 4380 if (radv_pipeline_has_ngg(pipeline)) 4381 return; 4382 4383 if (radv_pipeline_has_gs(pipeline)) { 4384 const struct radv_shader_variant *gs = pipeline->shaders[MESA_SHADER_GEOMETRY]; 4385 4386 vgt_gs_mode = ac_vgt_gs_mode(gs->info.gs.vertices_out, 4387 pipeline->device->physical_device->rad_info.chip_class); 4388 } else if (outinfo->export_prim_id || vs->info.uses_prim_id) { 4389 vgt_gs_mode = S_028A40_MODE(V_028A40_GS_SCENARIO_A); 4390 vgt_primitiveid_en |= S_028A84_PRIMITIVEID_EN(1); 4391 } 4392 4393 radeon_set_context_reg(ctx_cs, R_028A84_VGT_PRIMITIVEID_EN, vgt_primitiveid_en); 4394 radeon_set_context_reg(ctx_cs, R_028A40_VGT_GS_MODE, vgt_gs_mode); 4395} 4396 4397static void 4398radv_pipeline_generate_hw_vs(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs, 4399 const struct radv_pipeline *pipeline, 4400 const struct radv_shader_variant *shader) 4401{ 4402 uint64_t va = radv_shader_variant_get_va(shader); 4403 4404 radeon_set_sh_reg_seq(cs, R_00B120_SPI_SHADER_PGM_LO_VS, 4); 4405 radeon_emit(cs, va >> 8); 4406 radeon_emit(cs, S_00B124_MEM_BASE(va >> 40)); 4407 radeon_emit(cs, shader->config.rsrc1); 4408 radeon_emit(cs, shader->config.rsrc2); 4409 4410 const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline); 4411 unsigned clip_dist_mask, cull_dist_mask, total_mask; 4412 clip_dist_mask = outinfo->clip_dist_mask; 4413 cull_dist_mask = outinfo->cull_dist_mask; 4414 total_mask = clip_dist_mask | cull_dist_mask; 4415 4416 bool writes_primitive_shading_rate = 4417 outinfo->writes_primitive_shading_rate || pipeline->device->force_vrs != RADV_FORCE_VRS_NONE; 4418 bool misc_vec_ena = outinfo->writes_pointsize || outinfo->writes_layer || 4419 outinfo->writes_viewport_index || writes_primitive_shading_rate; 4420 unsigned spi_vs_out_config, nparams; 4421 4422 /* VS is required to export at least one param. */ 4423 nparams = MAX2(outinfo->param_exports, 1); 4424 spi_vs_out_config = S_0286C4_VS_EXPORT_COUNT(nparams - 1); 4425 4426 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) { 4427 spi_vs_out_config |= S_0286C4_NO_PC_EXPORT(outinfo->param_exports == 0); 4428 } 4429 4430 radeon_set_context_reg(ctx_cs, R_0286C4_SPI_VS_OUT_CONFIG, spi_vs_out_config); 4431 4432 radeon_set_context_reg( 4433 ctx_cs, R_02870C_SPI_SHADER_POS_FORMAT, 4434 S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) | 4435 S_02870C_POS1_EXPORT_FORMAT(outinfo->pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP 4436 : V_02870C_SPI_SHADER_NONE) | 4437 S_02870C_POS2_EXPORT_FORMAT(outinfo->pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP 4438 : V_02870C_SPI_SHADER_NONE) | 4439 S_02870C_POS3_EXPORT_FORMAT(outinfo->pos_exports > 3 ? 
V_02870C_SPI_SHADER_4COMP 4440 : V_02870C_SPI_SHADER_NONE)); 4441 4442 radeon_set_context_reg(ctx_cs, R_02881C_PA_CL_VS_OUT_CNTL, 4443 S_02881C_USE_VTX_POINT_SIZE(outinfo->writes_pointsize) | 4444 S_02881C_USE_VTX_RENDER_TARGET_INDX(outinfo->writes_layer) | 4445 S_02881C_USE_VTX_VIEWPORT_INDX(outinfo->writes_viewport_index) | 4446 S_02881C_USE_VTX_VRS_RATE(writes_primitive_shading_rate) | 4447 S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) | 4448 S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena) | 4449 S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0f) != 0) | 4450 S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xf0) != 0) | 4451 total_mask << 8 | clip_dist_mask); 4452 4453 if (pipeline->device->physical_device->rad_info.chip_class <= GFX8) 4454 radeon_set_context_reg(ctx_cs, R_028AB4_VGT_REUSE_OFF, outinfo->writes_viewport_index); 4455 4456 unsigned late_alloc_wave64, cu_mask; 4457 ac_compute_late_alloc(&pipeline->device->physical_device->rad_info, false, false, 4458 shader->config.scratch_bytes_per_wave > 0, &late_alloc_wave64, &cu_mask); 4459 4460 if (pipeline->device->physical_device->rad_info.chip_class >= GFX7) { 4461 radeon_set_sh_reg_idx(pipeline->device->physical_device, cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS, 3, 4462 S_00B118_CU_EN(cu_mask) | S_00B118_WAVE_LIMIT(0x3F)); 4463 radeon_set_sh_reg(cs, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64)); 4464 } 4465 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) { 4466 uint32_t oversub_pc_lines = late_alloc_wave64 ? pipeline->device->physical_device->rad_info.pc_lines / 4 : 0; 4467 gfx10_emit_ge_pc_alloc(cs, pipeline->device->physical_device->rad_info.chip_class, oversub_pc_lines); 4468 } 4469} 4470 4471static void 4472radv_pipeline_generate_hw_es(struct radeon_cmdbuf *cs, const struct radv_pipeline *pipeline, 4473 const struct radv_shader_variant *shader) 4474{ 4475 uint64_t va = radv_shader_variant_get_va(shader); 4476 4477 radeon_set_sh_reg_seq(cs, R_00B320_SPI_SHADER_PGM_LO_ES, 4); 4478 radeon_emit(cs, va >> 8); 4479 radeon_emit(cs, S_00B324_MEM_BASE(va >> 40)); 4480 radeon_emit(cs, shader->config.rsrc1); 4481 radeon_emit(cs, shader->config.rsrc2); 4482} 4483 4484static void 4485radv_pipeline_generate_hw_ls(struct radeon_cmdbuf *cs, const struct radv_pipeline *pipeline, 4486 const struct radv_shader_variant *shader) 4487{ 4488 unsigned num_lds_blocks = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.num_lds_blocks; 4489 uint64_t va = radv_shader_variant_get_va(shader); 4490 uint32_t rsrc2 = shader->config.rsrc2; 4491 4492 radeon_set_sh_reg(cs, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8); 4493 4494 rsrc2 |= S_00B52C_LDS_SIZE(num_lds_blocks); 4495 if (pipeline->device->physical_device->rad_info.chip_class == GFX7 && 4496 pipeline->device->physical_device->rad_info.family != CHIP_HAWAII) 4497 radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, rsrc2); 4498 4499 radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2); 4500 radeon_emit(cs, shader->config.rsrc1); 4501 radeon_emit(cs, rsrc2); 4502} 4503 4504static void 4505radv_pipeline_generate_hw_ngg(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs, 4506 const struct radv_pipeline *pipeline, 4507 const struct radv_shader_variant *shader) 4508{ 4509 uint64_t va = radv_shader_variant_get_va(shader); 4510 gl_shader_stage es_type = 4511 radv_pipeline_has_tess(pipeline) ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX; 4512 struct radv_shader_variant *es = es_type == MESA_SHADER_TESS_EVAL 4513 ? 
pipeline->shaders[MESA_SHADER_TESS_EVAL] 4514 : pipeline->shaders[MESA_SHADER_VERTEX]; 4515 const struct gfx10_ngg_info *ngg_state = &shader->info.ngg_info; 4516 4517 radeon_set_sh_reg(cs, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); 4518 4519 radeon_set_sh_reg_seq(cs, R_00B228_SPI_SHADER_PGM_RSRC1_GS, 2); 4520 radeon_emit(cs, shader->config.rsrc1); 4521 radeon_emit(cs, shader->config.rsrc2); 4522 4523 const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline); 4524 unsigned clip_dist_mask, cull_dist_mask, total_mask; 4525 clip_dist_mask = outinfo->clip_dist_mask; 4526 cull_dist_mask = outinfo->cull_dist_mask; 4527 total_mask = clip_dist_mask | cull_dist_mask; 4528 4529 bool writes_primitive_shading_rate = 4530 outinfo->writes_primitive_shading_rate || pipeline->device->force_vrs != RADV_FORCE_VRS_NONE; 4531 bool misc_vec_ena = outinfo->writes_pointsize || outinfo->writes_layer || 4532 outinfo->writes_viewport_index || writes_primitive_shading_rate; 4533 bool es_enable_prim_id = outinfo->export_prim_id || (es && es->info.uses_prim_id); 4534 bool break_wave_at_eoi = false; 4535 unsigned ge_cntl; 4536 unsigned nparams; 4537 4538 if (es_type == MESA_SHADER_TESS_EVAL) { 4539 struct radv_shader_variant *gs = pipeline->shaders[MESA_SHADER_GEOMETRY]; 4540 4541 if (es_enable_prim_id || (gs && gs->info.uses_prim_id)) 4542 break_wave_at_eoi = true; 4543 } 4544 4545 nparams = MAX2(outinfo->param_exports, 1); 4546 radeon_set_context_reg( 4547 ctx_cs, R_0286C4_SPI_VS_OUT_CONFIG, 4548 S_0286C4_VS_EXPORT_COUNT(nparams - 1) | S_0286C4_NO_PC_EXPORT(outinfo->param_exports == 0)); 4549 4550 radeon_set_context_reg(ctx_cs, R_028708_SPI_SHADER_IDX_FORMAT, 4551 S_028708_IDX0_EXPORT_FORMAT(V_028708_SPI_SHADER_1COMP)); 4552 radeon_set_context_reg( 4553 ctx_cs, R_02870C_SPI_SHADER_POS_FORMAT, 4554 S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) | 4555 S_02870C_POS1_EXPORT_FORMAT(outinfo->pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP 4556 : V_02870C_SPI_SHADER_NONE) | 4557 S_02870C_POS2_EXPORT_FORMAT(outinfo->pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP 4558 : V_02870C_SPI_SHADER_NONE) | 4559 S_02870C_POS3_EXPORT_FORMAT(outinfo->pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP 4560 : V_02870C_SPI_SHADER_NONE)); 4561 4562 radeon_set_context_reg(ctx_cs, R_02881C_PA_CL_VS_OUT_CNTL, 4563 S_02881C_USE_VTX_POINT_SIZE(outinfo->writes_pointsize) | 4564 S_02881C_USE_VTX_RENDER_TARGET_INDX(outinfo->writes_layer) | 4565 S_02881C_USE_VTX_VIEWPORT_INDX(outinfo->writes_viewport_index) | 4566 S_02881C_USE_VTX_VRS_RATE(writes_primitive_shading_rate) | 4567 S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) | 4568 S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena) | 4569 S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0f) != 0) | 4570 S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xf0) != 0) | 4571 total_mask << 8 | clip_dist_mask); 4572 4573 radeon_set_context_reg(ctx_cs, R_028A84_VGT_PRIMITIVEID_EN, 4574 S_028A84_PRIMITIVEID_EN(es_enable_prim_id) | 4575 S_028A84_NGG_DISABLE_PROVOK_REUSE(outinfo->export_prim_id)); 4576 4577 radeon_set_context_reg(ctx_cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE, 4578 ngg_state->vgt_esgs_ring_itemsize); 4579 4580 /* NGG specific registers. */ 4581 struct radv_shader_variant *gs = pipeline->shaders[MESA_SHADER_GEOMETRY]; 4582 uint32_t gs_num_invocations = gs ? 
gs->info.gs.invocations : 1; 4583 4584 radeon_set_context_reg( 4585 ctx_cs, R_028A44_VGT_GS_ONCHIP_CNTL, 4586 S_028A44_ES_VERTS_PER_SUBGRP(ngg_state->hw_max_esverts) | 4587 S_028A44_GS_PRIMS_PER_SUBGRP(ngg_state->max_gsprims) | 4588 S_028A44_GS_INST_PRIMS_IN_SUBGRP(ngg_state->max_gsprims * gs_num_invocations)); 4589 radeon_set_context_reg(ctx_cs, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP, 4590 S_0287FC_MAX_VERTS_PER_SUBGROUP(ngg_state->max_out_verts)); 4591 radeon_set_context_reg(ctx_cs, R_028B4C_GE_NGG_SUBGRP_CNTL, 4592 S_028B4C_PRIM_AMP_FACTOR(ngg_state->prim_amp_factor) | 4593 S_028B4C_THDS_PER_SUBGRP(0)); /* for fast launch */ 4594 radeon_set_context_reg( 4595 ctx_cs, R_028B90_VGT_GS_INSTANCE_CNT, 4596 S_028B90_CNT(gs_num_invocations) | S_028B90_ENABLE(gs_num_invocations > 1) | 4597 S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(ngg_state->max_vert_out_per_gs_instance)); 4598 4599 ge_cntl = S_03096C_PRIM_GRP_SIZE(ngg_state->max_gsprims) | 4600 S_03096C_VERT_GRP_SIZE(ngg_state->enable_vertex_grouping ? ngg_state->hw_max_esverts : 256) | /* 256 = disable vertex grouping */ 4601 S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi); 4602 4603 /* Bug workaround for a possible hang with non-tessellation cases. 4604 * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0 4605 * 4606 * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5 4607 */ 4608 if (pipeline->device->physical_device->rad_info.chip_class == GFX10 && 4609 !radv_pipeline_has_tess(pipeline) && ngg_state->hw_max_esverts != 256) { 4610 ge_cntl &= C_03096C_VERT_GRP_SIZE; 4611 4612 if (ngg_state->hw_max_esverts > 5) { 4613 ge_cntl |= S_03096C_VERT_GRP_SIZE(ngg_state->hw_max_esverts - 5); 4614 } 4615 } 4616 4617 radeon_set_uconfig_reg(ctx_cs, R_03096C_GE_CNTL, ge_cntl); 4618 4619 unsigned late_alloc_wave64, cu_mask; 4620 ac_compute_late_alloc(&pipeline->device->physical_device->rad_info, true, shader->info.has_ngg_culling, 4621 shader->config.scratch_bytes_per_wave > 0, &late_alloc_wave64, &cu_mask); 4622 4623 radeon_set_sh_reg_idx( 4624 pipeline->device->physical_device, cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3, 4625 S_00B21C_CU_EN(cu_mask) | S_00B21C_WAVE_LIMIT(0x3F)); 4626 radeon_set_sh_reg_idx( 4627 pipeline->device->physical_device, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3, 4628 S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64)); 4629 4630 uint32_t oversub_pc_lines = late_alloc_wave64 ? 
pipeline->device->physical_device->rad_info.pc_lines / 4 : 0; 4631 if (shader->info.has_ngg_culling) { 4632 unsigned oversub_factor = 2; 4633 4634 if (outinfo->param_exports > 4) 4635 oversub_factor = 4; 4636 else if (outinfo->param_exports > 2) 4637 oversub_factor = 3; 4638 4639 oversub_pc_lines *= oversub_factor; 4640 } 4641 4642 gfx10_emit_ge_pc_alloc(cs, pipeline->device->physical_device->rad_info.chip_class, oversub_pc_lines); 4643} 4644 4645static void 4646radv_pipeline_generate_hw_hs(struct radeon_cmdbuf *cs, const struct radv_pipeline *pipeline, 4647 const struct radv_shader_variant *shader) 4648{ 4649 uint64_t va = radv_shader_variant_get_va(shader); 4650 4651 if (pipeline->device->physical_device->rad_info.chip_class >= GFX9) { 4652 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) { 4653 radeon_set_sh_reg(cs, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8); 4654 } else { 4655 radeon_set_sh_reg(cs, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8); 4656 } 4657 4658 radeon_set_sh_reg_seq(cs, R_00B428_SPI_SHADER_PGM_RSRC1_HS, 2); 4659 radeon_emit(cs, shader->config.rsrc1); 4660 radeon_emit(cs, shader->config.rsrc2); 4661 } else { 4662 radeon_set_sh_reg_seq(cs, R_00B420_SPI_SHADER_PGM_LO_HS, 4); 4663 radeon_emit(cs, va >> 8); 4664 radeon_emit(cs, S_00B424_MEM_BASE(va >> 40)); 4665 radeon_emit(cs, shader->config.rsrc1); 4666 radeon_emit(cs, shader->config.rsrc2); 4667 } 4668} 4669 4670static void 4671radv_pipeline_generate_vertex_shader(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs, 4672 const struct radv_pipeline *pipeline) 4673{ 4674 struct radv_shader_variant *vs; 4675 4676 /* Skip shaders merged into HS/GS */ 4677 vs = pipeline->shaders[MESA_SHADER_VERTEX]; 4678 if (!vs) 4679 return; 4680 4681 if (vs->info.vs.as_ls) 4682 radv_pipeline_generate_hw_ls(cs, pipeline, vs); 4683 else if (vs->info.vs.as_es) 4684 radv_pipeline_generate_hw_es(cs, pipeline, vs); 4685 else if (vs->info.is_ngg) 4686 radv_pipeline_generate_hw_ngg(ctx_cs, cs, pipeline, vs); 4687 else 4688 radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, vs); 4689} 4690 4691static void 4692radv_pipeline_generate_tess_shaders(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs, 4693 const struct radv_pipeline *pipeline) 4694{ 4695 struct radv_shader_variant *tes, *tcs; 4696 4697 tcs = pipeline->shaders[MESA_SHADER_TESS_CTRL]; 4698 tes = pipeline->shaders[MESA_SHADER_TESS_EVAL]; 4699 4700 if (tes) { 4701 if (tes->info.is_ngg) { 4702 radv_pipeline_generate_hw_ngg(ctx_cs, cs, pipeline, tes); 4703 } else if (tes->info.tes.as_es) 4704 radv_pipeline_generate_hw_es(cs, pipeline, tes); 4705 else 4706 radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, tes); 4707 } 4708 4709 radv_pipeline_generate_hw_hs(cs, pipeline, tcs); 4710 4711 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10 && 4712 !radv_pipeline_has_gs(pipeline) && !radv_pipeline_has_ngg(pipeline)) { 4713 radeon_set_context_reg(ctx_cs, R_028A44_VGT_GS_ONCHIP_CNTL, 4714 S_028A44_ES_VERTS_PER_SUBGRP(250) | S_028A44_GS_PRIMS_PER_SUBGRP(126) | 4715 S_028A44_GS_INST_PRIMS_IN_SUBGRP(126)); 4716 } 4717} 4718 4719static void 4720radv_pipeline_generate_tess_state(struct radeon_cmdbuf *ctx_cs, 4721 const struct radv_pipeline *pipeline, 4722 const VkGraphicsPipelineCreateInfo *pCreateInfo) 4723{ 4724 struct radv_shader_variant *tes = radv_get_shader(pipeline, MESA_SHADER_TESS_EVAL); 4725 unsigned type = 0, partitioning = 0, topology = 0, distribution_mode = 0; 4726 unsigned num_tcs_input_cp, num_tcs_output_cp, num_patches; 4727 unsigned ls_hs_config; 4728 
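   /* VGT_LS_HS_CONFIG describes the tessellation threadgroup layout: how many
    * patches are launched per HS threadgroup and how many control points each
    * patch has on the TCS input and output side. As a purely illustrative
    * example (values not taken from any particular pipeline): with
    * patchControlPoints = 3, tcs_vertices_out = 4 and num_tess_patches = 8,
    * the value computed below is NUM_PATCHES(8) | HS_NUM_INPUT_CP(3) |
    * HS_NUM_OUTPUT_CP(4).
    */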
4729 num_tcs_input_cp = pCreateInfo->pTessellationState->patchControlPoints; 4730 num_tcs_output_cp = 4731 pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.tcs_vertices_out; // TCS VERTICES OUT 4732 num_patches = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.num_tess_patches; 4733 4734 ls_hs_config = S_028B58_NUM_PATCHES(num_patches) | S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) | 4735 S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp); 4736 4737 if (pipeline->device->physical_device->rad_info.chip_class >= GFX7) { 4738 radeon_set_context_reg_idx(ctx_cs, R_028B58_VGT_LS_HS_CONFIG, 2, ls_hs_config); 4739 } else { 4740 radeon_set_context_reg(ctx_cs, R_028B58_VGT_LS_HS_CONFIG, ls_hs_config); 4741 } 4742 4743 switch (tes->info.tes.primitive_mode) { 4744 case GL_TRIANGLES: 4745 type = V_028B6C_TESS_TRIANGLE; 4746 break; 4747 case GL_QUADS: 4748 type = V_028B6C_TESS_QUAD; 4749 break; 4750 case GL_ISOLINES: 4751 type = V_028B6C_TESS_ISOLINE; 4752 break; 4753 } 4754 4755 switch (tes->info.tes.spacing) { 4756 case TESS_SPACING_EQUAL: 4757 partitioning = V_028B6C_PART_INTEGER; 4758 break; 4759 case TESS_SPACING_FRACTIONAL_ODD: 4760 partitioning = V_028B6C_PART_FRAC_ODD; 4761 break; 4762 case TESS_SPACING_FRACTIONAL_EVEN: 4763 partitioning = V_028B6C_PART_FRAC_EVEN; 4764 break; 4765 default: 4766 break; 4767 } 4768 4769 bool ccw = tes->info.tes.ccw; 4770 const VkPipelineTessellationDomainOriginStateCreateInfo *domain_origin_state = 4771 vk_find_struct_const(pCreateInfo->pTessellationState, 4772 PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO); 4773 4774 if (domain_origin_state && 4775 domain_origin_state->domainOrigin != VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT) 4776 ccw = !ccw; 4777 4778 if (tes->info.tes.point_mode) 4779 topology = V_028B6C_OUTPUT_POINT; 4780 else if (tes->info.tes.primitive_mode == GL_ISOLINES) 4781 topology = V_028B6C_OUTPUT_LINE; 4782 else if (ccw) 4783 topology = V_028B6C_OUTPUT_TRIANGLE_CCW; 4784 else 4785 topology = V_028B6C_OUTPUT_TRIANGLE_CW; 4786 4787 if (pipeline->device->physical_device->rad_info.has_distributed_tess) { 4788 if (pipeline->device->physical_device->rad_info.family == CHIP_FIJI || 4789 pipeline->device->physical_device->rad_info.family >= CHIP_POLARIS10) 4790 distribution_mode = V_028B6C_TRAPEZOIDS; 4791 else 4792 distribution_mode = V_028B6C_DONUTS; 4793 } else 4794 distribution_mode = V_028B6C_NO_DIST; 4795 4796 radeon_set_context_reg(ctx_cs, R_028B6C_VGT_TF_PARAM, 4797 S_028B6C_TYPE(type) | S_028B6C_PARTITIONING(partitioning) | 4798 S_028B6C_TOPOLOGY(topology) | 4799 S_028B6C_DISTRIBUTION_MODE(distribution_mode)); 4800} 4801 4802static void 4803radv_pipeline_generate_hw_gs(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs, 4804 const struct radv_pipeline *pipeline, 4805 const struct radv_shader_variant *gs) 4806{ 4807 const struct gfx9_gs_info *gs_state = &gs->info.gs_ring_info; 4808 unsigned gs_max_out_vertices; 4809 const uint8_t *num_components; 4810 uint8_t max_stream; 4811 unsigned offset; 4812 uint64_t va; 4813 4814 gs_max_out_vertices = gs->info.gs.vertices_out; 4815 max_stream = gs->info.gs.max_stream; 4816 num_components = gs->info.gs.num_stream_output_components; 4817 4818 offset = num_components[0] * gs_max_out_vertices; 4819 4820 radeon_set_context_reg_seq(ctx_cs, R_028A60_VGT_GSVS_RING_OFFSET_1, 3); 4821 radeon_emit(ctx_cs, offset); 4822 if (max_stream >= 1) 4823 offset += num_components[1] * gs_max_out_vertices; 4824 radeon_emit(ctx_cs, offset); 4825 if (max_stream >= 2) 4826 offset += num_components[2] * gs_max_out_vertices; 4827 
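   /* Each VGT_GSVS_RING_OFFSET_n programmed here is the running total of the
    * GS output size (output components times max emitted vertices) of all
    * lower-numbered streams; the total across all active streams is then
    * written to VGT_GSVS_RING_ITEMSIZE below.
    */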
radeon_emit(ctx_cs, offset); 4828 if (max_stream >= 3) 4829 offset += num_components[3] * gs_max_out_vertices; 4830 radeon_set_context_reg(ctx_cs, R_028AB0_VGT_GSVS_RING_ITEMSIZE, offset); 4831 4832 radeon_set_context_reg_seq(ctx_cs, R_028B5C_VGT_GS_VERT_ITEMSIZE, 4); 4833 radeon_emit(ctx_cs, num_components[0]); 4834 radeon_emit(ctx_cs, (max_stream >= 1) ? num_components[1] : 0); 4835 radeon_emit(ctx_cs, (max_stream >= 2) ? num_components[2] : 0); 4836 radeon_emit(ctx_cs, (max_stream >= 3) ? num_components[3] : 0); 4837 4838 uint32_t gs_num_invocations = gs->info.gs.invocations; 4839 radeon_set_context_reg( 4840 ctx_cs, R_028B90_VGT_GS_INSTANCE_CNT, 4841 S_028B90_CNT(MIN2(gs_num_invocations, 127)) | S_028B90_ENABLE(gs_num_invocations > 0)); 4842 4843 radeon_set_context_reg(ctx_cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE, 4844 gs_state->vgt_esgs_ring_itemsize); 4845 4846 va = radv_shader_variant_get_va(gs); 4847 4848 if (pipeline->device->physical_device->rad_info.chip_class >= GFX9) { 4849 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) { 4850 radeon_set_sh_reg(cs, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); 4851 } else { 4852 radeon_set_sh_reg(cs, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8); 4853 } 4854 4855 radeon_set_sh_reg_seq(cs, R_00B228_SPI_SHADER_PGM_RSRC1_GS, 2); 4856 radeon_emit(cs, gs->config.rsrc1); 4857 radeon_emit(cs, gs->config.rsrc2 | S_00B22C_LDS_SIZE(gs_state->lds_size)); 4858 4859 radeon_set_context_reg(ctx_cs, R_028A44_VGT_GS_ONCHIP_CNTL, gs_state->vgt_gs_onchip_cntl); 4860 radeon_set_context_reg(ctx_cs, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP, 4861 gs_state->vgt_gs_max_prims_per_subgroup); 4862 } else { 4863 radeon_set_sh_reg_seq(cs, R_00B220_SPI_SHADER_PGM_LO_GS, 4); 4864 radeon_emit(cs, va >> 8); 4865 radeon_emit(cs, S_00B224_MEM_BASE(va >> 40)); 4866 radeon_emit(cs, gs->config.rsrc1); 4867 radeon_emit(cs, gs->config.rsrc2); 4868 } 4869 4870 if (pipeline->device->physical_device->rad_info.chip_class >= GFX7) { 4871 radeon_set_sh_reg_idx( 4872 pipeline->device->physical_device, cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3, 4873 S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F)); 4874 4875 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) { 4876 radeon_set_sh_reg_idx( 4877 pipeline->device->physical_device, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3, 4878 S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0)); 4879 } 4880 } 4881 4882 radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, pipeline->gs_copy_shader); 4883} 4884 4885static void 4886radv_pipeline_generate_geometry_shader(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs, 4887 const struct radv_pipeline *pipeline) 4888{ 4889 struct radv_shader_variant *gs; 4890 4891 gs = pipeline->shaders[MESA_SHADER_GEOMETRY]; 4892 if (!gs) 4893 return; 4894 4895 if (gs->info.is_ngg) 4896 radv_pipeline_generate_hw_ngg(ctx_cs, cs, pipeline, gs); 4897 else 4898 radv_pipeline_generate_hw_gs(ctx_cs, cs, pipeline, gs); 4899 4900 radeon_set_context_reg(ctx_cs, R_028B38_VGT_GS_MAX_VERT_OUT, gs->info.gs.vertices_out); 4901} 4902 4903static uint32_t 4904offset_to_ps_input(uint32_t offset, bool flat_shade, bool explicit, bool float16) 4905{ 4906 uint32_t ps_input_cntl; 4907 if (offset <= AC_EXP_PARAM_OFFSET_31) { 4908 ps_input_cntl = S_028644_OFFSET(offset); 4909 if (flat_shade || explicit) 4910 ps_input_cntl |= S_028644_FLAT_SHADE(1); 4911 if (explicit) { 4912 /* Force parameter cache to be read in passthrough 4913 * mode. 
4914 */ 4915 ps_input_cntl |= S_028644_OFFSET(1 << 5); 4916 } 4917 if (float16) { 4918 ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | S_028644_ATTR0_VALID(1); 4919 } 4920 } else { 4921 /* The input is a DEFAULT_VAL constant. */ 4922 assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && offset <= AC_EXP_PARAM_DEFAULT_VAL_1111); 4923 offset -= AC_EXP_PARAM_DEFAULT_VAL_0000; 4924 ps_input_cntl = S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(offset); 4925 } 4926 return ps_input_cntl; 4927} 4928 4929static void 4930radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs, const struct radv_pipeline *pipeline) 4931{ 4932 struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT]; 4933 const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline); 4934 uint32_t ps_input_cntl[32]; 4935 4936 unsigned ps_offset = 0; 4937 4938 if (ps->info.ps.prim_id_input) { 4939 unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID]; 4940 if (vs_offset != AC_EXP_PARAM_UNDEFINED) { 4941 ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false, false); 4942 ++ps_offset; 4943 } 4944 } 4945 4946 if (ps->info.ps.layer_input) { 4947 unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_LAYER]; 4948 if (vs_offset != AC_EXP_PARAM_UNDEFINED) 4949 ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false, false); 4950 else 4951 ps_input_cntl[ps_offset] = 4952 offset_to_ps_input(AC_EXP_PARAM_DEFAULT_VAL_0000, true, false, false); 4953 ++ps_offset; 4954 } 4955 4956 if (ps->info.ps.viewport_index_input) { 4957 unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_VIEWPORT]; 4958 if (vs_offset != AC_EXP_PARAM_UNDEFINED) 4959 ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false, false); 4960 else 4961 ps_input_cntl[ps_offset] = 4962 offset_to_ps_input(AC_EXP_PARAM_DEFAULT_VAL_0000, true, false, false); 4963 ++ps_offset; 4964 } 4965 4966 if (ps->info.ps.has_pcoord) { 4967 unsigned val; 4968 val = S_028644_PT_SPRITE_TEX(1) | S_028644_OFFSET(0x20); 4969 ps_input_cntl[ps_offset] = val; 4970 ps_offset++; 4971 } 4972 4973 if (ps->info.ps.num_input_clips_culls) { 4974 unsigned vs_offset; 4975 4976 vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_CLIP_DIST0]; 4977 if (vs_offset != AC_EXP_PARAM_UNDEFINED) { 4978 ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false, false, false); 4979 ++ps_offset; 4980 } 4981 4982 vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_CLIP_DIST1]; 4983 if (vs_offset != AC_EXP_PARAM_UNDEFINED && ps->info.ps.num_input_clips_culls > 4) { 4984 ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false, false, false); 4985 ++ps_offset; 4986 } 4987 } 4988 4989 for (unsigned i = 0; i < 32 && (1u << i) <= ps->info.ps.input_mask; ++i) { 4990 unsigned vs_offset; 4991 bool flat_shade; 4992 bool explicit; 4993 bool float16; 4994 if (!(ps->info.ps.input_mask & (1u << i))) 4995 continue; 4996 4997 vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_VAR0 + i]; 4998 if (vs_offset == AC_EXP_PARAM_UNDEFINED) { 4999 ps_input_cntl[ps_offset] = S_028644_OFFSET(0x20); 5000 ++ps_offset; 5001 continue; 5002 } 5003 5004 flat_shade = !!(ps->info.ps.flat_shaded_mask & (1u << ps_offset)); 5005 explicit = !!(ps->info.ps.explicit_shaded_mask & (1u << ps_offset)); 5006 float16 = !!(ps->info.ps.float16_shaded_mask & (1u << ps_offset)); 5007 5008 ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, flat_shade, explicit, float16); 5009 ++ps_offset; 5010 } 5011 5012 if (ps_offset) { 5013 
radeon_set_context_reg_seq(ctx_cs, R_028644_SPI_PS_INPUT_CNTL_0, ps_offset); 5014 for (unsigned i = 0; i < ps_offset; i++) { 5015 radeon_emit(ctx_cs, ps_input_cntl[i]); 5016 } 5017 } 5018} 5019 5020static uint32_t 5021radv_compute_db_shader_control(const struct radv_device *device, 5022 const struct radv_pipeline *pipeline, 5023 const struct radv_shader_variant *ps) 5024{ 5025 unsigned conservative_z_export = V_02880C_EXPORT_ANY_Z; 5026 unsigned z_order; 5027 if (ps->info.ps.early_fragment_test || !ps->info.ps.writes_memory) 5028 z_order = V_02880C_EARLY_Z_THEN_LATE_Z; 5029 else 5030 z_order = V_02880C_LATE_Z; 5031 5032 if (ps->info.ps.depth_layout == FRAG_DEPTH_LAYOUT_GREATER) 5033 conservative_z_export = V_02880C_EXPORT_GREATER_THAN_Z; 5034 else if (ps->info.ps.depth_layout == FRAG_DEPTH_LAYOUT_LESS) 5035 conservative_z_export = V_02880C_EXPORT_LESS_THAN_Z; 5036 5037 bool disable_rbplus = device->physical_device->rad_info.has_rbplus && 5038 !device->physical_device->rad_info.rbplus_allowed; 5039 5040 /* It shouldn't be needed to export gl_SampleMask when MSAA is disabled 5041 * but this appears to break Project Cars (DXVK). See 5042 * https://bugs.freedesktop.org/show_bug.cgi?id=109401 5043 */ 5044 bool mask_export_enable = ps->info.ps.writes_sample_mask; 5045 5046 return S_02880C_Z_EXPORT_ENABLE(ps->info.ps.writes_z) | 5047 S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(ps->info.ps.writes_stencil) | 5048 S_02880C_KILL_ENABLE(!!ps->info.ps.can_discard) | 5049 S_02880C_MASK_EXPORT_ENABLE(mask_export_enable) | 5050 S_02880C_CONSERVATIVE_Z_EXPORT(conservative_z_export) | S_02880C_Z_ORDER(z_order) | 5051 S_02880C_DEPTH_BEFORE_SHADER(ps->info.ps.early_fragment_test) | 5052 S_02880C_PRE_SHADER_DEPTH_COVERAGE_ENABLE(ps->info.ps.post_depth_coverage) | 5053 S_02880C_EXEC_ON_HIER_FAIL(ps->info.ps.writes_memory) | 5054 S_02880C_EXEC_ON_NOOP(ps->info.ps.writes_memory) | 5055 S_02880C_DUAL_QUAD_DISABLE(disable_rbplus); 5056} 5057 5058static void 5059radv_pipeline_generate_fragment_shader(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs, 5060 struct radv_pipeline *pipeline) 5061{ 5062 struct radv_shader_variant *ps; 5063 uint64_t va; 5064 assert(pipeline->shaders[MESA_SHADER_FRAGMENT]); 5065 5066 ps = pipeline->shaders[MESA_SHADER_FRAGMENT]; 5067 va = radv_shader_variant_get_va(ps); 5068 5069 radeon_set_sh_reg_seq(cs, R_00B020_SPI_SHADER_PGM_LO_PS, 4); 5070 radeon_emit(cs, va >> 8); 5071 radeon_emit(cs, S_00B024_MEM_BASE(va >> 40)); 5072 radeon_emit(cs, ps->config.rsrc1); 5073 radeon_emit(cs, ps->config.rsrc2); 5074 5075 radeon_set_context_reg(ctx_cs, R_02880C_DB_SHADER_CONTROL, 5076 radv_compute_db_shader_control(pipeline->device, pipeline, ps)); 5077 5078 radeon_set_context_reg_seq(ctx_cs, R_0286CC_SPI_PS_INPUT_ENA, 2); 5079 radeon_emit(ctx_cs, ps->config.spi_ps_input_ena); 5080 radeon_emit(ctx_cs, ps->config.spi_ps_input_addr); 5081 5082 radeon_set_context_reg( 5083 ctx_cs, R_0286D8_SPI_PS_IN_CONTROL, 5084 S_0286D8_NUM_INTERP(ps->info.ps.num_interp) | S_0286D8_PS_W32_EN(ps->info.wave_size == 32)); 5085 5086 radeon_set_context_reg(ctx_cs, R_0286E0_SPI_BARYC_CNTL, pipeline->graphics.spi_baryc_cntl); 5087 5088 radeon_set_context_reg( 5089 ctx_cs, R_028710_SPI_SHADER_Z_FORMAT, 5090 ac_get_spi_shader_z_format(ps->info.ps.writes_z, ps->info.ps.writes_stencil, 5091 ps->info.ps.writes_sample_mask)); 5092} 5093 5094static void 5095radv_pipeline_generate_vgt_vertex_reuse(struct radeon_cmdbuf *ctx_cs, 5096 const struct radv_pipeline *pipeline) 5097{ 5098 if (pipeline->device->physical_device->rad_info.family < 
CHIP_POLARIS10 || 5099 pipeline->device->physical_device->rad_info.chip_class >= GFX10) 5100 return; 5101 5102 unsigned vtx_reuse_depth = 30; 5103 if (radv_pipeline_has_tess(pipeline) && 5104 radv_get_shader(pipeline, MESA_SHADER_TESS_EVAL)->info.tes.spacing == 5105 TESS_SPACING_FRACTIONAL_ODD) { 5106 vtx_reuse_depth = 14; 5107 } 5108 radeon_set_context_reg(ctx_cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 5109 S_028C58_VTX_REUSE_DEPTH(vtx_reuse_depth)); 5110} 5111 5112static void 5113radv_pipeline_generate_vgt_shader_config(struct radeon_cmdbuf *ctx_cs, 5114 const struct radv_pipeline *pipeline) 5115{ 5116 uint32_t stages = 0; 5117 if (radv_pipeline_has_tess(pipeline)) { 5118 stages |= S_028B54_LS_EN(V_028B54_LS_STAGE_ON) | S_028B54_HS_EN(1) | S_028B54_DYNAMIC_HS(1); 5119 5120 if (radv_pipeline_has_gs(pipeline)) 5121 stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) | S_028B54_GS_EN(1); 5122 else if (radv_pipeline_has_ngg(pipeline)) 5123 stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS); 5124 else 5125 stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS); 5126 } else if (radv_pipeline_has_gs(pipeline)) { 5127 stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) | S_028B54_GS_EN(1); 5128 } else if (radv_pipeline_has_ngg(pipeline)) { 5129 stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL); 5130 } 5131 5132 if (radv_pipeline_has_ngg(pipeline)) { 5133 stages |= S_028B54_PRIMGEN_EN(1); 5134 if (pipeline->streamout_shader) 5135 stages |= S_028B54_NGG_WAVE_ID_EN(1); 5136 if (radv_pipeline_has_ngg_passthrough(pipeline)) 5137 stages |= S_028B54_PRIMGEN_PASSTHRU_EN(1); 5138 } else if (radv_pipeline_has_gs(pipeline)) { 5139 stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER); 5140 } 5141 5142 if (pipeline->device->physical_device->rad_info.chip_class >= GFX9) 5143 stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2); 5144 5145 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) { 5146 uint8_t hs_size = 64, gs_size = 64, vs_size = 64; 5147 5148 if (radv_pipeline_has_tess(pipeline)) 5149 hs_size = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.wave_size; 5150 5151 if (pipeline->shaders[MESA_SHADER_GEOMETRY]) { 5152 vs_size = gs_size = pipeline->shaders[MESA_SHADER_GEOMETRY]->info.wave_size; 5153 if (radv_pipeline_has_gs_copy_shader(pipeline)) 5154 vs_size = pipeline->gs_copy_shader->info.wave_size; 5155 } else if (pipeline->shaders[MESA_SHADER_TESS_EVAL]) 5156 vs_size = pipeline->shaders[MESA_SHADER_TESS_EVAL]->info.wave_size; 5157 else if (pipeline->shaders[MESA_SHADER_VERTEX]) 5158 vs_size = pipeline->shaders[MESA_SHADER_VERTEX]->info.wave_size; 5159 5160 if (radv_pipeline_has_ngg(pipeline)) { 5161 assert(!radv_pipeline_has_gs_copy_shader(pipeline)); 5162 gs_size = vs_size; 5163 } 5164 5165 /* legacy GS only supports Wave64 */ 5166 stages |= S_028B54_HS_W32_EN(hs_size == 32 ? 1 : 0) | 5167 S_028B54_GS_W32_EN(gs_size == 32 ? 1 : 0) | 5168 S_028B54_VS_W32_EN(vs_size == 32 ? 
1 : 0); 5169 } 5170 5171 radeon_set_context_reg(ctx_cs, R_028B54_VGT_SHADER_STAGES_EN, stages); 5172} 5173 5174static void 5175radv_pipeline_generate_cliprect_rule(struct radeon_cmdbuf *ctx_cs, 5176 const VkGraphicsPipelineCreateInfo *pCreateInfo) 5177{ 5178 const VkPipelineDiscardRectangleStateCreateInfoEXT *discard_rectangle_info = 5179 vk_find_struct_const(pCreateInfo->pNext, PIPELINE_DISCARD_RECTANGLE_STATE_CREATE_INFO_EXT); 5180 uint32_t cliprect_rule = 0; 5181 5182 if (!discard_rectangle_info) { 5183 cliprect_rule = 0xffff; 5184 } else { 5185 for (unsigned i = 0; i < (1u << MAX_DISCARD_RECTANGLES); ++i) { 5186 /* Interpret i as a bitmask, and then set the bit in 5187 * the mask if that combination of rectangles in which 5188 * the pixel is contained should pass the cliprect 5189 * test. 5190 */ 5191 unsigned relevant_subset = i & ((1u << discard_rectangle_info->discardRectangleCount) - 1); 5192 5193 if (discard_rectangle_info->discardRectangleMode == 5194 VK_DISCARD_RECTANGLE_MODE_INCLUSIVE_EXT && 5195 !relevant_subset) 5196 continue; 5197 5198 if (discard_rectangle_info->discardRectangleMode == 5199 VK_DISCARD_RECTANGLE_MODE_EXCLUSIVE_EXT && 5200 relevant_subset) 5201 continue; 5202 5203 cliprect_rule |= 1u << i; 5204 } 5205 } 5206 5207 radeon_set_context_reg(ctx_cs, R_02820C_PA_SC_CLIPRECT_RULE, cliprect_rule); 5208} 5209 5210static void 5211gfx10_pipeline_generate_ge_cntl(struct radeon_cmdbuf *ctx_cs, struct radv_pipeline *pipeline) 5212{ 5213 bool break_wave_at_eoi = false; 5214 unsigned primgroup_size; 5215 unsigned vertgroup_size = 256; /* 256 = disable vertex grouping */ 5216 5217 if (radv_pipeline_has_tess(pipeline)) { 5218 primgroup_size = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.num_tess_patches; 5219 } else if (radv_pipeline_has_gs(pipeline)) { 5220 const struct gfx9_gs_info *gs_state = 5221 &pipeline->shaders[MESA_SHADER_GEOMETRY]->info.gs_ring_info; 5222 unsigned vgt_gs_onchip_cntl = gs_state->vgt_gs_onchip_cntl; 5223 primgroup_size = G_028A44_GS_PRIMS_PER_SUBGRP(vgt_gs_onchip_cntl); 5224 } else { 5225 primgroup_size = 128; /* recommended without a GS and tess */ 5226 } 5227 5228 if (radv_pipeline_has_tess(pipeline)) { 5229 if (pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.uses_prim_id || 5230 radv_get_shader(pipeline, MESA_SHADER_TESS_EVAL)->info.uses_prim_id) 5231 break_wave_at_eoi = true; 5232 } 5233 5234 radeon_set_uconfig_reg(ctx_cs, R_03096C_GE_CNTL, 5235 S_03096C_PRIM_GRP_SIZE(primgroup_size) | 5236 S_03096C_VERT_GRP_SIZE(vertgroup_size) | 5237 S_03096C_PACKET_TO_ONE_PA(0) /* line stipple */ | 5238 S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi)); 5239} 5240 5241static void 5242radv_pipeline_generate_vgt_gs_out(struct radeon_cmdbuf *ctx_cs, 5243 const struct radv_pipeline *pipeline, 5244 const VkGraphicsPipelineCreateInfo *pCreateInfo, 5245 const struct radv_graphics_pipeline_create_info *extra) 5246{ 5247 uint32_t gs_out; 5248 5249 if (radv_pipeline_has_gs(pipeline)) { 5250 gs_out = 5251 si_conv_gl_prim_to_gs_out(pipeline->shaders[MESA_SHADER_GEOMETRY]->info.gs.output_prim); 5252 } else if (radv_pipeline_has_tess(pipeline)) { 5253 if (pipeline->shaders[MESA_SHADER_TESS_EVAL]->info.tes.point_mode) { 5254 gs_out = V_028A6C_POINTLIST; 5255 } else { 5256 gs_out = si_conv_gl_prim_to_gs_out( 5257 pipeline->shaders[MESA_SHADER_TESS_EVAL]->info.tes.primitive_mode); 5258 } 5259 } else { 5260 gs_out = si_conv_prim_to_gs_out(pCreateInfo->pInputAssemblyState->topology); 5261 } 5262 5263 if (extra && extra->use_rectlist) { 5264 gs_out = V_028A6C_TRISTRIP; 5265 if 
(radv_pipeline_has_ngg(pipeline)) 5266 gs_out = V_028A6C_RECTLIST; 5267 } 5268 5269 radeon_set_context_reg(ctx_cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out); 5270} 5271 5272static bool 5273gfx103_pipeline_vrs_coarse_shading(const struct radv_pipeline *pipeline) 5274{ 5275 struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT]; 5276 struct radv_device *device = pipeline->device; 5277 5278 if (device->instance->debug_flags & RADV_DEBUG_NO_VRS_FLAT_SHADING) 5279 return false; 5280 5281 if (!ps->info.ps.allow_flat_shading) 5282 return false; 5283 5284 return true; 5285} 5286 5287static void 5288gfx103_pipeline_generate_vrs_state(struct radeon_cmdbuf *ctx_cs, 5289 const struct radv_pipeline *pipeline, 5290 const VkGraphicsPipelineCreateInfo *pCreateInfo) 5291{ 5292 uint32_t mode = V_028064_VRS_COMB_MODE_PASSTHRU; 5293 uint8_t rate_x = 0, rate_y = 0; 5294 bool enable_vrs = false; 5295 5296 if (vk_find_struct_const(pCreateInfo->pNext, 5297 PIPELINE_FRAGMENT_SHADING_RATE_STATE_CREATE_INFO_KHR) || 5298 radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_FRAGMENT_SHADING_RATE_KHR)) { 5299 /* Enable draw call VRS because it's explicitly requested. */ 5300 enable_vrs = true; 5301 } else if (gfx103_pipeline_vrs_coarse_shading(pipeline)) { 5302 /* Enable VRS coarse shading 2x2 if the driver determined that 5303 * it's safe to enable. 5304 */ 5305 mode = V_028064_VRS_COMB_MODE_OVERRIDE; 5306 rate_x = rate_y = 1; 5307 } else if (pipeline->device->force_vrs != RADV_FORCE_VRS_NONE) { 5308 /* Force enable vertex VRS if requested by the user. */ 5309 radeon_set_context_reg( 5310 ctx_cs, R_028848_PA_CL_VRS_CNTL, 5311 S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_VRS_COMB_MODE_OVERRIDE) | 5312 S_028848_VERTEX_RATE_COMBINER_MODE(V_028848_VRS_COMB_MODE_OVERRIDE)); 5313 5314 /* If the shader is using discard, turn off coarse shading 5315 * because discard at 2x2 pixel granularity degrades quality 5316 * too much. MIN allows sample shading but not coarse shading. 5317 */ 5318 struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT]; 5319 5320 mode = ps->info.ps.can_discard ? 
V_028064_VRS_COMB_MODE_MIN : V_028064_VRS_COMB_MODE_PASSTHRU; 5321 } 5322 5323 radeon_set_context_reg(ctx_cs, R_028A98_VGT_DRAW_PAYLOAD_CNTL, S_028A98_EN_VRS_RATE(enable_vrs)); 5324 5325 radeon_set_context_reg(ctx_cs, R_028064_DB_VRS_OVERRIDE_CNTL, 5326 S_028064_VRS_OVERRIDE_RATE_COMBINER_MODE(mode) | 5327 S_028064_VRS_OVERRIDE_RATE_X(rate_x) | 5328 S_028064_VRS_OVERRIDE_RATE_Y(rate_y)); 5329} 5330 5331static void 5332radv_pipeline_generate_pm4(struct radv_pipeline *pipeline, 5333 const VkGraphicsPipelineCreateInfo *pCreateInfo, 5334 const struct radv_graphics_pipeline_create_info *extra, 5335 const struct radv_blend_state *blend) 5336{ 5337 struct radeon_cmdbuf *ctx_cs = &pipeline->ctx_cs; 5338 struct radeon_cmdbuf *cs = &pipeline->cs; 5339 5340 cs->max_dw = 64; 5341 ctx_cs->max_dw = 256; 5342 cs->buf = malloc(4 * (cs->max_dw + ctx_cs->max_dw)); 5343 ctx_cs->buf = cs->buf + cs->max_dw; 5344 5345 radv_pipeline_generate_depth_stencil_state(ctx_cs, pipeline, pCreateInfo, extra); 5346 radv_pipeline_generate_blend_state(ctx_cs, pipeline, blend); 5347 radv_pipeline_generate_raster_state(ctx_cs, pipeline, pCreateInfo); 5348 radv_pipeline_generate_multisample_state(ctx_cs, pipeline); 5349 radv_pipeline_generate_vgt_gs_mode(ctx_cs, pipeline); 5350 radv_pipeline_generate_vertex_shader(ctx_cs, cs, pipeline); 5351 5352 if (radv_pipeline_has_tess(pipeline)) { 5353 radv_pipeline_generate_tess_shaders(ctx_cs, cs, pipeline); 5354 radv_pipeline_generate_tess_state(ctx_cs, pipeline, pCreateInfo); 5355 } 5356 5357 radv_pipeline_generate_geometry_shader(ctx_cs, cs, pipeline); 5358 radv_pipeline_generate_fragment_shader(ctx_cs, cs, pipeline); 5359 radv_pipeline_generate_ps_inputs(ctx_cs, pipeline); 5360 radv_pipeline_generate_vgt_vertex_reuse(ctx_cs, pipeline); 5361 radv_pipeline_generate_vgt_shader_config(ctx_cs, pipeline); 5362 radv_pipeline_generate_cliprect_rule(ctx_cs, pCreateInfo); 5363 radv_pipeline_generate_vgt_gs_out(ctx_cs, pipeline, pCreateInfo, extra); 5364 5365 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10 && 5366 !radv_pipeline_has_ngg(pipeline)) 5367 gfx10_pipeline_generate_ge_cntl(ctx_cs, pipeline); 5368 5369 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10_3) 5370 gfx103_pipeline_generate_vrs_state(ctx_cs, pipeline, pCreateInfo); 5371 5372 pipeline->ctx_cs_hash = _mesa_hash_data(ctx_cs->buf, ctx_cs->cdw * 4); 5373 5374 assert(ctx_cs->cdw <= ctx_cs->max_dw); 5375 assert(cs->cdw <= cs->max_dw); 5376} 5377 5378static void 5379radv_pipeline_init_vertex_input_state(struct radv_pipeline *pipeline, 5380 const VkGraphicsPipelineCreateInfo *pCreateInfo, 5381 const struct radv_pipeline_key *key) 5382{ 5383 const struct radv_shader_info *info = &radv_get_shader(pipeline, MESA_SHADER_VERTEX)->info; 5384 if (!key->vs.dynamic_input_state) { 5385 const VkPipelineVertexInputStateCreateInfo *vi_info = pCreateInfo->pVertexInputState; 5386 5387 for (uint32_t i = 0; i < vi_info->vertexBindingDescriptionCount; i++) { 5388 const VkVertexInputBindingDescription *desc = &vi_info->pVertexBindingDescriptions[i]; 5389 5390 pipeline->binding_stride[desc->binding] = desc->stride; 5391 } 5392 5393 for (uint32_t i = 0; i < vi_info->vertexAttributeDescriptionCount; i++) { 5394 const VkVertexInputAttributeDescription *desc = &vi_info->pVertexAttributeDescriptions[i]; 5395 5396 uint32_t end = desc->offset + vk_format_get_blocksize(desc->format); 5397 pipeline->attrib_ends[desc->location] = end; 5398 if (pipeline->binding_stride[desc->binding]) 5399 
pipeline->attrib_index_offset[desc->location] = 5400 desc->offset / pipeline->binding_stride[desc->binding]; 5401 pipeline->attrib_bindings[desc->location] = desc->binding; 5402 } 5403 } 5404 5405 pipeline->use_per_attribute_vb_descs = info->vs.use_per_attribute_vb_descs; 5406 pipeline->last_vertex_attrib_bit = util_last_bit(info->vs.vb_desc_usage_mask); 5407 if (pipeline->shaders[MESA_SHADER_VERTEX]) 5408 pipeline->next_vertex_stage = MESA_SHADER_VERTEX; 5409 else if (pipeline->shaders[MESA_SHADER_TESS_CTRL]) 5410 pipeline->next_vertex_stage = MESA_SHADER_TESS_CTRL; 5411 else 5412 pipeline->next_vertex_stage = MESA_SHADER_GEOMETRY; 5413 if (pipeline->next_vertex_stage == MESA_SHADER_VERTEX) { 5414 const struct radv_shader_variant *vs_shader = pipeline->shaders[MESA_SHADER_VERTEX]; 5415 pipeline->can_use_simple_input = vs_shader->info.is_ngg == pipeline->device->physical_device->use_ngg && 5416 vs_shader->info.wave_size == pipeline->device->physical_device->ge_wave_size; 5417 } else { 5418 pipeline->can_use_simple_input = false; 5419 } 5420 if (info->vs.dynamic_inputs) 5421 pipeline->vb_desc_usage_mask = BITFIELD_MASK(pipeline->last_vertex_attrib_bit); 5422 else 5423 pipeline->vb_desc_usage_mask = info->vs.vb_desc_usage_mask; 5424 pipeline->vb_desc_alloc_size = util_bitcount(pipeline->vb_desc_usage_mask) * 16; 5425} 5426 5427static struct radv_shader_variant * 5428radv_pipeline_get_streamout_shader(struct radv_pipeline *pipeline) 5429{ 5430 int i; 5431 5432 for (i = MESA_SHADER_GEOMETRY; i >= MESA_SHADER_VERTEX; i--) { 5433 struct radv_shader_variant *shader = radv_get_shader(pipeline, i); 5434 5435 if (shader && shader->info.so.num_outputs > 0) 5436 return shader; 5437 } 5438 5439 return NULL; 5440} 5441 5442static bool 5443radv_shader_need_indirect_descriptor_sets(struct radv_pipeline *pipeline, gl_shader_stage stage) 5444{ 5445 struct radv_userdata_info *loc = 5446 radv_lookup_user_sgpr(pipeline, stage, AC_UD_INDIRECT_DESCRIPTOR_SETS); 5447 return loc->sgpr_idx != -1; 5448} 5449 5450static void 5451radv_pipeline_init_shader_stages_state(struct radv_pipeline *pipeline) 5452{ 5453 struct radv_device *device = pipeline->device; 5454 5455 for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) { 5456 pipeline->user_data_0[i] = radv_pipeline_stage_to_user_data_0( 5457 pipeline, i, device->physical_device->rad_info.chip_class); 5458 5459 if (pipeline->shaders[i]) { 5460 pipeline->need_indirect_descriptor_sets |= 5461 radv_shader_need_indirect_descriptor_sets(pipeline, i); 5462 } 5463 } 5464 5465 struct radv_userdata_info *loc = 5466 radv_lookup_user_sgpr(pipeline, MESA_SHADER_VERTEX, AC_UD_VS_BASE_VERTEX_START_INSTANCE); 5467 if (loc->sgpr_idx != -1) { 5468 pipeline->graphics.vtx_base_sgpr = pipeline->user_data_0[MESA_SHADER_VERTEX]; 5469 pipeline->graphics.vtx_base_sgpr += loc->sgpr_idx * 4; 5470 pipeline->graphics.vtx_emit_num = loc->num_sgprs; 5471 pipeline->graphics.uses_drawid = 5472 radv_get_shader(pipeline, MESA_SHADER_VERTEX)->info.vs.needs_draw_id; 5473 pipeline->graphics.uses_baseinstance = 5474 radv_get_shader(pipeline, MESA_SHADER_VERTEX)->info.vs.needs_base_instance; 5475 } 5476} 5477 5478static VkResult 5479radv_pipeline_init(struct radv_pipeline *pipeline, struct radv_device *device, 5480 struct radv_pipeline_cache *cache, 5481 const VkGraphicsPipelineCreateInfo *pCreateInfo, 5482 const struct radv_graphics_pipeline_create_info *extra) 5483{ 5484 RADV_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, pCreateInfo->layout); 5485 VkResult result; 5486 5487 pipeline->device = device; 
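   /* Graphics pipeline construction proceeds in phases: the create info is
    * first lowered into the blend state and per-stage table that feed the
    * pipeline key, then radv_create_shaders() compiles the shaders, and the
    * remaining init helpers derive register state from the compiled binaries
    * before radv_pipeline_generate_pm4() records the PM4 packets.
    */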
5488 pipeline->graphics.last_vgt_api_stage = MESA_SHADER_NONE; 5489 5490 struct radv_blend_state blend = radv_pipeline_init_blend_state(pipeline, pCreateInfo, extra); 5491 5492 const VkPipelineCreationFeedbackCreateInfoEXT *creation_feedback = 5493 vk_find_struct_const(pCreateInfo->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO_EXT); 5494 radv_init_feedback(creation_feedback); 5495 5496 VkPipelineCreationFeedbackEXT *pipeline_feedback = 5497 creation_feedback ? creation_feedback->pPipelineCreationFeedback : NULL; 5498 5499 const VkPipelineShaderStageCreateInfo *pStages[MESA_SHADER_STAGES] = { 5500 0, 5501 }; 5502 VkPipelineCreationFeedbackEXT *stage_feedbacks[MESA_SHADER_STAGES] = {0}; 5503 for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) { 5504 gl_shader_stage stage = ffs(pCreateInfo->pStages[i].stage) - 1; 5505 pStages[stage] = &pCreateInfo->pStages[i]; 5506 if (creation_feedback) 5507 stage_feedbacks[stage] = &creation_feedback->pPipelineStageCreationFeedbacks[i]; 5508 } 5509 5510 struct radv_pipeline_key key = 5511 radv_generate_graphics_pipeline_key(pipeline, pCreateInfo, &blend); 5512 5513 result = radv_create_shaders(pipeline, pipeline_layout, device, cache, &key, pStages, 5514 pCreateInfo->flags, NULL, pipeline_feedback, stage_feedbacks); 5515 if (result != VK_SUCCESS) 5516 return result; 5517 5518 pipeline->graphics.spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1); 5519 radv_pipeline_init_multisample_state(pipeline, &blend, pCreateInfo); 5520 radv_pipeline_init_input_assembly_state(pipeline, pCreateInfo, extra); 5521 radv_pipeline_init_dynamic_state(pipeline, pCreateInfo, extra); 5522 radv_pipeline_init_raster_state(pipeline, pCreateInfo); 5523 radv_pipeline_init_depth_stencil_state(pipeline, pCreateInfo); 5524 5525 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10_3) 5526 gfx103_pipeline_init_vrs_state(pipeline, pCreateInfo); 5527 5528 /* Ensure that some export memory is always allocated, for two reasons: 5529 * 5530 * 1) Correctness: The hardware ignores the EXEC mask if no export 5531 * memory is allocated, so KILL and alpha test do not work correctly 5532 * without this. 5533 * 2) Performance: Every shader needs at least a NULL export, even when 5534 * it writes no color/depth output. The NULL export instruction 5535 * stalls without this setting. 5536 * 5537 * Don't add this to CB_SHADER_MASK. 5538 * 5539 * GFX10 supports pixel shaders without exports by setting both the 5540 * color and Z formats to SPI_SHADER_ZERO. The hw will skip export 5541 * instructions if any are present. 5542 */ 5543 struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT]; 5544 if ((pipeline->device->physical_device->rad_info.chip_class <= GFX9 || 5545 ps->info.ps.can_discard) && 5546 !blend.spi_shader_col_format) { 5547 if (!ps->info.ps.writes_z && !ps->info.ps.writes_stencil && !ps->info.ps.writes_sample_mask) 5548 blend.spi_shader_col_format = V_028714_SPI_SHADER_32_R; 5549 } 5550 5551 if (extra && (extra->custom_blend_mode == V_028808_CB_ELIMINATE_FAST_CLEAR || 5552 extra->custom_blend_mode == V_028808_CB_FMASK_DECOMPRESS || 5553 extra->custom_blend_mode == V_028808_CB_DCC_DECOMPRESS || 5554 extra->custom_blend_mode == V_028808_CB_RESOLVE)) { 5555 /* According to the CB spec states, CB_SHADER_MASK should be 5556 * set to enable writes to all four channels of MRT0. 
5557 */ 5558 blend.cb_shader_mask = 0xf; 5559 } 5560 5561 pipeline->graphics.col_format = blend.spi_shader_col_format; 5562 pipeline->graphics.cb_target_mask = blend.cb_target_mask; 5563 5564 if (radv_pipeline_has_gs(pipeline) && !radv_pipeline_has_ngg(pipeline)) { 5565 struct radv_shader_variant *gs = pipeline->shaders[MESA_SHADER_GEOMETRY]; 5566 5567 radv_pipeline_init_gs_ring_state(pipeline, &gs->info.gs_ring_info); 5568 } 5569 5570 if (radv_pipeline_has_tess(pipeline)) { 5571 pipeline->graphics.tess_patch_control_points = 5572 pCreateInfo->pTessellationState->patchControlPoints; 5573 } 5574 5575 radv_pipeline_init_vertex_input_state(pipeline, pCreateInfo, &key); 5576 radv_pipeline_init_binning_state(pipeline, pCreateInfo, &blend); 5577 radv_pipeline_init_shader_stages_state(pipeline); 5578 radv_pipeline_init_scratch(device, pipeline); 5579 5580 /* Find the last vertex shader stage that eventually uses streamout. */ 5581 pipeline->streamout_shader = radv_pipeline_get_streamout_shader(pipeline); 5582 5583 pipeline->graphics.is_ngg = radv_pipeline_has_ngg(pipeline); 5584 pipeline->graphics.has_ngg_culling = 5585 pipeline->graphics.is_ngg && 5586 pipeline->shaders[pipeline->graphics.last_vgt_api_stage]->info.has_ngg_culling; 5587 5588 pipeline->push_constant_size = pipeline_layout->push_constant_size; 5589 pipeline->dynamic_offset_count = pipeline_layout->dynamic_offset_count; 5590 5591 radv_pipeline_generate_pm4(pipeline, pCreateInfo, extra, &blend); 5592 5593 return result; 5594} 5595 5596VkResult 5597radv_graphics_pipeline_create(VkDevice _device, VkPipelineCache _cache, 5598 const VkGraphicsPipelineCreateInfo *pCreateInfo, 5599 const struct radv_graphics_pipeline_create_info *extra, 5600 const VkAllocationCallbacks *pAllocator, VkPipeline *pPipeline) 5601{ 5602 RADV_FROM_HANDLE(radv_device, device, _device); 5603 RADV_FROM_HANDLE(radv_pipeline_cache, cache, _cache); 5604 struct radv_pipeline *pipeline; 5605 VkResult result; 5606 5607 pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8, 5608 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); 5609 if (pipeline == NULL) 5610 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); 5611 5612 vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE); 5613 pipeline->type = RADV_PIPELINE_GRAPHICS; 5614 5615 result = radv_pipeline_init(pipeline, device, cache, pCreateInfo, extra); 5616 if (result != VK_SUCCESS) { 5617 radv_pipeline_destroy(device, pipeline, pAllocator); 5618 return result; 5619 } 5620 5621 *pPipeline = radv_pipeline_to_handle(pipeline); 5622 5623 return VK_SUCCESS; 5624} 5625 5626VkResult 5627radv_CreateGraphicsPipelines(VkDevice _device, VkPipelineCache pipelineCache, uint32_t count, 5628 const VkGraphicsPipelineCreateInfo *pCreateInfos, 5629 const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines) 5630{ 5631 VkResult result = VK_SUCCESS; 5632 unsigned i = 0; 5633 5634 for (; i < count; i++) { 5635 VkResult r; 5636 r = radv_graphics_pipeline_create(_device, pipelineCache, &pCreateInfos[i], NULL, pAllocator, 5637 &pPipelines[i]); 5638 if (r != VK_SUCCESS) { 5639 result = r; 5640 pPipelines[i] = VK_NULL_HANDLE; 5641 5642 if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT) 5643 break; 5644 } 5645 } 5646 5647 for (; i < count; ++i) 5648 pPipelines[i] = VK_NULL_HANDLE; 5649 5650 return result; 5651} 5652 5653static void 5654radv_pipeline_generate_hw_cs(struct radeon_cmdbuf *cs, const struct radv_pipeline *pipeline) 5655{ 5656 struct radv_shader_variant *shader = 
pipeline->shaders[MESA_SHADER_COMPUTE]; 5657 uint64_t va = radv_shader_variant_get_va(shader); 5658 struct radv_device *device = pipeline->device; 5659 5660 radeon_set_sh_reg(cs, R_00B830_COMPUTE_PGM_LO, va >> 8); 5661 5662 radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2); 5663 radeon_emit(cs, shader->config.rsrc1); 5664 radeon_emit(cs, shader->config.rsrc2); 5665 if (device->physical_device->rad_info.chip_class >= GFX10) { 5666 radeon_set_sh_reg(cs, R_00B8A0_COMPUTE_PGM_RSRC3, shader->config.rsrc3); 5667 } 5668} 5669 5670static void 5671radv_pipeline_generate_compute_state(struct radeon_cmdbuf *cs, const struct radv_pipeline *pipeline) 5672{ 5673 struct radv_shader_variant *shader = pipeline->shaders[MESA_SHADER_COMPUTE]; 5674 struct radv_device *device = pipeline->device; 5675 unsigned threads_per_threadgroup; 5676 unsigned threadgroups_per_cu = 1; 5677 unsigned waves_per_threadgroup; 5678 unsigned max_waves_per_sh = 0; 5679 5680 /* Calculate best compute resource limits. */ 5681 threads_per_threadgroup = 5682 shader->info.cs.block_size[0] * shader->info.cs.block_size[1] * shader->info.cs.block_size[2]; 5683 waves_per_threadgroup = DIV_ROUND_UP(threads_per_threadgroup, shader->info.wave_size); 5684 5685 if (device->physical_device->rad_info.chip_class >= GFX10 && waves_per_threadgroup == 1) 5686 threadgroups_per_cu = 2; 5687 5688 radeon_set_sh_reg( 5689 cs, R_00B854_COMPUTE_RESOURCE_LIMITS, 5690 ac_get_compute_resource_limits(&device->physical_device->rad_info, waves_per_threadgroup, 5691 max_waves_per_sh, threadgroups_per_cu)); 5692 5693 radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3); 5694 radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[0])); 5695 radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[1])); 5696 radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[2])); 5697} 5698 5699static void 5700radv_compute_generate_pm4(struct radv_pipeline *pipeline) 5701{ 5702 struct radv_device *device = pipeline->device; 5703 struct radeon_cmdbuf *cs = &pipeline->cs; 5704 5705 cs->max_dw = device->physical_device->rad_info.chip_class >= GFX10 ? 
19 : 16; 5706 cs->buf = malloc(cs->max_dw * 4); 5707 5708 radv_pipeline_generate_hw_cs(cs, pipeline); 5709 radv_pipeline_generate_compute_state(cs, pipeline); 5710 5711 assert(pipeline->cs.cdw <= pipeline->cs.max_dw); 5712} 5713 5714static struct radv_pipeline_key 5715radv_generate_compute_pipeline_key(struct radv_pipeline *pipeline, 5716 const VkComputePipelineCreateInfo *pCreateInfo) 5717{ 5718 const VkPipelineShaderStageCreateInfo *stage = &pCreateInfo->stage; 5719 struct radv_pipeline_key key; 5720 memset(&key, 0, sizeof(key)); 5721 5722 if (pCreateInfo->flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT) 5723 key.optimisations_disabled = 1; 5724 5725 const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT *subgroup_size = 5726 vk_find_struct_const(stage->pNext, 5727 PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT); 5728 5729 if (subgroup_size) { 5730 assert(subgroup_size->requiredSubgroupSize == 32 || 5731 subgroup_size->requiredSubgroupSize == 64); 5732 key.cs.compute_subgroup_size = subgroup_size->requiredSubgroupSize; 5733 } else if (stage->flags & VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT) { 5734 key.cs.require_full_subgroups = true; 5735 } 5736 5737 return key; 5738} 5739 5740VkResult 5741radv_compute_pipeline_create(VkDevice _device, VkPipelineCache _cache, 5742 const VkComputePipelineCreateInfo *pCreateInfo, 5743 const VkAllocationCallbacks *pAllocator, const uint8_t *custom_hash, 5744 struct radv_pipeline_shader_stack_size *rt_stack_sizes, 5745 uint32_t rt_group_count, VkPipeline *pPipeline) 5746{ 5747 RADV_FROM_HANDLE(radv_device, device, _device); 5748 RADV_FROM_HANDLE(radv_pipeline_cache, cache, _cache); 5749 RADV_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, pCreateInfo->layout); 5750 const VkPipelineShaderStageCreateInfo *pStages[MESA_SHADER_STAGES] = { 5751 0, 5752 }; 5753 VkPipelineCreationFeedbackEXT *stage_feedbacks[MESA_SHADER_STAGES] = {0}; 5754 struct radv_pipeline *pipeline; 5755 VkResult result; 5756 5757 pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8, 5758 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); 5759 if (pipeline == NULL) { 5760 free(rt_stack_sizes); 5761 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); 5762 } 5763 5764 vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE); 5765 pipeline->type = RADV_PIPELINE_COMPUTE; 5766 5767 pipeline->device = device; 5768 pipeline->graphics.last_vgt_api_stage = MESA_SHADER_NONE; 5769 pipeline->compute.rt_stack_sizes = rt_stack_sizes; 5770 pipeline->compute.group_count = rt_group_count; 5771 5772 const VkPipelineCreationFeedbackCreateInfoEXT *creation_feedback = 5773 vk_find_struct_const(pCreateInfo->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO_EXT); 5774 radv_init_feedback(creation_feedback); 5775 5776 VkPipelineCreationFeedbackEXT *pipeline_feedback = 5777 creation_feedback ? 
creation_feedback->pPipelineCreationFeedback : NULL; 5778 if (creation_feedback) 5779 stage_feedbacks[MESA_SHADER_COMPUTE] = &creation_feedback->pPipelineStageCreationFeedbacks[0]; 5780 5781 pStages[MESA_SHADER_COMPUTE] = &pCreateInfo->stage; 5782 5783 struct radv_pipeline_key key = radv_generate_compute_pipeline_key(pipeline, pCreateInfo); 5784 5785 result = radv_create_shaders(pipeline, pipeline_layout, device, cache, &key, pStages, 5786 pCreateInfo->flags, custom_hash, pipeline_feedback, stage_feedbacks); 5787 if (result != VK_SUCCESS) { 5788 radv_pipeline_destroy(device, pipeline, pAllocator); 5789 return result; 5790 } 5791 5792 pipeline->user_data_0[MESA_SHADER_COMPUTE] = radv_pipeline_stage_to_user_data_0( 5793 pipeline, MESA_SHADER_COMPUTE, device->physical_device->rad_info.chip_class); 5794 pipeline->need_indirect_descriptor_sets |= 5795 radv_shader_need_indirect_descriptor_sets(pipeline, MESA_SHADER_COMPUTE); 5796 radv_pipeline_init_scratch(device, pipeline); 5797 5798 pipeline->push_constant_size = pipeline_layout->push_constant_size; 5799 pipeline->dynamic_offset_count = pipeline_layout->dynamic_offset_count; 5800 5801 radv_compute_generate_pm4(pipeline); 5802 5803 *pPipeline = radv_pipeline_to_handle(pipeline); 5804 5805 return VK_SUCCESS; 5806} 5807 5808VkResult 5809radv_CreateComputePipelines(VkDevice _device, VkPipelineCache pipelineCache, uint32_t count, 5810 const VkComputePipelineCreateInfo *pCreateInfos, 5811 const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines) 5812{ 5813 VkResult result = VK_SUCCESS; 5814 5815 unsigned i = 0; 5816 for (; i < count; i++) { 5817 VkResult r; 5818 r = radv_compute_pipeline_create(_device, pipelineCache, &pCreateInfos[i], pAllocator, NULL, 5819 NULL, 0, &pPipelines[i]); 5820 if (r != VK_SUCCESS) { 5821 result = r; 5822 pPipelines[i] = VK_NULL_HANDLE; 5823 5824 if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT) 5825 break; 5826 } 5827 } 5828 5829 for (; i < count; ++i) 5830 pPipelines[i] = VK_NULL_HANDLE; 5831 5832 return result; 5833} 5834 5835static uint32_t 5836radv_get_executable_count(const struct radv_pipeline *pipeline) 5837{ 5838 uint32_t ret = 0; 5839 for (int i = 0; i < MESA_SHADER_STAGES; ++i) { 5840 if (!pipeline->shaders[i]) 5841 continue; 5842 5843 if (i == MESA_SHADER_GEOMETRY && !radv_pipeline_has_ngg(pipeline)) { 5844 ret += 2u; 5845 } else { 5846 ret += 1u; 5847 } 5848 } 5849 return ret; 5850} 5851 5852static struct radv_shader_variant * 5853radv_get_shader_from_executable_index(const struct radv_pipeline *pipeline, int index, 5854 gl_shader_stage *stage) 5855{ 5856 for (int i = 0; i < MESA_SHADER_STAGES; ++i) { 5857 if (!pipeline->shaders[i]) 5858 continue; 5859 if (!index) { 5860 *stage = i; 5861 return pipeline->shaders[i]; 5862 } 5863 5864 --index; 5865 5866 if (i == MESA_SHADER_GEOMETRY && !radv_pipeline_has_ngg(pipeline)) { 5867 if (!index) { 5868 *stage = i; 5869 return pipeline->gs_copy_shader; 5870 } 5871 --index; 5872 } 5873 } 5874 5875 *stage = -1; 5876 return NULL; 5877} 5878 5879/* Basically strlcpy (which does not exist on linux) specialized for 5880 * descriptions. 
*/ 5881static void 5882desc_copy(char *desc, const char *src) 5883{ 5884 int len = strlen(src); 5885 assert(len < VK_MAX_DESCRIPTION_SIZE); 5886 memcpy(desc, src, len); 5887 memset(desc + len, 0, VK_MAX_DESCRIPTION_SIZE - len); 5888} 5889 5890VkResult 5891radv_GetPipelineExecutablePropertiesKHR(VkDevice _device, const VkPipelineInfoKHR *pPipelineInfo, 5892 uint32_t *pExecutableCount, 5893 VkPipelineExecutablePropertiesKHR *pProperties) 5894{ 5895 RADV_FROM_HANDLE(radv_pipeline, pipeline, pPipelineInfo->pipeline); 5896 const uint32_t total_count = radv_get_executable_count(pipeline); 5897 5898 if (!pProperties) { 5899 *pExecutableCount = total_count; 5900 return VK_SUCCESS; 5901 } 5902 5903 const uint32_t count = MIN2(total_count, *pExecutableCount); 5904 for (unsigned i = 0, executable_idx = 0; i < MESA_SHADER_STAGES && executable_idx < count; ++i) { 5905 if (!pipeline->shaders[i]) 5906 continue; 5907 pProperties[executable_idx].stages = mesa_to_vk_shader_stage(i); 5908 const char *name = NULL; 5909 const char *description = NULL; 5910 switch (i) { 5911 case MESA_SHADER_VERTEX: 5912 name = "Vertex Shader"; 5913 description = "Vulkan Vertex Shader"; 5914 break; 5915 case MESA_SHADER_TESS_CTRL: 5916 if (!pipeline->shaders[MESA_SHADER_VERTEX]) { 5917 pProperties[executable_idx].stages |= VK_SHADER_STAGE_VERTEX_BIT; 5918 name = "Vertex + Tessellation Control Shaders"; 5919 description = "Combined Vulkan Vertex and Tessellation Control Shaders"; 5920 } else { 5921 name = "Tessellation Control Shader"; 5922 description = "Vulkan Tessellation Control Shader"; 5923 } 5924 break; 5925 case MESA_SHADER_TESS_EVAL: 5926 name = "Tessellation Evaluation Shader"; 5927 description = "Vulkan Tessellation Evaluation Shader"; 5928 break; 5929 case MESA_SHADER_GEOMETRY: 5930 if (radv_pipeline_has_tess(pipeline) && !pipeline->shaders[MESA_SHADER_TESS_EVAL]) { 5931 pProperties[executable_idx].stages |= VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT; 5932 name = "Tessellation Evaluation + Geometry Shaders"; 5933 description = "Combined Vulkan Tessellation Evaluation and Geometry Shaders"; 5934 } else if (!radv_pipeline_has_tess(pipeline) && !pipeline->shaders[MESA_SHADER_VERTEX]) { 5935 pProperties[executable_idx].stages |= VK_SHADER_STAGE_VERTEX_BIT; 5936 name = "Vertex + Geometry Shader"; 5937 description = "Combined Vulkan Vertex and Geometry Shaders"; 5938 } else { 5939 name = "Geometry Shader"; 5940 description = "Vulkan Geometry Shader"; 5941 } 5942 break; 5943 case MESA_SHADER_FRAGMENT: 5944 name = "Fragment Shader"; 5945 description = "Vulkan Fragment Shader"; 5946 break; 5947 case MESA_SHADER_COMPUTE: 5948 name = "Compute Shader"; 5949 description = "Vulkan Compute Shader"; 5950 break; 5951 } 5952 5953 pProperties[executable_idx].subgroupSize = pipeline->shaders[i]->info.wave_size; 5954 desc_copy(pProperties[executable_idx].name, name); 5955 desc_copy(pProperties[executable_idx].description, description); 5956 5957 ++executable_idx; 5958 if (i == MESA_SHADER_GEOMETRY && !radv_pipeline_has_ngg(pipeline)) { 5959 assert(pipeline->gs_copy_shader); 5960 if (executable_idx >= count) 5961 break; 5962 5963 pProperties[executable_idx].stages = VK_SHADER_STAGE_GEOMETRY_BIT; 5964 pProperties[executable_idx].subgroupSize = 64; 5965 desc_copy(pProperties[executable_idx].name, "GS Copy Shader"); 5966 desc_copy(pProperties[executable_idx].description, 5967 "Extra shader stage that loads the GS output ringbuffer into the rasterizer"); 5968 5969 ++executable_idx; 5970 } 5971 } 5972 5973 VkResult result = *pExecutableCount 

VkResult
radv_GetPipelineExecutableStatisticsKHR(VkDevice _device,
                                        const VkPipelineExecutableInfoKHR *pExecutableInfo,
                                        uint32_t *pStatisticCount,
                                        VkPipelineExecutableStatisticKHR *pStatistics)
{
   RADV_FROM_HANDLE(radv_device, device, _device);
   RADV_FROM_HANDLE(radv_pipeline, pipeline, pExecutableInfo->pipeline);
   gl_shader_stage stage;
   struct radv_shader_variant *shader =
      radv_get_shader_from_executable_index(pipeline, pExecutableInfo->executableIndex, &stage);

   enum chip_class chip_class = device->physical_device->rad_info.chip_class;
   unsigned lds_increment = chip_class >= GFX7 ? 512 : 256;
   unsigned max_waves = radv_get_max_waves(device, shader, stage);

   VkPipelineExecutableStatisticKHR *s = pStatistics;
   VkPipelineExecutableStatisticKHR *end = s + (pStatistics ? *pStatisticCount : 0);
   VkResult result = VK_SUCCESS;

   if (s < end) {
      desc_copy(s->name, "SGPRs");
      desc_copy(s->description, "Number of SGPR registers allocated per subgroup");
      s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      s->value.u64 = shader->config.num_sgprs;
   }
   ++s;

   if (s < end) {
      desc_copy(s->name, "VGPRs");
      desc_copy(s->description, "Number of VGPR registers allocated per subgroup");
      s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      s->value.u64 = shader->config.num_vgprs;
   }
   ++s;

   if (s < end) {
      desc_copy(s->name, "Spilled SGPRs");
      desc_copy(s->description, "Number of SGPR registers spilled per subgroup");
      s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      s->value.u64 = shader->config.spilled_sgprs;
   }
   ++s;

   if (s < end) {
      desc_copy(s->name, "Spilled VGPRs");
      desc_copy(s->description, "Number of VGPR registers spilled per subgroup");
      s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      s->value.u64 = shader->config.spilled_vgprs;
   }
   ++s;

   if (s < end) {
      desc_copy(s->name, "Code size");
      desc_copy(s->description, "Code size in bytes");
      s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      s->value.u64 = shader->exec_size;
   }
   ++s;

   if (s < end) {
      desc_copy(s->name, "LDS size");
      desc_copy(s->description, "LDS size in bytes per workgroup");
      s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      s->value.u64 = shader->config.lds_size * lds_increment;
   }
   ++s;

   if (s < end) {
      desc_copy(s->name, "Scratch size");
      desc_copy(s->description, "Private memory in bytes per subgroup");
      s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      s->value.u64 = shader->config.scratch_bytes_per_wave;
   }
   ++s;

   if (s < end) {
      desc_copy(s->name, "Subgroups per SIMD");
      desc_copy(s->description, "The maximum number of subgroups in flight on a SIMD unit");
      s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      s->value.u64 = max_waves;
   }
   ++s;

   if (shader->statistics) {
      for (unsigned i = 0; i < aco_num_statistics; i++) {
         const struct aco_compiler_statistic_info *info = &aco_statistic_infos[i];
         if (s < end) {
            desc_copy(s->name, info->name);
            desc_copy(s->description, info->desc);
            s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
            s->value.u64 = shader->statistics[i];
         }
         ++s;
      }
   }

   if (!pStatistics)
      *pStatisticCount = s - pStatistics;
   else if (s > end) {
      *pStatisticCount = end - pStatistics;
      result = VK_INCOMPLETE;
   } else {
      *pStatisticCount = s - pStatistics;
   }

   return result;
}
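
/* Illustrative sketch (not driver code): fetching the statistics for one
 * executable follows the same count-then-fill pattern, assuming the device
 * and pipeline handles from the sketch above and that the pipeline was
 * created with VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR:
 *
 *    VkPipelineExecutableInfoKHR exec = {
 *       .sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_INFO_KHR,
 *       .pipeline = pipeline,
 *       .executableIndex = 0,
 *    };
 *    uint32_t stat_count = 0;
 *    vkGetPipelineExecutableStatisticsKHR(device, &exec, &stat_count, NULL);
 *    VkPipelineExecutableStatisticKHR *stats = calloc(stat_count, sizeof(*stats));
 *    for (uint32_t i = 0; i < stat_count; i++)
 *       stats[i].sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_STATISTIC_KHR;
 *    vkGetPipelineExecutableStatisticsKHR(device, &exec, &stat_count, stats);
 *
 * With this implementation every statistic uses the UINT64 format, so
 * stats[i].value.u64 holds values such as the SGPR/VGPR counts reported above.
 */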

static VkResult
radv_copy_representation(void *data, size_t *data_size, const char *src)
{
   size_t total_size = strlen(src) + 1;

   if (!data) {
      *data_size = total_size;
      return VK_SUCCESS;
   }

   size_t size = MIN2(total_size, *data_size);

   memcpy(data, src, size);
   if (size)
      *((char *)data + size - 1) = 0;
   return size < total_size ? VK_INCOMPLETE : VK_SUCCESS;
}

VkResult
radv_GetPipelineExecutableInternalRepresentationsKHR(
   VkDevice device, const VkPipelineExecutableInfoKHR *pExecutableInfo,
   uint32_t *pInternalRepresentationCount,
   VkPipelineExecutableInternalRepresentationKHR *pInternalRepresentations)
{
   RADV_FROM_HANDLE(radv_pipeline, pipeline, pExecutableInfo->pipeline);
   gl_shader_stage stage;
   struct radv_shader_variant *shader =
      radv_get_shader_from_executable_index(pipeline, pExecutableInfo->executableIndex, &stage);

   VkPipelineExecutableInternalRepresentationKHR *p = pInternalRepresentations;
   VkPipelineExecutableInternalRepresentationKHR *end =
      p + (pInternalRepresentations ? *pInternalRepresentationCount : 0);
   VkResult result = VK_SUCCESS;

   /* optimized NIR */
   if (p < end) {
      p->isText = true;
      desc_copy(p->name, "NIR Shader(s)");
      desc_copy(p->description, "The optimized NIR shader(s)");
      if (radv_copy_representation(p->pData, &p->dataSize, shader->nir_string) != VK_SUCCESS)
         result = VK_INCOMPLETE;
   }
   ++p;

   /* backend IR */
   if (p < end) {
      p->isText = true;
      if (radv_use_llvm_for_stage(pipeline->device, stage)) {
         desc_copy(p->name, "LLVM IR");
         desc_copy(p->description, "The LLVM IR after some optimizations");
      } else {
         desc_copy(p->name, "ACO IR");
         desc_copy(p->description, "The ACO IR after some optimizations");
      }
      if (radv_copy_representation(p->pData, &p->dataSize, shader->ir_string) != VK_SUCCESS)
         result = VK_INCOMPLETE;
   }
   ++p;

   /* Disassembler */
   if (p < end && shader->disasm_string) {
      p->isText = true;
      desc_copy(p->name, "Assembly");
      desc_copy(p->description, "Final Assembly");
      if (radv_copy_representation(p->pData, &p->dataSize, shader->disasm_string) != VK_SUCCESS)
         result = VK_INCOMPLETE;
   }
   ++p;

   if (!pInternalRepresentations)
      *pInternalRepresentationCount = p - pInternalRepresentations;
   else if (p > end) {
      result = VK_INCOMPLETE;
      *pInternalRepresentationCount = end - pInternalRepresentations;
   } else {
      *pInternalRepresentationCount = p - pInternalRepresentations;
   }

   return result;
}
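
/* Illustrative sketch (not driver code): internal representations (NIR,
 * ACO/LLVM IR, final assembly) use a per-entry variant of the same pattern,
 * assuming the pipeline was created with
 * VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR and reusing the
 * VkPipelineExecutableInfoKHR from the previous sketch. A query with
 * pData == NULL makes radv_copy_representation report the required dataSize
 * for each entry; a follow-up query copies the NUL-terminated text:
 *
 *    uint32_t ir_count = 0;
 *    vkGetPipelineExecutableInternalRepresentationsKHR(device, &exec, &ir_count, NULL);
 *    VkPipelineExecutableInternalRepresentationKHR *irs = calloc(ir_count, sizeof(*irs));
 *    for (uint32_t i = 0; i < ir_count; i++)
 *       irs[i].sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_INTERNAL_REPRESENTATION_KHR;
 *    vkGetPipelineExecutableInternalRepresentationsKHR(device, &exec, &ir_count, irs);
 *    for (uint32_t i = 0; i < ir_count; i++)
 *       irs[i].pData = malloc(irs[i].dataSize);
 *    vkGetPipelineExecutableInternalRepresentationsKHR(device, &exec, &ir_count, irs);
 *
 * The second call fills only dataSize (pData is still NULL); the third call
 * copies each representation into the buffers the application allocated.
 */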